[beignet] 01/01: Imported Upstream version 1.0.0

Andreas Beckmann anbe at moszumanska.debian.org
Mon Nov 17 13:42:44 UTC 2014


This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch upstream
in repository beignet.

commit 8dd06bbda7c96b65b398156c74f2b4673c9c5082
Author: Andreas Beckmann <anbe at debian.org>
Date:   Mon Nov 17 06:57:21 2014 +0100

    Imported Upstream version 1.0.0
---
 CMake/CMakeConfigTemplate.hpp                      |     2 +-
 CMake/FindLLVM.cmake                               |     4 +-
 CMakeLists.txt                                     |    61 +-
 COPYING                                            |     6 +-
 backend/CMakeLists.txt                             |    73 +-
 backend/src/CMakeLists.txt                         |   147 +-
 backend/src/GBEConfig.h.in                         |     5 +-
 backend/src/backend/context.cpp                    |   115 +-
 backend/src/backend/context.hpp                    |     3 +-
 backend/src/backend/gen/gen_mesa_disasm.c          |  1161 +-
 backend/src/backend/gen/gen_mesa_disasm.h          |     2 +-
 backend/src/backend/gen75_context.cpp              |     2 +-
 backend/src/backend/gen75_context.hpp              |     2 +-
 backend/src/backend/gen75_encoder.cpp              |    94 +-
 backend/src/backend/gen75_encoder.hpp              |    10 +-
 backend/src/backend/gen7_encoder.cpp               |   244 +
 backend/src/backend/gen7_encoder.hpp               |    51 +
 backend/src/backend/gen7_instruction.hpp           |   525 +
 backend/src/backend/gen8_context.cpp               |    57 +
 .../{gen75_context.hpp => gen8_context.hpp}        |    28 +-
 backend/src/backend/gen8_encoder.cpp               |   485 +
 .../{gen75_encoder.hpp => gen8_encoder.hpp}        |    32 +-
 backend/src/backend/gen8_instruction.hpp           |   529 +
 backend/src/backend/gen_context.cpp                |    85 +-
 backend/src/backend/gen_context.hpp                |    14 +-
 backend/src/backend/gen_defs.hpp                   |   263 +-
 backend/src/backend/gen_encoder.cpp                |   314 +-
 backend/src/backend/gen_encoder.hpp                |    32 +-
 backend/src/backend/gen_insn_compact.cpp           |    22 +-
 backend/src/backend/gen_insn_scheduling.cpp        |    17 +-
 backend/src/backend/gen_insn_scheduling.hpp        |     2 +-
 backend/src/backend/gen_insn_selection.cpp         |   591 +-
 backend/src/backend/gen_insn_selection.hpp         |    10 +-
 backend/src/backend/gen_insn_selection.hxx         |     6 +-
 backend/src/backend/gen_program.cpp                |    30 +-
 backend/src/backend/gen_program.h                  |     2 +-
 backend/src/backend/gen_program.hpp                |     6 +-
 backend/src/backend/gen_reg_allocation.cpp         |    40 +-
 backend/src/backend/gen_reg_allocation.hpp         |     2 +-
 backend/src/backend/program.cpp                    |   365 +-
 backend/src/backend/program.h                      |     8 +-
 backend/src/backend/program.hpp                    |     4 +-
 backend/src/builtin_vector_proto.def               |   295 -
 backend/src/gbe_bin_generater.cpp                  |     6 +-
 backend/src/gbe_bin_interpreter.cpp                |     2 +-
 backend/src/gen_as.sh                              |   101 -
 backend/src/gen_convert.sh                         |   553 -
 backend/src/genconfig.sh                           |    11 -
 backend/src/ir/constant.cpp                        |     2 +-
 backend/src/ir/constant.hpp                        |     2 +-
 backend/src/ir/context.cpp                         |     2 +-
 backend/src/ir/context.hpp                         |     7 +-
 backend/src/ir/function.cpp                        |    38 +-
 backend/src/ir/function.hpp                        |    76 +-
 backend/src/ir/image.cpp                           |     7 +-
 backend/src/ir/image.hpp                           |     6 +-
 backend/src/ir/immediate.cpp                       |    47 +-
 backend/src/ir/immediate.hpp                       |    68 +-
 backend/src/ir/instruction.cpp                     |   165 +-
 backend/src/ir/instruction.hpp                     |    49 +-
 backend/src/ir/instruction.hxx                     |     7 +
 backend/src/ir/liveness.cpp                        |    16 +-
 backend/src/ir/liveness.hpp                        |     3 +-
 backend/src/ir/lowering.cpp                        |     2 +-
 backend/src/ir/lowering.hpp                        |     2 +-
 backend/src/ir/printf.cpp                          |    40 +-
 backend/src/ir/printf.hpp                          |     2 +-
 backend/src/ir/profile.cpp                         |    10 +-
 backend/src/ir/profile.hpp                         |     2 +-
 backend/src/ir/register.cpp                        |     2 +-
 backend/src/ir/register.hpp                        |    19 +-
 backend/src/ir/sampler.cpp                         |     2 +-
 backend/src/ir/sampler.hpp                         |     2 +-
 backend/src/ir/structural_analysis.cpp             |  1083 ++
 backend/src/ir/structural_analysis.hpp             |   346 +
 backend/src/ir/type.cpp                            |     2 +-
 backend/src/ir/type.hpp                            |     2 +-
 backend/src/ir/unit.cpp                            |     2 +-
 backend/src/ir/unit.hpp                            |     2 +-
 backend/src/ir/value.cpp                           |     2 +-
 backend/src/ir/value.hpp                           |     2 +-
 backend/src/libocl/CMakeLists.txt                  |   218 +
 backend/src/libocl/include/ocl.h                   |    41 +
 backend/src/libocl/include/ocl_async.h             |    66 +
 backend/src/libocl/include/ocl_atom.h              |   102 +
 backend/src/libocl/include/ocl_float.h             |    96 +
 backend/src/libocl/include/ocl_geometric.h         |    56 +
 backend/src/libocl/include/ocl_image.h             |   179 +
 backend/src/libocl/include/ocl_misc.h              |   148 +
 .../src/libocl/include/ocl_printf.h                |    28 +-
 backend/src/libocl/include/ocl_sync.h              |    35 +
 backend/src/libocl/include/ocl_types.h             |   118 +
 backend/src/libocl/include/ocl_vload.h             |   160 +
 .../src/libocl/include/ocl_workitem.h              |    26 +-
 .../script/gen_vector.py}                          |    81 +-
 backend/src/libocl/script/ocl_as.sh                |   147 +
 backend/src/libocl/script/ocl_common.def           |    22 +
 backend/src/libocl/script/ocl_convert.sh           |   676 +
 backend/src/libocl/script/ocl_integer.def          |    30 +
 backend/src/libocl/script/ocl_math.def             |   164 +
 backend/src/libocl/script/ocl_relational.def       |    34 +
 backend/src/libocl/src/ocl_async.cl                |    87 +
 backend/src/libocl/src/ocl_atom.cl                 |   137 +
 backend/src/{ => libocl/src}/ocl_barrier.ll        |     0
 backend/src/libocl/src/ocl_geometric.cl            |   112 +
 backend/src/libocl/src/ocl_image.cl                |   429 +
 backend/src/{ => libocl/src}/ocl_memcpy.ll         |   256 +-
 backend/src/{ => libocl/src}/ocl_memset.ll         |    80 +-
 backend/src/libocl/src/ocl_misc.cl                 |   231 +
 .../src/libocl/src/ocl_sync.cl                     |    23 +-
 backend/src/libocl/src/ocl_vload.cl                |   274 +
 backend/src/libocl/src/ocl_workitem.cl             |    57 +
 backend/src/libocl/tmpl/ocl_common.tmpl.cl         |    65 +
 backend/src/libocl/tmpl/ocl_common.tmpl.h          |    36 +
 backend/src/libocl/tmpl/ocl_defines.tmpl.h         |    38 +
 backend/src/libocl/tmpl/ocl_integer.tmpl.cl        |   398 +
 backend/src/libocl/tmpl/ocl_integer.tmpl.h         |   185 +
 backend/src/libocl/tmpl/ocl_math.tmpl.cl           |  3442 ++++
 backend/src/libocl/tmpl/ocl_math.tmpl.h            |   120 +
 backend/src/libocl/tmpl/ocl_relational.tmpl.cl     |   167 +
 backend/src/libocl/tmpl/ocl_relational.tmpl.h      |   119 +
 backend/src/llvm/llvm_barrier_nodup.cpp            |     2 +-
 backend/src/llvm/llvm_bitcode_link.cpp             |   239 +
 backend/src/llvm/llvm_gen_backend.cpp              |   335 +-
 backend/src/llvm/llvm_gen_backend.hpp              |    15 +-
 backend/src/llvm/llvm_gen_ocl_function.hxx         |     6 +-
 backend/src/llvm/llvm_intrinsic_lowering.cpp       |    16 +-
 backend/src/llvm/llvm_legalize.cpp                 |   704 +
 backend/src/llvm/llvm_loadstore_optimization.cpp   |    57 +-
 backend/src/llvm/llvm_passes.cpp                   |     2 +-
 backend/src/llvm/llvm_printf_parser.cpp            |   406 +-
 backend/src/llvm/llvm_scalarize.cpp                |    17 +-
 backend/src/llvm/llvm_to_gen.cpp                   |   122 +-
 backend/src/llvm/llvm_to_gen.hpp                   |     4 +-
 backend/src/llvm/llvm_unroll.cpp                   |   228 +
 backend/src/ocl_as.h                               |  3086 ----
 backend/src/ocl_convert.h                          | 17415 -------------------
 backend/src/ocl_stdlib.tmpl.h                      |  5160 ------
 backend/src/sys/alloc.cpp                          |     2 +-
 backend/src/sys/alloc.hpp                          |     8 +-
 backend/src/sys/assert.cpp                         |     2 +-
 backend/src/sys/assert.hpp                         |     2 +-
 backend/src/sys/atomic.hpp                         |     2 +-
 backend/src/sys/cvar.cpp                           |     2 +-
 backend/src/sys/cvar.hpp                           |     2 +-
 backend/src/sys/exception.hpp                      |     2 +-
 backend/src/sys/fixed_array.hpp                    |     2 +-
 backend/src/sys/hash_map.hpp                       |     2 +-
 backend/src/sys/intrinsics.hpp                     |     2 +-
 backend/src/sys/list.hpp                           |     2 +-
 backend/src/sys/map.hpp                            |     2 +-
 backend/src/sys/mutex.cpp                          |     2 +-
 backend/src/sys/mutex.hpp                          |     2 +-
 backend/src/sys/platform.cpp                       |     2 +-
 backend/src/sys/platform.hpp                       |     2 +-
 backend/src/sys/set.hpp                            |     2 +-
 backend/src/sys/vector.hpp                         |     2 +-
 backend/src/update.sh                              |     3 -
 backend/src/update_as.sh                           |    11 -
 backend/src/update_blob_ocl_header.py              |    65 -
 backend/src/update_convert.sh                      |    12 -
 benchmark/CMakeLists.txt                           |     5 +
 benchmark/benchmark_run.cpp                        |     4 +-
 docs/Beignet.mdwn                                  |    55 +-
 docs/Beignet/Backend/TODO.mdwn                     |    29 +-
 docs/Beignet/Backend/compiler_backend.mdwn         |     8 +-
 docs/Beignet/Backend/mixed_buffer_pointer.mdwn     |    35 +-
 docs/NEWS.mdwn                                     |     5 +-
 include/CL/cl_intel.h                              |     2 +-
 kernels/compiler_assignment_operation_in_if.cl     |    12 +
 kernels/compiler_box_blur_float_ref.bmp            |   Bin 0 -> 49206 bytes
 kernels/compiler_box_blur_image.cl                 |     8 +-
 kernels/compiler_box_blur_ref.bmp                  |   Bin 0 -> 49206 bytes
 kernels/compiler_bswap.cl                          |    12 +
 kernels/compiler_clod_function_call.cl             |    91 +
 kernels/compiler_overflow.cl                       |    45 +
 kernels/compiler_popcount.cl                       |    16 +
 kernels/compiler_time_stamp.cl                     |    28 +
 kernels/include/runtime_compile_link_inc.h         |     2 +-
 kernels/runtime_use_host_ptr_buffer.cl             |     6 +
 kernels/sample.bmp                                 |   Bin 0 -> 49206 bytes
 kernels/set_kernel_arg.cl                          |    20 +
 kernels/test_fill_image_1d_array.cl                |    11 +
 kernels/test_fill_image_2d_array.cl                |    13 +
 kernels/test_printf.cl                             |    28 +-
 kernels/vload_bench.cl                             |    33 +
 src/CMakeLists.txt                                 |    15 +-
 src/cl_alloc.c                                     |     2 +-
 src/cl_alloc.h                                     |     2 +-
 src/cl_api.c                                       |    32 +-
 src/cl_command_queue.c                             |     8 +-
 src/cl_command_queue.h                             |     2 +-
 src/cl_command_queue_gen7.c                        |    24 +-
 src/cl_context.c                                   |     2 +-
 src/cl_context.h                                   |     5 +-
 src/cl_device_data.h                               |    44 +-
 src/cl_device_id.c                                 |   145 +-
 src/cl_device_id.h                                 |     5 +-
 src/cl_driver.cpp                                  |     2 +-
 src/cl_driver.h                                    |    29 +-
 src/cl_driver_defs.c                               |     5 +-
 src/cl_driver_type.h                               |     5 +-
 src/cl_enqueue.c                                   |    66 +-
 src/cl_enqueue.h                                   |     3 +-
 src/cl_event.c                                     |     5 +-
 src/cl_event.h                                     |     2 +-
 src/cl_extensions.h                                |     3 -
 src/cl_gbe_loader.cpp                              |     2 +-
 src/cl_gbe_loader.h                                |     2 +-
 src/cl_gen75_device.h                              |     2 +-
 src/cl_gen7_device.h                               |     2 +-
 src/cl_gl_api.c                                    |     2 +-
 src/cl_gt_device.h                                 |     7 +-
 src/cl_image.c                                     |     2 +-
 src/cl_image.h                                     |     2 +-
 src/cl_internals.h                                 |     2 +-
 src/cl_kernel.c                                    |     2 +-
 src/cl_kernel.h                                    |     2 +-
 src/cl_khr_icd.c                                   |     2 +-
 src/cl_khr_icd.h                                   |     2 +-
 src/cl_mem.c                                       |   113 +-
 src/cl_mem.h                                       |     9 +-
 src/cl_mem_gl.c                                    |     4 +-
 src/cl_platform_id.c                               |     2 +-
 src/cl_platform_id.h                               |    19 +-
 src/cl_program.c                                   |    14 +-
 src/cl_program.h                                   |     2 +-
 src/cl_sampler.c                                   |     2 +-
 src/cl_sampler.h                                   |     2 +-
 src/cl_thread.c                                    |    15 +-
 src/cl_thread.h                                    |     2 +-
 src/cl_utils.h                                     |     2 +-
 src/git_sha1.sh                                    |    20 +
 src/intel/intel_batchbuffer.c                      |    10 +-
 src/intel/intel_batchbuffer.h                      |     2 +-
 src/intel/intel_defines.h                          |    10 +-
 src/intel/intel_driver.c                           |    89 +-
 src/intel/intel_driver.h                           |    30 +-
 src/intel/intel_gpgpu.c                            |   744 +-
 src/intel/intel_gpgpu.h                            |    66 +-
 src/intel/intel_structs.h                          |   310 +-
 src/kernels/cl_internal_copy_buf_rect_align4.cl    |    15 +
 src/x11/dricommon.c                                |     2 +-
 src/x11/dricommon.h                                |     2 +-
 src/x11/mesa_egl_extension.c                       |     1 -
 src/x11/va_dri2.c                                  |     2 +-
 src/x11/va_dri2.h                                  |     2 +-
 src/x11/va_dri2str.h                               |     2 +-
 src/x11/va_dri2tokens.h                            |     2 +-
 utests/CMakeLists.txt                              |    20 +-
 utests/builtin_kernel_max_global_size.cpp          |     5 +-
 utests/builtin_pow.cpp                             |    16 +-
 utests/builtin_tgamma.cpp                          |     9 +-
 utests/compare_image_2d_and_1d_array.cpp           |    21 +-
 utests/compiler_assignment_operation_in_if.cpp     |    45 +
 utests/compiler_box_blur.cpp                       |     2 +-
 utests/compiler_box_blur_float.cpp                 |     2 +-
 utests/compiler_box_blur_image.cpp                 |     4 +-
 utests/compiler_bswap.cpp                          |   109 +
 utests/compiler_fill_image_1d_array.cpp            |    73 +
 utests/compiler_fill_image_2d_array.cpp            |    84 +
 utests/compiler_local_memory_barrier.cpp           |     2 +-
 utests/compiler_local_memory_barrier_wg64.cpp      |     2 +-
 utests/compiler_local_memory_two_ptr.cpp           |     2 +-
 utests/compiler_mandelbrot.cpp                     |     2 +-
 utests/compiler_mandelbrot_alternate.cpp           |     2 +-
 utests/compiler_overflow.cpp                       |   129 +
 utests/compiler_popcount.cpp                       |    75 +
 utests/compiler_shader_toy.cpp                     |    87 -
 utests/compiler_time_stamp.cpp                     |    52 +
 utests/compiler_write_only.cpp                     |     2 +-
 utests/runtime_flat_address_space.cpp              |     2 +-
 utests/runtime_set_kernel_arg.cpp                  |    30 +
 utests/runtime_use_host_ptr_buffer.cpp             |    39 +
 utests/setenv.sh.in                                |     5 +-
 utests/utest.cpp                                   |    20 +-
 utests/utest.hpp                                   |    23 +-
 utests/utest_assert.cpp                            |     2 +-
 utests/utest_assert.hpp                            |     2 +-
 utests/utest_error.c                               |     2 +-
 utests/utest_error.h                               |     2 +-
 utests/utest_exception.hpp                         |     2 +-
 utests/utest_file_map.cpp                          |     2 +-
 utests/utest_file_map.hpp                          |     2 +-
 utests/utest_generator.py                          |     2 +-
 utests/utest_helper.cpp                            |    24 +-
 utests/utest_helper.hpp                            |     6 +-
 utests/utest_math_gen.py                           |    20 +-
 utests/utest_run.cpp                               |    13 +-
 utests/vload_bench.cpp                             |    98 +
 290 files changed, 18901 insertions(+), 29610 deletions(-)

diff --git a/CMake/CMakeConfigTemplate.hpp b/CMake/CMakeConfigTemplate.hpp
index 7702c54..aa90fd1 100644
--- a/CMake/CMakeConfigTemplate.hpp
+++ b/CMake/CMakeConfigTemplate.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/CMake/FindLLVM.cmake b/CMake/FindLLVM.cmake
index 556b3a9..5fb996d 100644
--- a/CMake/FindLLVM.cmake
+++ b/CMake/FindLLVM.cmake
@@ -3,7 +3,7 @@
 # LLVM_INCLUDE_DIR - where to find llvm include files
 # LLVM_LIBRARY_DIR - where to find llvm libs
 # LLVM_CFLAGS      - llvm compiler flags
-# LLVM_LFLAGS      - llvm linker flags
+# LLVM_LDFLAGS     - llvm linker flags
 # LLVM_MODULE_LIBS - list of llvm libs for working with modules.
 # LLVM_FOUND       - True if llvm found.
 if (LLVM_INSTALL_DIR)
@@ -62,7 +62,7 @@ execute_process(
 
 execute_process(
   COMMAND ${LLVM_CONFIG_EXECUTABLE} --ldflags
-  OUTPUT_VARIABLE LLVM_LFLAGS
+  OUTPUT_VARIABLE LLVM_LDFLAGS
   OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index ac59859..3c68187 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,9 +1,23 @@
+# Compiler selection: GCC, ICC and CLANG are currently supported
+set (COMPILER "GCC" CACHE INT "Compiler to choose on Linux (GCC,ICC,CLANG)")
+if (COMPILER STREQUAL "GCC")
+elseif (COMPILER STREQUAL "CLANG")
+  set (CMAKE_C_COMPILER   "clang")
+  set (CMAKE_CXX_COMPILER "clang++")
+  find_program(CMAKE_AR NAMES llvm-ar)
+  find_program(CMAKE_LINKER NAMES llvm-ld)
+elseif (COMPILER STREQUAL "ICC")
+  find_program(CMAKE_C_COMPILER NAMES icc)
+  find_program(CMAKE_CXX_COMPILER NAMES icpc)
+  find_program(CMAKE_AR NAMES xiar)
+  find_program(CMAKE_LINKER NAMES xild)
+endif ()
 
 CMAKE_MINIMUM_REQUIRED(VERSION 2.6.0)
 PROJECT(OCL)
-set (LIBCL_DRIVER_VERSION_MAJOR 0)
-set (LIBCL_DRIVER_VERSION_MINOR 9)
-set (LIBCL_DRIVER_VERSION_PATCH 3)
+set (LIBCL_DRIVER_VERSION_MAJOR 1)
+set (LIBCL_DRIVER_VERSION_MINOR 0)
+set (LIBCL_DRIVER_VERSION_PATCH 0)
 set (LIBCL_C_VERSION_MAJOR 1)
 set (LIBCL_C_VERSION_MINOR 2)
 
@@ -36,9 +50,6 @@ endif (NOT CMAKE_BUILD_TYPE)
 set (CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "assure config" FORCE)
 message(STATUS "Building mode: " ${CMAKE_BUILD_TYPE})
 
-SET(CMAKE_CXX_FLAGS_DEBUGO0 "-O0 -g")
-SET(CMAKE_C_FLAGS_DEBUGO0 "-O0 -g")
-
 IF (EMULATE_HSW)
   SET (USE_FULSIM "true")
   ADD_DEFINITIONS(-DEMULATE_GEN=75)
@@ -62,13 +73,31 @@ ELSE (USE_FULSIM)
   ADD_DEFINITIONS(-DUSE_FULSIM=0)
 ENDIF (USE_FULSIM)
 
-SET(CMAKE_CXX_FLAGS "-Wall -Wno-invalid-offsetof -mfpmath=sse -fno-rtti -Wcast-align -std=c++0x -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_CXX_FLAGS}")
-SET(CMAKE_C_FLAGS "-Wall -mfpmath=sse -msse2 -Wcast-align -msse2 -msse3 -mssse3 -msse4.1 ${CMAKE_C_FLAGS}")
+# compiler flag setting
+if (COMPILER STREQUAL "GCC")
+  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall -mfpmath=sse -Wcast-align -Wl,-E")
+elseif (COMPILER STREQUAL "CLANG")
+  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
+elseif (COMPILER STREQUAL "ICC")
+  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS}  -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -msse4.1 -Wl,-E")
+endif ()
+set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -std=c++0x -Wno-invalid-offsetof -fno-rtti")
+set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
+set (CMAKE_CXX_FLAGS_DEBUG          "-O0 -g -DGBE_DEBUG=1")
+set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+set (CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
+set (CMAKE_CXX_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
+set (CMAKE_C_FLAGS_DEBUG          "-O0 -g -DGBE_DEBUG=1")
+set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
+set (CMAKE_C_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
+set (CMAKE_C_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
 
 # Front end stuff we need
 #INCLUDE(CMake/FindLLVM.cmake)
 Find_Package(LLVM 3.3)
 
+set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined ${LLVM_LDFLAGS}")
+
 # XLib
 Find_Package(X11)
 IF(X11_FOUND)
@@ -80,19 +109,27 @@ ENDIF(X11_FOUND)
 # DRM
 pkg_check_modules(DRM REQUIRED libdrm)
 IF(DRM_FOUND)
-  MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX}")
+  MESSAGE(STATUS "Looking for DRM - found at ${DRM_PREFIX} ${DRM_VERSION}")
   INCLUDE_DIRECTORIES(${DRM_INCLUDE_DIRS})
 ELSE(DRM_FOUND)
   MESSAGE(STATUS "Looking for DRM - not found")
 ENDIF(DRM_FOUND)
 
 # DRM Intel
-pkg_check_modules(DRM_INTEL REQUIRED libdrm_intel)
+pkg_check_modules(DRM_INTEL libdrm_intel>=2.4.52)
 IF(DRM_INTEL_FOUND)
   INCLUDE_DIRECTORIES(${DRM_INTEL_INCLUDE_DIRS})
-  MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX}")
+  MESSAGE(STATUS "Looking for DRM Intel - found at ${DRM_INTEL_PREFIX} ${DRM_INTEL_VERSION}")
+  #userptr support starts from 2.4.57, but 2.4.58 is the actual stable release
+  #FIXME userptr fails randomly in some cases; needs further investigation.
+  #IF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+  #  MESSAGE(STATUS "Enable userptr support")
+  #  SET(DRM_INTEL_USERPTR "enable")
+  #ELSE(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
+  #  MESSAGE(STATUS "Disable userptr support")
+  #ENDIF(DRM_INTEL_VERSION VERSION_GREATER 2.4.57)
 ELSE(DRM_INTEL_FOUND)
-  MESSAGE(STATUS "Looking for DRM Intel - not found")
+  MESSAGE(FATAL_ERROR "Looking for DRM Intel (>= 2.4.52) - not found")
 ENDIF(DRM_INTEL_FOUND)
 
 # Threads
diff --git a/COPYING b/COPYING
index 4362b49..d27911b 100644
--- a/COPYING
+++ b/COPYING
@@ -7,7 +7,7 @@
  of this license document, but changing it is not allowed.
 
 [This is the first released version of the Lesser GPL.  It also counts
- as the successor of the GNU Library Public License, version 2, hence
+ as the successor of the GNU Library Public License, version 2.1 hence
  the version number 2.1.]
 
                             Preamble
@@ -211,8 +211,8 @@ the scope of this License.
   3. You may opt to apply the terms of the ordinary GNU General Public
 License instead of this License to a given copy of the Library.  To do
 this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License.  (If a newer version than version 2 of the
+that they refer to the ordinary GNU General Public License, version 2.1
+instead of to this License.  (If a newer version than version 2.1 of the
 ordinary GNU General Public License has appeared, then you can specify
 that version instead if you wish.)  Do not make any other change in
 these notices.
diff --git a/backend/CMakeLists.txt b/backend/CMakeLists.txt
index 6a31c68..bf96baf 100644
--- a/backend/CMakeLists.txt
+++ b/backend/CMakeLists.txt
@@ -13,12 +13,6 @@ set (CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${GBE_CMAKE_DIR}")
 set (GBE_DEBUG_MEMORY false CACHE bool "Activate the memory debugger")
 set (GBE_USE_BLOB false CACHE bool "Compile everything from one big file")
 
-##############################################################
-# Compiler
-##############################################################
-if (UNIX)
-  set (COMPILER "GCC" CACHE INT "Compiler to choose on Linux (GCC,ICC,CLANG)")
-endif (UNIX)
 
 # Force Release with debug info
 if (NOT CMAKE_BUILD_TYPE)
@@ -34,75 +28,22 @@ else (GBE_DEBUG_MEMORY)
 endif (GBE_DEBUG_MEMORY)
 
 # Hide all symbols and allows the symbols declared as visible to be exported
-set (CMAKE_C_CXX_FLAGS "-fvisibility=hidden -DGBE_COMPILER_AVAILABLE=1 ${CMAKE_C_CXX_FLAGS}")
-
-if (COMPILER STREQUAL "GCC")
-  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS} -funroll-loops -Wstrict-aliasing=2 -fstrict-aliasing -msse2 -msse3 -mssse3 -msse4.1 -fPIC -Wall")
-  set (CMAKE_C_CXX_FLAGS "${CMAKE_C_CXX_FLAGS}  ${LLVM_CFLAGS}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_C_CXX_FLAGS}  -Wno-invalid-offsetof -fno-rtti -std=c++0x")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-E")
-  set (CMAKE_SHARED_LINKER_FLAGS "-Wl,--no-undefined ${LLVM_LFLAGS}")
-  set (CMAKE_CXX_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
-  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
-  set (CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_CXX_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_C_FLAGS "${CMAKE_C_CXX_FLAGS}")
-  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
-  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
-  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wl,-E")
-  set (CMAKE_C_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
-  set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
-  set (CMAKE_C_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_C_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
-elseif (COMPILER STREQUAL "CLANG")
-  set (CMAKE_C_COMPILER             "clang")
-  set (CMAKE_C_FLAGS                "-Wall -std=c99")
-  set (CMAKE_C_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
-  set (CMAKE_C_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
-  set (CMAKE_C_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_C_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_CXX_COMPILER             "clang++")
-  set (CMAKE_CXX_FLAGS "-fstrict-aliasing -msse2 -fPIC -Wall -Wno-format-security -Wno-invalid-offsetof -std=c++0x")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG}")
-  set (CMAKE_CXX_FLAGS_DEBUG          "-g -DGBE_DEBUG=1")
-  set (CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DGBE_DEBUG=1")
-  set (CMAKE_CXX_FLAGS_MINSIZEREL     "-Os -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_CXX_FLAGS_RELEASE        "-O2 -DNDEBUG -DGBE_DEBUG=0")
-  set (CMAKE_AR      "/usr/bin/llvm-ar")
-  set (CMAKE_LINKER  "/usr/bin/llvm-ld")
-  set (CMAKE_NM      "/usr/bin/llvm-nm")
-  set (CMAKE_OBJDUMP "/usr/bin/llvm-objdump")
-  set (CMAKE_RANLIB  "ranlib")
-elseif (COMPILER STREQUAL "ICC")
-  set (CMAKE_CXX_COMPILER "icpc")
-  set (CMAKE_C_COMPILER "icc")
-  set (CMAKE_CXX_FLAGS "-std=c++0x -wd2928 -Wall -fPIC -fstrict-aliasing -fp-model fast -xSSE2")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MEMORY_FLAG}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_COMPILE_UTESTS_FLAG}")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${VISIBILITY_FLAG} -Wl,-E")
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GBE_DEBUG_MODE_FLAG}")
-  set (CMAKE_CXX_FLAGS_DEBUG "-g -O0 -DGBE_DEBUG=1")
-  set (CCMAKE_CXX_FLAGS_RELWITHDEBINFO "-g -O2 -DGBE_DEBUG=1")
-  set (CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG -O2 -DGBE_DEBUG=0")
-  set (CCMAKE_CXX_FLAGS_MINSIZEREL "-Os -DGBE_DEBUG=0")
-  set (CMAKE_EXE_LINKER_FLAGS "")
-endif ()
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CFLAGS} ${GBE_DEBUG_MEMORY_FLAG} ${GBE_COMPILE_UTESTS_FLAG} -DGBE_COMPILER_AVAILABLE=1 -fvisibility=hidden")
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${LLVM_CFLAGS} ${GBE_DEBUG_MEMORY_FLAG} ${GBE_COMPILE_UTESTS_FLAG} -DGBE_COMPILER_AVAILABLE=1")
 
 include_directories (${CMAKE_CURRENT_BINARY_DIR})
 ##############################################################
 # Project source code
 ##############################################################
 add_subdirectory (src)
-set(LOCAL_PCH_OBJECT_DIR ${LOCAL_PCH_OBJECT_DIR} PARENT_SCOPE)
-set(LOCAL_PCM_OBJECT_DIR ${LOCAL_PCM_OBJECT_DIR} PARENT_SCOPE)
+set(LOCAL_OCL_BITCODE_BIN "${LOCAL_OCL_BITCODE_BIN}" PARENT_SCOPE)
+set(LOCAL_OCL_HEADER_DIR "${LOCAL_OCL_HEADER_DIR}" PARENT_SCOPE)
+set(LOCAL_OCL_PCH_OBJECT "${LOCAL_OCL_PCH_OBJECT}" PARENT_SCOPE)
+
 set(LOCAL_GBE_OBJECT_DIR ${LOCAL_GBE_OBJECT_DIR} PARENT_SCOPE)
 set(LOCAL_INTERP_OBJECT_DIR ${LOCAL_INTERP_OBJECT_DIR} PARENT_SCOPE)
 
 set (GBE_BIN_GENERATER
-     OCL_PCM_PATH=${LOCAL_PCM_OBJECT_DIR} OCL_PCH_PATH=${LOCAL_PCH_OBJECT_DIR} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
+     env OCL_BITCODE_LIB_PATH=${LOCAL_OCL_BITCODE_BIN} OCL_HEADER_FILE_DIR=${LOCAL_OCL_HEADER_DIR} OCL_PCH_PATH=${LOCAL_OCL_PCH_OBJECT} LD_LIBRARY_PATH=${CMAKE_CURRENT_BINARY_DIR}/src ${CMAKE_CURRENT_BINARY_DIR}/src/gbe_bin_generater
      PARENT_SCOPE)
 
diff --git a/backend/src/CMakeLists.txt b/backend/src/CMakeLists.txt
index a3818ab..b4555f1 100644
--- a/backend/src/CMakeLists.txt
+++ b/backend/src/CMakeLists.txt
@@ -1,95 +1,25 @@
-set (ocl_vector_spec_file ${GBE_SOURCE_DIR}/src/builtin_vector_proto.def)
-set (ocl_vector_file ${GBE_SOURCE_DIR}/src/ocl_vector.h)
-set (ocl_as_file ${GBE_SOURCE_DIR}/src/ocl_as.h)
-set (ocl_convert_file ${GBE_SOURCE_DIR}/src/ocl_convert.h)
-set (ocl_stdlib_tmpl_file ${GBE_SOURCE_DIR}/src/ocl_stdlib.tmpl.h)
-set (ocl_common_header_file ${GBE_SOURCE_DIR}/src/ocl_common_defines.h)
-set (ocl_blob_file ${CMAKE_CURRENT_BINARY_DIR}${BEIGNET_INSTALL_DIR}ocl_stdlib.h)
-set (ocl_blob_cpp_file ${GBE_SOURCE_DIR}/src/ocl_stdlib_str.cpp)
-set (ocl_gen_blob_cmd ${GBE_SOURCE_DIR}/src/update_blob_ocl_header.py)
-set (ocl_gen_vector_cmd ${GBE_SOURCE_DIR}/src/gen_builtin_vector.py)
-
-set (string_header "\\\"string\\\"")
-add_custom_command(
-    OUTPUT ${ocl_blob_cpp_file}
-    COMMAND rm -rf ${ocl_blob_cpp_file}
-    COMMAND echo "\\\#include ${string_header}" >> ${ocl_blob_cpp_file}
-    COMMAND echo "namespace gbe {" >> ${ocl_blob_cpp_file}
-    COMMAND echo "std::string ocl_stdlib_str = " >> ${ocl_blob_cpp_file}
-    # Yeah!!! welcome to back slash hell
-    COMMAND cat ${ocl_blob_file} |sed 's/\\\\/\\\\\\\\/g' | sed 's/\\\"/\\\\\\\"/g' | awk '{ printf \(\"\\"%s\\\\n\\"\\n\", $$0\) }' >> ${ocl_blob_cpp_file}
-    COMMAND echo "\;" >> ${ocl_blob_cpp_file}
-    COMMAND echo "}" >> ${ocl_blob_cpp_file}
-    COMMAND echo "" >> ${ocl_blob_cpp_file}
-    DEPENDS ${ocl_blob_file})
-
-set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "ocl_vector.h;${ocl_blob_file}")
-
-add_custom_command(
-  OUTPUT ${ocl_vector_file}
-  COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_vector_cmd} ${ocl_vector_spec_file} ${ocl_vector_file}
-  DEPENDS ${ocl_gen_vector_cmd} ${ocl_vector_spec_file}
-  )
-
-add_custom_command(
-  OUTPUT ${ocl_blob_file}
-  COMMAND mkdir -p ${CMAKE_CURRENT_BINARY_DIR}/${BEIGNET_INSTALL_DIR}
-  COMMAND ${PYTHON_EXECUTABLE} ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_blob_file}
-  DEPENDS ${ocl_gen_blob_cmd} ${ocl_stdlib_tmpl_file} ${ocl_common_header_file} ${ocl_vector_file} ${ocl_as_file} ${ocl_convert_file}
-  )
-
-set (pch_object ${ocl_blob_file}.pch)
-set (local_pch_object ${ocl_blob_file}.local.pch)
-# generate pch object
-if (LLVM_VERSION_NODOT VERSION_GREATER 32)
-    set (clang_cmd -cc1 -x cl -triple spir -ffp-contract=off -cl-kernel-arg-info)
-else (LLVM_VERSION_NODOT VERSION_GREATER 32)
-    if (LLVM_VERSION_NODOT VERSION_GREATER 31)
-        set (clang_cmd -cc1 -x cl -triple nvptx -ffp-contract=off)
-    else (LLVM_VERSION_NODOT VERSION_GREATER 31)
-        set (clang_cmd -cc1 -x cl -triple ptx32)
-    endif (LLVM_VERSION_NODOT VERSION_GREATER 31)
-endif (LLVM_VERSION_NODOT VERSION_GREATER 32)
-set (clang_cmd ${clang_cmd} -cl-std=CL1.2 -fno-builtin -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND)
+set (OCL_BITCODE_BIN "${BEIGNET_INSTALL_DIR}beignet.bc")
+set (OCL_HEADER_DIR "${BEIGNET_INSTALL_DIR}/include")
+set (OCL_PCH_OBJECT "${BEIGNET_INSTALL_DIR}beignet.pch")
+set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
+set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
 
-add_custom_command(
-     OUTPUT ${pch_object}
-     COMMAND rm -f ${pch_object}
-     COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} --relocatable-pch -emit-pch -isysroot ${CMAKE_CURRENT_BINARY_DIR} ${ocl_blob_file} -o ${pch_object}
-     COMMAND ${LLVM_INSTALL_DIR}clang ${clang_cmd} -emit-pch ${ocl_blob_file} -o ${local_pch_object}
-     DEPENDS ${ocl_blob_file}
-     )
+configure_file (
+    "GBEConfig.h.in"
+    "GBEConfig.h"
+)
 
-add_custom_target(pch_object
-                  DEPENDS ${pch_object})
+add_subdirectory(libocl)
+set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
+set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PARENT_SCOPE)
+set (LOCAL_OCL_BITCODE_BIN "${OCL_OBJECT_DIR}/beignet.bc" PARENT_SCOPE)
+set (LOCAL_OCL_HEADER_DIR "${OCL_OBJECT_DIR}/include/" PARENT_SCOPE)
+set (LOCAL_OCL_PCH_OBJECT "${OCL_OBJECT_DIR}/beignet.local.pch" PARENT_SCOPE)
 
-macro(ll_add_library ll_lib ll_sources)
-  foreach (ll ${${ll_sources}})
-  add_custom_command(
-       OUTPUT  ${ll}.bc
-       COMMAND rm -f ${ll}.bc
-       COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${ll}.bc ${GBE_SOURCE_DIR}/src/${ll}
-       DEPENDS ${ll}
-       )
-  set (ll_objects ${ll_objects} ${ll}.bc)
-  endforeach (ll ${ll_sources})
-  add_custom_command(
-       OUTPUT ${ll_lib}
-       COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${ll_lib} ${ll_objects}
-       DEPENDS ${ll_objects}
-       )
-  add_custom_target(${ll_lib}
-                    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${ll_lib})
-endmacro(ll_add_library)
+add_dependencies(beignet_bitcode libocl)
 
-if (GBE_USE_BLOB)
-  set (GBE_SRC
-       blob.cpp
-       backend/gen/gen_mesa_disasm.c)
-else (GBE_USE_BLOB)
-  set (GBE_SRC
+set (GBE_SRC
     ${ocl_blob_file}
-    ocl_stdlib_str.cpp  # this file is auto-generated.
     sys/vector.hpp
     sys/hash_map.hpp
     sys/map.hpp
@@ -134,6 +64,8 @@ else (GBE_USE_BLOB)
     ir/lowering.hpp
     ir/printf.cpp
     ir/printf.hpp
+    ir/structural_analysis.cpp
+    ir/structural_analysis.hpp
     ir/immediate.hpp
     ir/immediate.cpp
     backend/context.cpp
@@ -141,9 +73,11 @@ else (GBE_USE_BLOB)
     backend/program.cpp
     backend/program.hpp
     backend/program.h
+    llvm/llvm_bitcode_link.cpp
     llvm/llvm_gen_backend.cpp
     llvm/llvm_passes.cpp
     llvm/llvm_scalarize.cpp
+    llvm/llvm_legalize.cpp
     llvm/llvm_intrinsic_lowering.cpp
     llvm/llvm_barrier_nodup.cpp
     llvm/llvm_printf_parser.cpp
@@ -151,6 +85,7 @@ else (GBE_USE_BLOB)
     llvm/llvm_loadstore_optimization.cpp
     llvm/llvm_gen_backend.hpp
     llvm/llvm_gen_ocl_function.hxx
+    llvm/llvm_unroll.cpp
     llvm/llvm_to_gen.hpp
     backend/gen/gen_mesa_disasm.c
     backend/gen_insn_selection.cpp
@@ -163,30 +98,32 @@ else (GBE_USE_BLOB)
     backend/gen_context.cpp
     backend/gen75_context.hpp
     backend/gen75_context.cpp
+    backend/gen8_context.hpp
+    backend/gen8_context.cpp
     backend/gen_program.cpp
     backend/gen_program.hpp
     backend/gen_program.h
+    backend/gen7_instruction.hpp
+    backend/gen8_instruction.hpp
     backend/gen_defs.hpp
     backend/gen_insn_compact.cpp
     backend/gen_encoder.hpp
     backend/gen_encoder.cpp
+    backend/gen7_encoder.hpp
+    backend/gen7_encoder.cpp
     backend/gen75_encoder.hpp
     backend/gen75_encoder.cpp
+    backend/gen8_encoder.hpp
+    backend/gen8_encoder.cpp
     )
 
-endif (GBE_USE_BLOB)
 
 include_directories (.)
 link_directories (${LLVM_LIBRARY_DIRS} ${DRM_LIBDIR})
 include_directories(${LLVM_INCLUDE_DIRS})
 add_library (gbe SHARED ${GBE_SRC})
 
-# for pre compiled module library.
-set (pcm_lib "beignet.bc")
-set (pcm_sources ocl_barrier.ll ocl_memset.ll ocl_memcpy.ll)
-ll_add_library (${pcm_lib} pcm_sources)
 
-ADD_DEPENDENCIES (gbe pch_object ${pcm_lib})
 target_link_libraries(
                       gbe
                       ${DRM_INTEL_LIBRARIES}
@@ -199,6 +136,8 @@ target_link_libraries(
 
 add_library(gbeinterp SHARED gbe_bin_interpreter.cpp)
 
+add_dependencies(gbe beignet_bitcode)
+
 if (LLVM_VERSION_NODOT VERSION_EQUAL 34)
   find_library(TERMINFO NAMES tinfo ncurses)
   if (${TERMINFO} STREQUAL TERMINFO-NOTFOUND)
@@ -215,22 +154,6 @@ TARGET_LINK_LIBRARIES(gbe_bin_generater gbe)
 
 install (TARGETS gbe LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
 install (TARGETS gbeinterp LIBRARY DESTINATION ${BEIGNET_INSTALL_DIR})
-#install (FILES backend/program.h DESTINATION include/gen)
-install (FILES ${ocl_blob_file} DESTINATION ${BEIGNET_INSTALL_DIR})
-install (FILES ${pch_object} DESTINATION ${BEIGNET_INSTALL_DIR})
-install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib} DESTINATION ${BEIGNET_INSTALL_DIR})
-# When build beignet itself, we need to export the local precompiled header file and precompiled module
-# file to libcl and utests.
-set (LOCAL_PCH_OBJECT_DIR "${local_pch_object}:${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch" PARENT_SCOPE)
-set (LOCAL_PCM_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/${pcm_lib}:${BEIGNET_INSTALL_DIR}/${pcm_lib}" PARENT_SCOPE)
-set (LOCAL_GBE_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbe.so" PARENT_SCOPE)
-set (LOCAL_INTERP_OBJECT_DIR "${CMAKE_CURRENT_BINARY_DIR}/libgbeinterp.so" PARENT_SCOPE)
-
-set (PCH_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/ocl_stdlib.h.pch")
-set (PCM_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/${pcm_lib}")
-set (GBE_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbe.so")
-set (INTERP_OBJECT_DIR "${BEIGNET_INSTALL_DIR}/libgbeinterp.so")
-configure_file (
-  "GBEConfig.h.in"
-  "GBEConfig.h"
-)
+install (FILES ${OCL_OBJECT_DIR}/beignet.bc DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${OCL_OBJECT_DIR}/beignet.pch DESTINATION ${BEIGNET_INSTALL_DIR})
+install (FILES ${OCL_HEADER_FILES} DESTINATION ${BEIGNET_INSTALL_DIR}/include)
diff --git a/backend/src/GBEConfig.h.in b/backend/src/GBEConfig.h.in
index f5c69c6..b5bec14 100644
--- a/backend/src/GBEConfig.h.in
+++ b/backend/src/GBEConfig.h.in
@@ -1,7 +1,8 @@
 // the configured options and settings for LIBGBE
 #define LIBGBE_VERSION_MAJOR @LIBGBE_VERSION_MAJOR@
 #define LIBGBE_VERSION_MINOR @LIBGBE_VERSION_MINOR@
-#define PCH_OBJECT_DIR "@PCH_OBJECT_DIR@"
-#define PCM_OBJECT_DIR "@PCM_OBJECT_DIR@"
 #define GBE_OBJECT_DIR "@GBE_OBJECT_DIR@"
 #define INTERP_OBJECT_DIR "@INTERP_OBJECT_DIR@"
+#define OCL_BITCODE_BIN "@OCL_BITCODE_BIN@"
+#define OCL_HEADER_DIR "@OCL_HEADER_DIR@"
+#define OCL_PCH_OBJECT "@OCL_PCH_OBJECT@"
diff --git a/backend/src/backend/context.cpp b/backend/src/backend/context.cpp
index e09a309..5e33ddd 100644
--- a/backend/src/backend/context.cpp
+++ b/backend/src/backend/context.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -492,42 +492,110 @@ namespace gbe
     });
   }
 
+  /* Because of the structural analysis, the control flow of blocks inside a structure
+   * is handled by if, else and endif, so these blocks do not need JIPs. We therefore
+   * treat all the blocks that belong to the same structure as a single unit.
+   */
   void Context::buildJIPs(void) {
     using namespace ir;
-
     // Linearly store the branch target for each block and its own label
     const LabelIndex noTarget(fn.labelNum());
     vector<std::pair<LabelIndex, LabelIndex>> braTargets;
-    int32_t curr = 0, blockNum = fn.blockNum();
-    braTargets.resize(blockNum);
-
+    int32_t curr = 0;
     // If some blocks are unused we mark them as such by setting their own label
     // as "invalid" (== noTarget)
+    int blockCount = 0;
+    // Because some blocks may belong to the same structure, the number of
+    // blocks we are dealing with may be less than the number of basic blocks.
+    // Here we calculate the actual number of blocks we will handle.
+    fn.foreachBlock([&](const BasicBlock &bb)
+    {
+      if(bb.belongToStructure && bb.isStructureExit)
+        blockCount++;
+      else if(!bb.belongToStructure)
+        blockCount++;
+    });
+    braTargets.resize(blockCount);
+
+    LabelIndex structureExitLabel;
+    LabelIndex structureEntryLabel;
+    bool flag;
+    set<uint32_t> pos;
+    map<uint32_t, LabelIndex> exitMap;
+    map<uint32_t, LabelIndex> entryMap;
     for (auto &bb : braTargets) bb = std::make_pair(noTarget, noTarget);
     fn.foreachBlock([&](const BasicBlock &bb) {
-      const LabelIndex ownLabel = bb.getLabelIndex();
-      const Instruction *last = bb.getLastInstruction();
-      if (last->getOpcode() != OP_BRA)
-        braTargets[curr++] = std::make_pair(ownLabel, noTarget);
-      else {
-        const BranchInstruction *bra = cast<BranchInstruction>(last);
-        braTargets[curr++] = std::make_pair(ownLabel, bra->getLabelIndex());
+      LabelIndex ownLabel;
+      Instruction *last;
+      flag = false;
+      // bb belongs to a structure but is not the structure's exit: simply insert
+      // the target of its bra into JIPs.
+      if(bb.belongToStructure && !bb.isStructureExit)
+      {
+        last = bb.getLastInstruction();
+        if(last->getOpcode() == OP_BRA)
+        {
+          BranchInstruction *bra = cast<BranchInstruction>(last);
+          JIPs.insert(std::make_pair(bra, bra->getLabelIndex()));
+        }
+        return;
       }
-    });
+      else
+      {
+        // bb belongs to a structure and is the structure's exit: we treat this bb
+        // as the structure it belongs to, using the label of the structure's entry as the
+        // structure's label and the last instruction of the structure's exit as the
+        // structure's last instruction.
+        if(bb.belongToStructure && bb.isStructureExit)
+        {
+          ownLabel = (bb.matchingStructureEntry)->getLabelIndex();
+          last = bb.getLastInstruction();
+          structureExitLabel = bb.getLabelIndex();
+          structureEntryLabel = ownLabel;
+          flag = true;
+        }
+        // bb belongs to no structure.
+        else
+        {
+          ownLabel = bb.getLabelIndex();
+          last = bb.getLastInstruction();
+        }
 
+        if (last->getOpcode() != OP_BRA)
+        {
+          braTargets[curr++] = std::make_pair(ownLabel, noTarget);
+          if(flag)
+          {
+            pos.insert(curr-1);
+            exitMap[curr-1] = structureExitLabel;
+            entryMap[curr-1] = structureEntryLabel;
+          }
+        }
+        else {
+          const BranchInstruction *bra = cast<BranchInstruction>(last);
+          braTargets[curr++] = std::make_pair(ownLabel, bra->getLabelIndex());
+          if(flag)
+          {
+            exitMap[curr-1] = structureExitLabel;
+            entryMap[curr-1] = structureEntryLabel;
+            pos.insert(curr-1);
+          }
+        }
+      }
+    });
     // Backward jumps are special. We must insert the label of the next block
     // when we hit the "DO" i.e. the target label of the backward branch (as in
     // do { } while) . So, we store the bwd jumps per targets
     // XXX does not use custom allocator
     std::multimap<LabelIndex, LabelIndex> bwdTargets;
-    for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
+    for (int32_t blockID = 0; blockID <curr; ++blockID) {
       const LabelIndex ownLabel = braTargets[blockID].first;
       const LabelIndex target = braTargets[blockID].second;
       if (ownLabel == noTarget) continue; // unused block
       if (target == noTarget) continue; // no branch
       if (target <= ownLabel) { // This is a backward jump
         // Last block is just "RET". So, it cannot be the last block
-        GBE_ASSERT(blockID < blockNum - 1);
+        GBE_ASSERT(blockID < curr - 1);
         const LabelIndex fallThrough = braTargets[blockID+1].first;
         bwdTargets.insert(std::make_pair(target, fallThrough));
       }
@@ -535,15 +603,21 @@ namespace gbe
 
     // Stores the current forward targets
     set<LabelIndex> fwdTargets;
-
     // Now retraverse the blocks and figure out all JIPs
-    for (int32_t blockID = 0; blockID < blockNum; ++blockID) {
+    for (int32_t blockID = 0; blockID <curr; ++blockID) {
+
       const LabelIndex ownLabel = braTargets[blockID].first;
       const LabelIndex target = braTargets[blockID].second;
-      const BasicBlock &bb = fn.getBlock(ownLabel);
-      const Instruction *label = bb.getFirstInstruction();
+      LabelIndex tmp;
+      if(pos.find(blockID)!=pos.end())
+        tmp = exitMap[blockID];
+      else
+        tmp = ownLabel;
+      BasicBlock &bb = fn.getBlock(tmp);
+      Instruction *label = bb.getFirstInstruction();
+      if(pos.find(blockID)!=pos.end())
+        label = fn.getBlock(entryMap[blockID]).getFirstInstruction();
       const Instruction *bra = bb.getLastInstruction();
-
       // Expires the branches that point to us (if any)
       auto it = fwdTargets.find(ownLabel);
       if (it != fwdTargets.end()) fwdTargets.erase(it);
@@ -573,6 +647,7 @@ namespace gbe
       auto jip = fwdTargets.lower_bound(LabelIndex(0));
       JIPs.insert(std::make_pair(bra, *jip));
     }
+
   }
 
   void Context::handleSLM(void) {
diff --git a/backend/src/backend/context.hpp b/backend/src/backend/context.hpp
index 3faead2..1b3744b 100644
--- a/backend/src/backend/context.hpp
+++ b/backend/src/backend/context.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -21,6 +21,7 @@
 #define __GBE_CONTEXT_HPP__
 
 #include "ir/instruction.hpp"
+#include "ir/function.hpp"
 #include "backend/program.h"
 #include "sys/set.hpp"
 #include "sys/map.hpp"
diff --git a/backend/src/backend/gen/gen_mesa_disasm.c b/backend/src/backend/gen/gen_mesa_disasm.c
index c120b60..877c102 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.c
+++ b/backend/src/backend/gen/gen_mesa_disasm.c
@@ -1,10 +1,10 @@
-/* 
+/*
  * Copyright © 2012 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -49,6 +49,7 @@
 #include <assert.h>
 
 #include "backend/gen_defs.hpp"
+#include "backend/gen7_instruction.hpp"
 #include "src/cl_device_data.h"
 
 static const struct {
@@ -141,7 +142,7 @@ static const char *_abs[2] = {
   [1] = "(abs)",
 };
 
-static const char *vert_stride[16] = {
+static const char *vert_stride_gen7[16] = {
   [0] = "0",
   [1] = "1",
   [2] = "2",
@@ -151,6 +152,15 @@ static const char *vert_stride[16] = {
   [6] = "32",
   [15] = "VxH",
 };
+static const char *vert_stride_gen8[16] = {
+  [0] = "0",
+  [1] = "1",
+  [2] = "2",
+  [3] = "4",
+  [4] = "8",
+  [5] = "16",
+  [6] = "32",
+};
 
 static const char *width[8] = {
   [0] = "1",
@@ -232,10 +242,16 @@ static const char *pred_ctrl_align1[16] = {
   [11] = ".all16h",
 };
 
-static const char *thread_ctrl[4] = {
+static const char *thread_ctrl_gen7[4] = {
   [0] = "",
   [2] = "switch"
 };
+static const char *thread_ctrl_gen8[4] = {
+  [0] = "",
+  [1] = "atomic",
+  [2] = "switch"
+};
+
 
 static const char *dep_ctrl[4] = {
   [0] = "",
@@ -244,11 +260,6 @@ static const char *dep_ctrl[4] = {
   [3] = "NoDDClr,NoDDChk",
 };
 
-static const char *mask_ctrl[4] = {
-  [0] = "",
-  [1] = "nomask",
-};
-
 static const char *access_mode[2] = {
   [0] = "align1",
   [1] = "align16",
@@ -307,31 +318,35 @@ static const char *end_of_thread[2] = {
   [1] = "EOT"
 };
 
-static const char *target_function_gen6[16] = {
+static const char *target_function_gen7[16] = {
   [GEN_SFID_NULL] = "null",
-  [GEN_SFID_MATH] = "math",
+  [GEN_SFID_RESERVED] = NULL,
   [GEN_SFID_SAMPLER] = "sampler",
   [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+  [GEN_SFID_DATAPORT_SAMPLER] = "dataport_sampler",
+  [GEN_SFID_DATAPORT_RENDER] = "render",
   [GEN_SFID_URB] = "urb",
   [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
-  [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
-  [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
-  [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
-  [GEN_SFID_DATAPORT_DATA_CACHE] = "data"
+  [GEN_SFID_VIDEO_MOTION_EST] = "video_motion_estimation",
+  [GEN_SFID_DATAPORT_CONSTANT] = "const",
+  [GEN_SFID_DATAPORT_DATA] = "data",
+  [GEN_SFID_PIXEL_INTERPOLATOR] = "pix_interpolator",
 };
 
 static const char *target_function_gen75[16] = {
   [GEN_SFID_NULL] = "null",
-  [GEN_SFID_MATH] = "math",
+  [GEN_SFID_RESERVED] = NULL,
   [GEN_SFID_SAMPLER] = "sampler",
   [GEN_SFID_MESSAGE_GATEWAY] = "gateway",
+  [GEN_SFID_DATAPORT_SAMPLER] = "dataport_sampler",
+  [GEN_SFID_DATAPORT_RENDER] = "render",
   [GEN_SFID_URB] = "urb",
   [GEN_SFID_THREAD_SPAWNER] = "thread_spawner",
-  [GEN6_SFID_DATAPORT_SAMPLER_CACHE] = "sampler",
-  [GEN6_SFID_DATAPORT_RENDER_CACHE] = "render",
-  [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
-  [GEN_SFID_DATAPORT_DATA_CACHE] = "data (0)",
-  [GEN_SFID_DATAPORT1_DATA_CACHE] = "data (1)"
+  [GEN_SFID_VIDEO_MOTION_EST] = "video_motion_estimation",
+  [GEN_SFID_DATAPORT_CONSTANT] = "const",
+  [GEN_SFID_DATAPORT_DATA] = "data (0)",
+  [GEN_SFID_PIXEL_INTERPOLATOR] = "pix_interpolator",
+  [GEN_SFID_DATAPORT1_DATA] = "data (1)",
 };
 
 static const char *gateway_sub_function[8] = {
@@ -345,7 +360,7 @@ static const char *gateway_sub_function[8] = {
   [7] = "reserved"
 };
 
-static const char *math_function[16] = {
+static const char *math_function_gen7[16] = {
   [GEN_MATH_FUNCTION_INV] = "inv",
   [GEN_MATH_FUNCTION_LOG] = "log",
   [GEN_MATH_FUNCTION_EXP] = "exp",
@@ -359,25 +374,19 @@ static const char *math_function[16] = {
   [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT] = "intdiv",
   [GEN_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
 };
-
-static const char *math_saturate[2] = {
-  [0] = "",
-  [1] = "sat"
-};
-
-static const char *math_signed[2] = {
-  [0] = "",
-  [1] = "signed"
-};
-
-static const char *math_scalar[2] = {
-  [0] = "",
-  [1] = "scalar"
-};
-
-static const char *math_precision[2] = {
-  [0] = "",
-  [1] = "partial_precision"
+static const char *math_function_gen8[16] = {
+  [GEN_MATH_FUNCTION_INV] = "inv",
+  [GEN_MATH_FUNCTION_LOG] = "log",
+  [GEN_MATH_FUNCTION_EXP] = "exp",
+  [GEN_MATH_FUNCTION_SQRT] = "sqrt",
+  [GEN_MATH_FUNCTION_RSQ] = "rsq",
+  [GEN_MATH_FUNCTION_SIN] = "sin",
+  [GEN_MATH_FUNCTION_COS] = "cos",
+  [GEN_MATH_FUNCTION_FDIV] = "fdiv",
+  [GEN_MATH_FUNCTION_POW] = "pow",
+  [GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+  [GEN8_MATH_FUNCTION_INVM] = "invm",
+  [GEN8_MATH_FUNCTION_RSQRTM] = "rsqrtm",
 };
 
 static const char *data_port_data_cache_simd_mode[] = {
@@ -446,14 +455,82 @@ static const char *data_port1_data_cache_msg_type[] = {
 
 static int column;
 
-static int string (FILE *file, const char *string)
+static int gen_version;
+
+#define GEN_BITS_FIELD(inst, gen)                               \
+  ({                                                            \
+    int bits;                                                   \
+    if (gen_version < 80)                                       \
+      bits = ((const union Gen7NativeInstruction *)inst)->gen;  \
+    else                                                        \
+      bits = ((const union Gen8NativeInstruction *)inst)->gen;  \
+    bits;                                                       \
+  })
+
+#define GEN_BITS_FIELD2(inst, gen7, gen8)                       \
+  ({                                                            \
+    int bits;                                                   \
+    if (gen_version < 80)                                       \
+      bits = ((const union Gen7NativeInstruction *)inst)->gen7; \
+    else                                                        \
+      bits = ((const union Gen8NativeInstruction *)inst)->gen8; \
+    bits;                                                       \
+  })
+
+#define PRED_CTRL(inst)            GEN_BITS_FIELD(inst, header.predicate_control)
+#define PRED_INV(inst)             GEN_BITS_FIELD(inst, header.predicate_inverse)
+#define FLAG_REG_NR(inst)          GEN_BITS_FIELD2(inst, bits2.da1.flag_reg_nr, bits1.da1.flag_reg_nr)
+#define FLAG_SUB_REG_NR(inst)      GEN_BITS_FIELD2(inst, bits2.da1.flag_sub_reg_nr, bits1.da1.flag_sub_reg_nr)
+#define ACCESS_MODE(inst)          GEN_BITS_FIELD(inst, header.access_mode)
+#define MASK_CONTROL(inst)         GEN_BITS_FIELD2(inst, header.mask_control, bits1.da1.mask_control)
+#define DEPENDENCY_CONTROL(inst)   GEN_BITS_FIELD(inst, header.dependency_control)
+#define THREAD_CONTROL(inst)       GEN_BITS_FIELD(inst, header.thread_control)
+#define ACC_WR_CONTROL(inst)       GEN_BITS_FIELD(inst, header.acc_wr_control)
+#define QUARTER_CONTROL(inst)      GEN_BITS_FIELD(inst, header.quarter_control)
+#define END_OF_THREAD(inst)        GEN_BITS_FIELD(inst, bits3.generic_gen5.end_of_thread)
+#define OPCODE(inst)               GEN_BITS_FIELD(inst, header.opcode)
+#define SATURATE(inst)             GEN_BITS_FIELD(inst, header.saturate)
+#define DEBUG_CONTROL(inst)        GEN_BITS_FIELD(inst, header.debug_control)
+#define MATH_FUNCTION(inst)        GEN_BITS_FIELD(inst, header.destreg_or_condmod)
+#define MATH_SATURATE(inst)        GEN_BITS_FIELD(inst, bits3.math_gen5.saturate)
+#define MATH_SIGNED(inst)          GEN_BITS_FIELD(inst, bits3.math_gen5.int_type)
+#define MATH_SCALAR(inst)          GEN_BITS_FIELD(inst, bits3.math_gen5.data_type)
+#define MATH_PRECISION(inst)       GEN_BITS_FIELD(inst, bits3.math_gen5.precision)
+#define COND_DST_OR_MODIFIER(inst) GEN_BITS_FIELD(inst, header.destreg_or_condmod)
+#define EXECUTION_SIZE(inst)       GEN_BITS_FIELD(inst, header.execution_size)
+#define BRANCH_JIP(inst)           GEN_BITS_FIELD2(inst, bits3.gen7_branch.jip, bits3.gen8_branch.jip/8)
+#define BRANCH_UIP(inst)           GEN_BITS_FIELD2(inst, bits3.gen7_branch.uip, bits2.gen8_branch.uip/8)
+#define SAMPLE_BTI(inst)           GEN_BITS_FIELD(inst, bits3.sampler_gen7.bti)
+#define SAMPLER(inst)              GEN_BITS_FIELD(inst, bits3.sampler_gen7.sampler)
+#define SAMPLER_MSG_TYPE(inst)     GEN_BITS_FIELD(inst, bits3.sampler_gen7.msg_type)
+#define SAMPLER_SIMD_MODE(inst)    GEN_BITS_FIELD(inst, bits3.sampler_gen7.simd_mode)
+#define UNTYPED_RW_BTI(inst)       GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.bti)
+#define UNTYPED_RW_RGBA(inst)      GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.rgba)
+#define UNTYPED_RW_SIMD_MODE(inst) GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.simd_mode)
+#define UNTYPED_RW_CATEGORY(inst)  GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.category)
+#define UNTYPED_RW_MSG_TYPE(inst)  GEN_BITS_FIELD(inst, bits3.gen7_untyped_rw.msg_type)
+#define SCRATCH_RW_OFFSET(inst)    GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.offset)
+#define SCRATCH_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.block_size)
+#define SCRATCH_RW_INVALIDATE_AFTER_READ(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.invalidate_after_read)
+#define SCRATCH_RW_BLOCK_SIZE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.block_size)
+#define SCRATCH_RW_CHANNEL_MODE(inst) GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.channel_mode)
+#define SCRATCH_RW_MSG_TYPE(inst)  GEN_BITS_FIELD(inst, bits3.gen7_scratch_rw.msg_type)
+#define DWORD_RW_BTI(inst)         GEN_BITS_FIELD(inst, bits3.gen7_dword_rw.bti)
+#define DWORD_RW_MSG_TYPE(inst)    GEN_BITS_FIELD(inst, bits3.gen7_dword_rw.msg_type)
+#define MSG_GW_SUBFUNC(inst)       GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.subfunc)
+#define MSG_GW_NOTIFY(inst)        GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.notify)
+#define MSG_GW_ACKREQ(inst)        GEN_BITS_FIELD(inst, bits3.gen7_msg_gw.ackreq)
+#define GENERIC_MSG_LENGTH(inst)   GEN_BITS_FIELD(inst, bits3.generic_gen5.msg_length)
+#define GENERIC_RESPONSE_LENGTH(inst) GEN_BITS_FIELD(inst, bits3.generic_gen5.response_length)
+
+static int string(FILE *file, const char *string)
 {
   fputs (string, file);
   column += strlen (string);
   return 0;
 }
 
-static int format (FILE *f, const char *format, ...)
+static int format(FILE *f, const char *format, ...)
 {
   char    buf[1024];
   va_list	args;
@@ -461,267 +538,270 @@ static int format (FILE *f, const char *format, ...)
 
   vsnprintf (buf, sizeof (buf) - 1, format, args);
   va_end (args);
-  string (f, buf);
+  string(f, buf);
   return 0;
 }
 
-static int newline (FILE *f)
+static int newline(FILE *f)
 {
   putc ('\n', f);
   column = 0;
   return 0;
 }
 
-static int pad (FILE *f, int c)
+static int pad(FILE *f, int c)
 {
   do
-    string (f, " ");
+    string(f, " ");
   while (column < c);
   return 0;
 }
 
-static int flag_reg (FILE *file, const int flag_nr, const int flag_sub_reg_nr)
+static int flag_reg(FILE *file, const int flag_nr, const int flag_sub_reg_nr)
 {
   if (flag_nr || flag_sub_reg_nr)
-    return format (file, ".f%d.%d", flag_nr, flag_sub_reg_nr);
+    return format(file, ".f%d.%d", flag_nr, flag_sub_reg_nr);
   return 0;
 }
 
-static int control (FILE *file, const char *name, const char *ctrl[], uint32_t id, int *space)
+static int control(FILE *file, const char *name, const char *ctrl[], uint32_t id, int *space)
 {
   if (!ctrl[id]) {
     fprintf (file, "*** invalid %s value %d ",
-        name, id);
+             name, id);
     return 1;
   }
-  if (ctrl[id][0])
-  {
+  if (ctrl[id][0]) {
     if (space && *space)
-      string (file, " ");
-    string (file, ctrl[id]);
+      string(file, " ");
+    string(file, ctrl[id]);
     if (space)
       *space = 1;
   }
   return 0;
 }
 
-static int print_opcode (FILE *file, int id)
+static int print_opcode(FILE *file, int id)
 {
   if (!opcode[id].name) {
-    format (file, "*** invalid opcode value %d ", id);
+    format(file, "*** invalid opcode value %d ", id);
     return 1;
   }
-  string (file, opcode[id].name);
+  string(file, opcode[id].name);
   return 0;
 }
 
-static int reg (FILE *file, uint32_t _reg_file, uint32_t _reg_nr)
+static int reg(FILE *file, uint32_t _reg_file, uint32_t _reg_nr)
 {
-  int	err = 0;
+  int err = 0;
 
   if (_reg_file == GEN_ARCHITECTURE_REGISTER_FILE) {
     switch (_reg_nr & 0xf0) {
       case GEN_ARF_NULL:
-        string (file, "null");
+        string(file, "null");
         return -1;
       case GEN_ARF_ADDRESS:
-        format (file, "a%d", _reg_nr & 0x0f);
+        format(file, "a%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_ACCUMULATOR:
-        format (file, "acc%d", _reg_nr & 0x0f);
+        format(file, "acc%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_FLAG:
-        format (file, "f%d", _reg_nr & 0x0f);
+        format(file, "f%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_MASK:
-        format (file, "mask%d", _reg_nr & 0x0f);
+        format(file, "mask%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_MASK_STACK:
-        format (file, "msd%d", _reg_nr & 0x0f);
+        format(file, "msd%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_STATE:
-        format (file, "sr%d", _reg_nr & 0x0f);
+        format(file, "sr%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_CONTROL:
-        format (file, "cr%d", _reg_nr & 0x0f);
+        format(file, "cr%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_NOTIFICATION_COUNT:
-        format (file, "n%d", _reg_nr & 0x0f);
+        format(file, "n%d", _reg_nr & 0x0f);
         break;
       case GEN_ARF_IP:
-        string (file, "ip");
+        string(file, "ip");
         return -1;
         break;
+      case GEN_ARF_TM:
+        format(file, "tm%d", _reg_nr & 0x0f);
+        break;
       default:
-        format (file, "ARF%d", _reg_nr);
+        format(file, "ARF%d", _reg_nr);
         break;
     }
   } else {
-    err  |= control (file, "src reg file", reg_file, _reg_file, NULL);
-    format (file, "%d", _reg_nr);
+    err |= control(file, "src reg file", reg_file, _reg_file, NULL);
+    format(file, "%d", _reg_nr);
   }
   return err;
 }
 
-static int dest (FILE *file, const union GenNativeInstruction *inst)
+static int dest(FILE *file, const void* inst)
 {
-  int	err = 0;
+  int err = 0;
 
-  if (inst->header.access_mode == GEN_ALIGN_1)
-  {
-    if (inst->bits1.da1.dest_address_mode == GEN_ADDRESS_DIRECT)
-    {
-      err |= reg (file, inst->bits1.da1.dest_reg_file, inst->bits1.da1.dest_reg_nr);
+  if (ACCESS_MODE(inst) == GEN_ALIGN_1) {
+    if (GEN_BITS_FIELD(inst, bits1.da1.dest_address_mode) == GEN_ADDRESS_DIRECT) {
+      err |= reg(file, GEN_BITS_FIELD(inst, bits1.da1.dest_reg_file),
+                 GEN_BITS_FIELD(inst, bits1.da1.dest_reg_nr));
       if (err == -1) {
-        control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
+        control(file, "dest reg encoding", reg_encoding, GEN_BITS_FIELD(inst, bits1.da1.dest_reg_type), NULL);
         return 0;
       }
-      if (inst->bits1.da1.dest_subreg_nr)
-        format (file, ".%d", inst->bits1.da1.dest_subreg_nr /
-            reg_type_size[inst->bits1.da1.dest_reg_type]);
-      format (file, "<%s>", horiz_stride[inst->bits1.da1.dest_horiz_stride]);
-      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da1.dest_reg_type, NULL);
-    }
-    else
-    {
-      string (file, "g[a0");
-      if (inst->bits1.ia1.dest_subreg_nr)
-        format (file, ".%d", inst->bits1.ia1.dest_subreg_nr /
-            reg_type_size[inst->bits1.ia1.dest_reg_type]);
-      if (inst->bits1.ia1.dest_indirect_offset)
-        format (file, " %d", inst->bits1.ia1.dest_indirect_offset);
-      string (file, "]");
-      format (file, "<%s>", horiz_stride[inst->bits1.ia1.dest_horiz_stride]);
-      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.ia1.dest_reg_type, NULL);
+      if (GEN_BITS_FIELD(inst, bits1.da1.dest_subreg_nr))
+        format(file, ".%d", GEN_BITS_FIELD(inst, bits1.da1.dest_subreg_nr) /
+               reg_type_size[GEN_BITS_FIELD(inst, bits1.da1.dest_reg_type)]);
+      format(file, "<%s>", horiz_stride[GEN_BITS_FIELD(inst, bits1.da1.dest_horiz_stride)]);
+      err |= control(file, "dest reg encoding", reg_encoding, GEN_BITS_FIELD(inst, bits1.da1.dest_reg_type), NULL);
+    } else {
+      string(file, "g[a0");
+      if (GEN_BITS_FIELD(inst, bits1.ia1.dest_subreg_nr))
+        format(file, ".%d", GEN_BITS_FIELD(inst, bits1.ia1.dest_subreg_nr) /
+               reg_type_size[GEN_BITS_FIELD(inst, bits1.ia1.dest_reg_type)]);
+      if (GEN_BITS_FIELD(inst, bits1.ia1.dest_indirect_offset))
+        format(file, " %d", GEN_BITS_FIELD(inst, bits1.ia1.dest_indirect_offset));
+      string(file, "]");
+      format(file, "<%s>", horiz_stride[GEN_BITS_FIELD(inst, bits1.ia1.dest_horiz_stride)]);
+      err |= control(file, "dest reg encoding", reg_encoding, GEN_BITS_FIELD(inst, bits1.ia1.dest_reg_type), NULL);
     }
-  }
-  else
-  {
-    if (inst->bits1.da16.dest_address_mode == GEN_ADDRESS_DIRECT)
-    {
-      err |= reg (file, inst->bits1.da16.dest_reg_file, inst->bits1.da16.dest_reg_nr);
+  } else {
+    if (GEN_BITS_FIELD(inst, bits1.da16.dest_address_mode) == GEN_ADDRESS_DIRECT) {
+      err |= reg(file, GEN_BITS_FIELD(inst, bits1.da16.dest_reg_file), GEN_BITS_FIELD(inst, bits1.da16.dest_reg_nr));
       if (err == -1)
         return 0;
-      if (inst->bits1.da16.dest_subreg_nr)
-        format (file, ".%d", inst->bits1.da16.dest_subreg_nr /
-            reg_type_size[inst->bits1.da16.dest_reg_type]);
-      string (file, "<1>");
-      err |= control (file, "writemask", writemask, inst->bits1.da16.dest_writemask, NULL);
-      err |= control (file, "dest reg encoding", reg_encoding, inst->bits1.da16.dest_reg_type, NULL);
-    }
-    else
-    {
+      if (GEN_BITS_FIELD(inst, bits1.da16.dest_subreg_nr))
+        format(file, ".%d", GEN_BITS_FIELD(inst, bits1.da16.dest_subreg_nr) /
+               reg_type_size[GEN_BITS_FIELD(inst, bits1.da16.dest_reg_type)]);
+      string(file, "<1>");
+      err |= control(file, "writemask", writemask, GEN_BITS_FIELD(inst, bits1.da16.dest_writemask), NULL);
+      err |= control(file, "dest reg encoding", reg_encoding, GEN_BITS_FIELD(inst, bits1.da16.dest_reg_type), NULL);
+    } else {
       err = 1;
-      string (file, "Indirect align16 address mode not supported");
+      string(file, "Indirect align16 address mode not supported");
     }
   }
 
   return 0;
 }
 
-static int dest_3src (FILE *file, const union GenNativeInstruction *inst)
+static int dest_3src(FILE *file, const void *inst)
 {
-  int	err = 0;
+  int err = 0;
   const uint32_t reg_file = GEN_GENERAL_REGISTER_FILE;
 
-  err |= reg (file, reg_file, inst->bits1.da3src.dest_reg_nr);
+  err |= reg(file, reg_file, GEN_BITS_FIELD(inst, bits1.da3src.dest_reg_nr));
   if (err == -1)
     return 0;
-  if (inst->bits1.da3src.dest_subreg_nr)
-    format (file, ".%d", inst->bits1.da3src.dest_subreg_nr);
-  string (file, "<1>");
-  err |= control (file, "writemask", writemask, inst->bits1.da3src.dest_writemask, NULL);
-  err |= control (file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
+  if (GEN_BITS_FIELD(inst, bits1.da3src.dest_subreg_nr))
+    format(file, ".%d", GEN_BITS_FIELD(inst, bits1.da3src.dest_subreg_nr));
+  string(file, "<1>");
+  err |= control(file, "writemask", writemask, GEN_BITS_FIELD(inst, bits1.da3src.dest_writemask), NULL);
+  err |= control(file, "dest reg encoding", reg_encoding, GEN_TYPE_F, NULL);
 
   return 0;
 }
 
-static int src_align1_region (FILE *file,
-    uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride)
+static int src_align1_region(FILE *file,
+                             uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride)
 {
   int err = 0;
-  string (file, "<");
-  err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
-  string (file, ",");
-  err |= control (file, "width", width, _width, NULL);
-  string (file, ",");
-  err |= control (file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
-  string (file, ">");
+  string(file, "<");
+  if (gen_version < 80) {
+    err |= control(file, "vert stride", vert_stride_gen7, _vert_stride, NULL);
+  } else {
+    err |= control(file, "vert stride", vert_stride_gen8, _vert_stride, NULL);
+  }
+  string(file, ",");
+  err |= control(file, "width", width, _width, NULL);
+  string(file, ",");
+  err |= control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+  string(file, ">");
   return err;
 }
 
-static int src_da1 (FILE *file, uint32_t type, uint32_t _reg_file,
-    uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride,
-    uint32_t reg_num, uint32_t sub_reg_num, uint32_t __abs, uint32_t _negate)
+static int src_da1(FILE *file, uint32_t type, uint32_t _reg_file,
+                   uint32_t _vert_stride, uint32_t _width, uint32_t _horiz_stride,
+                   uint32_t reg_num, uint32_t sub_reg_num, uint32_t __abs, uint32_t _negate)
 {
   int err = 0;
-  err |= control (file, "negate", negate, _negate, NULL);
-  err |= control (file, "abs", _abs, __abs, NULL);
+  err |= control(file, "negate", negate, _negate, NULL);
+  err |= control(file, "abs", _abs, __abs, NULL);
 
-  err |= reg (file, _reg_file, reg_num);
+  err |= reg(file, _reg_file, reg_num);
   if (err == -1)
     return 0;
   if (sub_reg_num)
-    format (file, ".%d", sub_reg_num / reg_type_size[type]); /* use formal style like spec */
-  src_align1_region (file, _vert_stride, _width, _horiz_stride);
-  err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    format(file, ".%d", sub_reg_num / reg_type_size[type]); /* show the subregister in units of the operand type's size, the way the spec writes it */
+  src_align1_region(file, _vert_stride, _width, _horiz_stride);
+  err |= control(file, "src reg encoding", reg_encoding, type, NULL);
   return err;
 }
 
-static int src_ia1 (FILE *file,
-                    uint32_t type,
-                    uint32_t _reg_file,
-                    int32_t _addr_imm,
-                    uint32_t _addr_subreg_nr,
-                    uint32_t _negate,
-                    uint32_t __abs,
-                    uint32_t _addr_mode,
-                    uint32_t _horiz_stride,
-                    uint32_t _width,
-                    uint32_t _vert_stride)
+static int src_ia1(FILE *file,
+                   uint32_t type,
+                   uint32_t _reg_file,
+                   int32_t _addr_imm,
+                   uint32_t _addr_subreg_nr,
+                   uint32_t _negate,
+                   uint32_t __abs,
+                   uint32_t _addr_mode,
+                   uint32_t _horiz_stride,
+                   uint32_t _width,
+                   uint32_t _vert_stride)
 {
   int err = 0;
-  err |= control (file, "negate", negate, _negate, NULL);
-  err |= control (file, "abs", _abs, __abs, NULL);
+  err |= control(file, "negate", negate, _negate, NULL);
+  err |= control(file, "abs", _abs, __abs, NULL);
 
-  string (file, "g[a0");
+  string(file, "g[a0");
   if (_addr_subreg_nr)
-    format (file, ".%d", _addr_subreg_nr);
+    format(file, ".%d", _addr_subreg_nr);
   if (_addr_imm)
-    format (file, " %d", _addr_imm);
-  string (file, "]");
-  src_align1_region (file, _vert_stride, _width, _horiz_stride);
-  err |= control (file, "src reg encoding", reg_encoding, type, NULL);
+    format(file, " %d", _addr_imm);
+  string(file, "]");
+  src_align1_region(file, _vert_stride, _width, _horiz_stride);
+  err |= control(file, "src reg encoding", reg_encoding, type, NULL);
   return err;
 }
 
-static int src_da16 (FILE *file,
-                     uint32_t _reg_type,
-                     uint32_t _reg_file,
-                     uint32_t _vert_stride,
-                     uint32_t _reg_nr,
-                     uint32_t _subreg_nr,
-                     uint32_t __abs,
-                     uint32_t _negate,
-                     uint32_t swz_x,
-                     uint32_t swz_y,
-                     uint32_t swz_z,
-                     uint32_t swz_w)
+static int src_da16(FILE *file,
+                    uint32_t _reg_type,
+                    uint32_t _reg_file,
+                    uint32_t _vert_stride,
+                    uint32_t _reg_nr,
+                    uint32_t _subreg_nr,
+                    uint32_t __abs,
+                    uint32_t _negate,
+                    uint32_t swz_x,
+                    uint32_t swz_y,
+                    uint32_t swz_z,
+                    uint32_t swz_w)
 {
   int err = 0;
-  err |= control (file, "negate", negate, _negate, NULL);
-  err |= control (file, "abs", _abs, __abs, NULL);
+  err |= control(file, "negate", negate, _negate, NULL);
+  err |= control(file, "abs", _abs, __abs, NULL);
 
-  err |= reg (file, _reg_file, _reg_nr);
+  err |= reg(file, _reg_file, _reg_nr);
   if (err == -1)
     return 0;
   if (_subreg_nr)
     /* bit4 for subreg number byte addressing. Make this same meaning as
        in da1 case, so output looks consistent. */
-    format (file, ".%d", 16 / reg_type_size[_reg_type]);
-  string (file, "<");
-  err |= control (file, "vert stride", vert_stride, _vert_stride, NULL);
-  string (file, ",4,1>");
+    format(file, ".%d", 16 / reg_type_size[_reg_type]);
+  string(file, "<");
+
+  if (gen_version < 80) {
+    err |= control(file, "vert stride", vert_stride_gen7, _vert_stride, NULL);
+  } else {
+    err |= control(file, "vert stride", vert_stride_gen8, _vert_stride, NULL);
+  }
+  string(file, ",4,1>");
   /*
    * Three kinds of swizzle display:
    *  identity - nothing printed
@@ -731,46 +811,41 @@ static int src_da16 (FILE *file,
   if (swz_x == GEN_CHANNEL_X &&
       swz_y == GEN_CHANNEL_Y &&
       swz_z == GEN_CHANNEL_Z &&
-      swz_w == GEN_CHANNEL_W)
-  {
+      swz_w == GEN_CHANNEL_W) {
     ;
+  } else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w) {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+  } else {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+    err |= control(file, "channel select", chan_sel, swz_y, NULL);
+    err |= control(file, "channel select", chan_sel, swz_z, NULL);
+    err |= control(file, "channel select", chan_sel, swz_w, NULL);
   }
-  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-  }
-  else
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-    err |= control (file, "channel select", chan_sel, swz_y, NULL);
-    err |= control (file, "channel select", chan_sel, swz_z, NULL);
-    err |= control (file, "channel select", chan_sel, swz_w, NULL);
-  }
-  err |= control (file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+  err |= control(file, "src da16 reg type", reg_encoding, _reg_type, NULL);
   return err;
 }
 
-static int src0_3src (FILE *file, const union GenNativeInstruction *inst)
+static int src0_3src(FILE *file, const void* inst)
 {
   int err = 0;
-  uint32_t swz_x = (inst->bits2.da3src.src0_swizzle >> 0) & 0x3;
-  uint32_t swz_y = (inst->bits2.da3src.src0_swizzle >> 2) & 0x3;
-  uint32_t swz_z = (inst->bits2.da3src.src0_swizzle >> 4) & 0x3;
-  uint32_t swz_w = (inst->bits2.da3src.src0_swizzle >> 6) & 0x3;
+  uint32_t swz_x = (GEN_BITS_FIELD(inst, bits2.da3src.src0_swizzle) >> 0) & 0x3;
+  uint32_t swz_y = (GEN_BITS_FIELD(inst, bits2.da3src.src0_swizzle) >> 2) & 0x3;
+  uint32_t swz_z = (GEN_BITS_FIELD(inst, bits2.da3src.src0_swizzle) >> 4) & 0x3;
+  uint32_t swz_w = (GEN_BITS_FIELD(inst, bits2.da3src.src0_swizzle) >> 6) & 0x3;
 
-  err |= control (file, "negate", negate, inst->bits1.da3src.src0_negate, NULL);
-  err |= control (file, "abs", _abs, inst->bits1.da3src.src0_abs, NULL);
+  err |= control(file, "negate", negate, GEN_BITS_FIELD(inst, bits1.da3src.src0_negate), NULL);
+  err |= control(file, "abs", _abs, GEN_BITS_FIELD(inst, bits1.da3src.src0_abs), NULL);
 
-  err |= reg (file, GEN_GENERAL_REGISTER_FILE, inst->bits2.da3src.src0_reg_nr);
+  err |= reg(file, GEN_GENERAL_REGISTER_FILE, GEN_BITS_FIELD(inst, bits2.da3src.src0_reg_nr));
   if (err == -1)
     return 0;
-  if (inst->bits2.da3src.src0_subreg_nr)
-    format (file, ".%d", inst->bits2.da3src.src0_subreg_nr);
-  string (file, "<4,1,1>");
-  err |= control (file, "src da16 reg type", reg_encoding,
-      GEN_TYPE_F, NULL);
+  if (GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr))
+    format(file, ".%d", GEN_BITS_FIELD(inst, bits2.da3src.src0_subreg_nr));
+  string(file, "<4,1,1>");
+  err |= control(file, "src da16 reg type", reg_encoding,
+                 GEN_TYPE_F, NULL);
   /*
    * Three kinds of swizzle display:
    *  identity - nothing printed
@@ -780,49 +855,42 @@ static int src0_3src (FILE *file, const union GenNativeInstruction *inst)
   if (swz_x == GEN_CHANNEL_X &&
       swz_y == GEN_CHANNEL_Y &&
       swz_z == GEN_CHANNEL_Z &&
-      swz_w == GEN_CHANNEL_W)
-  {
+      swz_w == GEN_CHANNEL_W) {
     ;
-  }
-  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-  }
-  else
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-    err |= control (file, "channel select", chan_sel, swz_y, NULL);
-    err |= control (file, "channel select", chan_sel, swz_z, NULL);
-    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  } else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w) {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+  } else {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+    err |= control(file, "channel select", chan_sel, swz_y, NULL);
+    err |= control(file, "channel select", chan_sel, swz_z, NULL);
+    err |= control(file, "channel select", chan_sel, swz_w, NULL);
   }
   return err;
 }
 
-static int src1_3src (FILE *file, const union GenNativeInstruction *inst)
+static int src1_3src(FILE *file, const void* inst)
 {
   int err = 0;
-  uint32_t swz_x = (inst->bits2.da3src.src1_swizzle >> 0) & 0x3;
-  uint32_t swz_y = (inst->bits2.da3src.src1_swizzle >> 2) & 0x3;
-  uint32_t swz_z = (inst->bits2.da3src.src1_swizzle >> 4) & 0x3;
-  uint32_t swz_w = (inst->bits2.da3src.src1_swizzle >> 6) & 0x3;
-  uint32_t src1_subreg_nr = (inst->bits2.da3src.src1_subreg_nr_low |
-      (inst->bits3.da3src.src1_subreg_nr_high << 2));
-
-  err |= control (file, "negate", negate, inst->bits1.da3src.src1_negate,
-      NULL);
-  err |= control (file, "abs", _abs, inst->bits1.da3src.src1_abs, NULL);
-
-  err |= reg (file, GEN_GENERAL_REGISTER_FILE,
-      inst->bits3.da3src.src1_reg_nr);
+  uint32_t swz_x = (GEN_BITS_FIELD(inst, bits2.da3src.src1_swizzle) >> 0) & 0x3;
+  uint32_t swz_y = (GEN_BITS_FIELD(inst, bits2.da3src.src1_swizzle) >> 2) & 0x3;
+  uint32_t swz_z = (GEN_BITS_FIELD(inst, bits2.da3src.src1_swizzle) >> 4) & 0x3;
+  uint32_t swz_w = (GEN_BITS_FIELD(inst, bits2.da3src.src1_swizzle) >> 6) & 0x3;
+  uint32_t src1_subreg_nr = (GEN_BITS_FIELD(inst, bits2.da3src.src1_subreg_nr_low) |
+                             (GEN_BITS_FIELD(inst, bits3.da3src.src1_subreg_nr_high) << 2));
+
+  err |= control(file, "negate", negate, GEN_BITS_FIELD(inst, bits1.da3src.src1_negate), NULL);
+  err |= control(file, "abs", _abs, GEN_BITS_FIELD(inst, bits1.da3src.src1_abs), NULL);
+
+  err |= reg(file, GEN_GENERAL_REGISTER_FILE, GEN_BITS_FIELD(inst, bits3.da3src.src1_reg_nr));
   if (err == -1)
     return 0;
   if (src1_subreg_nr)
-    format (file, ".%d", src1_subreg_nr);
-  string (file, "<4,1,1>");
-  err |= control (file, "src da16 reg type", reg_encoding,
-      GEN_TYPE_F, NULL);
+    format(file, ".%d", src1_subreg_nr);
+  string(file, "<4,1,1>");
+  err |= control(file, "src da16 reg type", reg_encoding,
+                 GEN_TYPE_F, NULL);
   /*
    * Three kinds of swizzle display:
    *  identity - nothing printed
@@ -832,48 +900,39 @@ static int src1_3src (FILE *file, const union GenNativeInstruction *inst)
   if (swz_x == GEN_CHANNEL_X &&
       swz_y == GEN_CHANNEL_Y &&
       swz_z == GEN_CHANNEL_Z &&
-      swz_w == GEN_CHANNEL_W)
-  {
+      swz_w == GEN_CHANNEL_W) {
     ;
-  }
-  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-  }
-  else
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-    err |= control (file, "channel select", chan_sel, swz_y, NULL);
-    err |= control (file, "channel select", chan_sel, swz_z, NULL);
-    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  } else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w) {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+  } else {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+    err |= control(file, "channel select", chan_sel, swz_y, NULL);
+    err |= control(file, "channel select", chan_sel, swz_z, NULL);
+    err |= control(file, "channel select", chan_sel, swz_w, NULL);
   }
   return err;
 }
 
-
-static int src2_3src (FILE *file, const union GenNativeInstruction *inst)
+static int src2_3src(FILE *file, const void* inst)
 {
   int err = 0;
-  uint32_t swz_x = (inst->bits3.da3src.src2_swizzle >> 0) & 0x3;
-  uint32_t swz_y = (inst->bits3.da3src.src2_swizzle >> 2) & 0x3;
-  uint32_t swz_z = (inst->bits3.da3src.src2_swizzle >> 4) & 0x3;
-  uint32_t swz_w = (inst->bits3.da3src.src2_swizzle >> 6) & 0x3;
-
-  err |= control (file, "negate", negate, inst->bits1.da3src.src2_negate,
-      NULL);
-  err |= control (file, "abs", _abs, inst->bits1.da3src.src2_abs, NULL);
-
-  err |= reg (file, GEN_GENERAL_REGISTER_FILE,
-      inst->bits3.da3src.src2_reg_nr);
+  uint32_t swz_x = (GEN_BITS_FIELD(inst, bits3.da3src.src2_swizzle) >> 0) & 0x3;
+  uint32_t swz_y = (GEN_BITS_FIELD(inst, bits3.da3src.src2_swizzle) >> 2) & 0x3;
+  uint32_t swz_z = (GEN_BITS_FIELD(inst, bits3.da3src.src2_swizzle) >> 4) & 0x3;
+  uint32_t swz_w = (GEN_BITS_FIELD(inst, bits3.da3src.src2_swizzle) >> 6) & 0x3;
+
+  err |= control(file, "negate", negate, GEN_BITS_FIELD(inst, bits1.da3src.src2_negate), NULL);
+  err |= control(file, "abs", _abs, GEN_BITS_FIELD(inst, bits1.da3src.src2_abs), NULL);
+  err |= reg(file, GEN_GENERAL_REGISTER_FILE, GEN_BITS_FIELD(inst, bits3.da3src.src2_reg_nr));
   if (err == -1)
     return 0;
-  if (inst->bits3.da3src.src2_subreg_nr)
-    format (file, ".%d", inst->bits3.da3src.src2_subreg_nr);
-  string (file, "<4,1,1>");
-  err |= control (file, "src da16 reg type", reg_encoding,
-      GEN_TYPE_F, NULL);
+  if (GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr))
+    format(file, ".%d", GEN_BITS_FIELD(inst, bits3.da3src.src2_subreg_nr));
+  string(file, "<4,1,1>");
+  err |= control(file, "src da16 reg type", reg_encoding,
+                 GEN_TYPE_F, NULL);
   /*
    * Three kinds of swizzle display:
    *  identity - nothing printed
@@ -883,170 +942,147 @@ static int src2_3src (FILE *file, const union GenNativeInstruction *inst)
   if (swz_x == GEN_CHANNEL_X &&
       swz_y == GEN_CHANNEL_Y &&
       swz_z == GEN_CHANNEL_Z &&
-      swz_w == GEN_CHANNEL_W)
-  {
+      swz_w == GEN_CHANNEL_W) {
     ;
-  }
-  else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w)
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-  }
-  else
-  {
-    string (file, ".");
-    err |= control (file, "channel select", chan_sel, swz_x, NULL);
-    err |= control (file, "channel select", chan_sel, swz_y, NULL);
-    err |= control (file, "channel select", chan_sel, swz_z, NULL);
-    err |= control (file, "channel select", chan_sel, swz_w, NULL);
+  } else if (swz_x == swz_y && swz_x == swz_z && swz_x == swz_w) {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+  } else {
+    string(file, ".");
+    err |= control(file, "channel select", chan_sel, swz_x, NULL);
+    err |= control(file, "channel select", chan_sel, swz_y, NULL);
+    err |= control(file, "channel select", chan_sel, swz_z, NULL);
+    err |= control(file, "channel select", chan_sel, swz_w, NULL);
   }
   return err;
 }
 
-static int imm (FILE *file, uint32_t type, const union GenNativeInstruction *inst) {
+static int imm(FILE *file, uint32_t type, const void* inst)
+{
   switch (type) {
     case GEN_TYPE_UD:
-      format (file, "0x%xUD", inst->bits3.ud);
+      format(file, "0x%xUD", GEN_BITS_FIELD(inst, bits3.ud));
       break;
     case GEN_TYPE_D:
-      format (file, "%dD", inst->bits3.d);
+      format(file, "%dD", GEN_BITS_FIELD(inst, bits3.d));
       break;
     case GEN_TYPE_UW:
-      format (file, "0x%xUW", (uint16_t) inst->bits3.ud);
+      format(file, "0x%xUW", (uint16_t) GEN_BITS_FIELD(inst, bits3.ud));
       break;
     case GEN_TYPE_W:
-      format (file, "%dW", (int16_t) inst->bits3.d);
+      format(file, "%dW", (int16_t) GEN_BITS_FIELD(inst, bits3.d));
       break;
     case GEN_TYPE_UB:
-      format (file, "0x%xUB", (int8_t) inst->bits3.ud);
+      format(file, "0x%xUB", (uint8_t) GEN_BITS_FIELD(inst, bits3.ud));
       break;
     case GEN_TYPE_VF:
-      format (file, "Vector Float");
+      format(file, "Vector Float");
       break;
     case GEN_TYPE_V:
-      format (file, "0x%xV", inst->bits3.ud);
+      format(file, "0x%xV", GEN_BITS_FIELD(inst, bits3.ud));
       break;
     case GEN_TYPE_F:
-      format (file, "%-gF", inst->bits3.f);
+      format(file, "%-gF", GEN_BITS_FIELD(inst, bits3.f));
   }
   return 0;
 }
 
-static int src0 (FILE *file, const union GenNativeInstruction *inst)
+static int src0(FILE *file, const void* inst)
 {
-  if (inst->bits1.da1.src0_reg_file == GEN_IMMEDIATE_VALUE)
-    return imm (file, inst->bits1.da1.src0_reg_type,
-        inst);
-  else if (inst->header.access_mode == GEN_ALIGN_1)
-  {
-    if (inst->bits2.da1.src0_address_mode == GEN_ADDRESS_DIRECT)
-    {
-      return src_da1 (file,
-          inst->bits1.da1.src0_reg_type,
-          inst->bits1.da1.src0_reg_file,
-          inst->bits2.da1.src0_vert_stride,
-          inst->bits2.da1.src0_width,
-          inst->bits2.da1.src0_horiz_stride,
-          inst->bits2.da1.src0_reg_nr,
-          inst->bits2.da1.src0_subreg_nr,
-          inst->bits2.da1.src0_abs,
-          inst->bits2.da1.src0_negate);
-    }
-    else
-    {
-      return src_ia1 (file,
-          inst->bits1.ia1.src0_reg_type,
-          inst->bits1.ia1.src0_reg_file,
-          inst->bits2.ia1.src0_indirect_offset,
-          inst->bits2.ia1.src0_subreg_nr,
-          inst->bits2.ia1.src0_negate,
-          inst->bits2.ia1.src0_abs,
-          inst->bits2.ia1.src0_address_mode,
-          inst->bits2.ia1.src0_horiz_stride,
-          inst->bits2.ia1.src0_width,
-          inst->bits2.ia1.src0_vert_stride);
-    }
-  }
-  else
-  {
-    if (inst->bits2.da16.src0_address_mode == GEN_ADDRESS_DIRECT)
-    {
-      return src_da16 (file,
-          inst->bits1.da16.src0_reg_type,
-          inst->bits1.da16.src0_reg_file,
-          inst->bits2.da16.src0_vert_stride,
-          inst->bits2.da16.src0_reg_nr,
-          inst->bits2.da16.src0_subreg_nr,
-          inst->bits2.da16.src0_abs,
-          inst->bits2.da16.src0_negate,
-          inst->bits2.da16.src0_swz_x,
-          inst->bits2.da16.src0_swz_y,
-          inst->bits2.da16.src0_swz_z,
-          inst->bits2.da16.src0_swz_w);
+  if (GEN_BITS_FIELD(inst, bits1.da1.src0_reg_file) == GEN_IMMEDIATE_VALUE)
+    return imm(file, GEN_BITS_FIELD(inst, bits1.da1.src0_reg_type), inst);
+  else if (ACCESS_MODE(inst) == GEN_ALIGN_1) {
+    if (GEN_BITS_FIELD(inst, bits2.da1.src0_address_mode) == GEN_ADDRESS_DIRECT) {
+      return src_da1(file,
+                     GEN_BITS_FIELD(inst, bits1.da1.src0_reg_type),
+                     GEN_BITS_FIELD(inst, bits1.da1.src0_reg_file),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_vert_stride),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_width),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_horiz_stride),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_reg_nr),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_subreg_nr),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_abs),
+                     GEN_BITS_FIELD(inst, bits2.da1.src0_negate));
+    } else {
+      return src_ia1(file,
+                     GEN_BITS_FIELD(inst, bits1.ia1.src0_reg_type),
+                     GEN_BITS_FIELD(inst, bits1.ia1.src0_reg_file),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_indirect_offset),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_subreg_nr),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_negate),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_abs),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_address_mode),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_horiz_stride),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_width),
+                     GEN_BITS_FIELD(inst, bits2.ia1.src0_vert_stride));
     }
-    else
-    {
-      string (file, "Indirect align16 address mode not supported");
+  } else {
+    if (GEN_BITS_FIELD(inst, bits2.da16.src0_address_mode) == GEN_ADDRESS_DIRECT) {
+      return src_da16(file,
+                      GEN_BITS_FIELD(inst, bits1.da16.src0_reg_type),
+                      GEN_BITS_FIELD(inst, bits1.da16.src0_reg_file),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_vert_stride),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_reg_nr),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_subreg_nr),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_abs),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_negate),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_swz_x),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_swz_y),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_swz_z),
+                      GEN_BITS_FIELD(inst, bits2.da16.src0_swz_w));
+    } else {
+      string(file, "Indirect align16 address mode not supported");
       return 1;
     }
   }
 }
 
-static int src1 (FILE *file, const union GenNativeInstruction *inst)
+static int src1(FILE *file, const void* inst)
 {
-  if (inst->bits1.da1.src1_reg_file == GEN_IMMEDIATE_VALUE)
-    return imm (file, inst->bits1.da1.src1_reg_type,
-        inst);
-  else if (inst->header.access_mode == GEN_ALIGN_1)
-  {
-    if (inst->bits3.da1.src1_address_mode == GEN_ADDRESS_DIRECT)
-    {
-      return src_da1 (file,
-          inst->bits1.da1.src1_reg_type,
-          inst->bits1.da1.src1_reg_file,
-          inst->bits3.da1.src1_vert_stride,
-          inst->bits3.da1.src1_width,
-          inst->bits3.da1.src1_horiz_stride,
-          inst->bits3.da1.src1_reg_nr,
-          inst->bits3.da1.src1_subreg_nr,
-          inst->bits3.da1.src1_abs,
-          inst->bits3.da1.src1_negate);
-    }
-    else
-    {
-      return src_ia1 (file,
-          inst->bits1.ia1.src1_reg_type,
-          inst->bits1.ia1.src1_reg_file,
-          inst->bits3.ia1.src1_indirect_offset,
-          inst->bits3.ia1.src1_subreg_nr,
-          inst->bits3.ia1.src1_negate,
-          inst->bits3.ia1.src1_abs,
-          inst->bits3.ia1.src1_address_mode,
-          inst->bits3.ia1.src1_horiz_stride,
-          inst->bits3.ia1.src1_width,
-          inst->bits3.ia1.src1_vert_stride);
-    }
-  }
-  else
-  {
-    if (inst->bits3.da16.src1_address_mode == GEN_ADDRESS_DIRECT)
-    {
-      return src_da16 (file,
-          inst->bits1.da16.src1_reg_type,
-          inst->bits1.da16.src1_reg_file,
-          inst->bits3.da16.src1_vert_stride,
-          inst->bits3.da16.src1_reg_nr,
-          inst->bits3.da16.src1_subreg_nr,
-          inst->bits3.da16.src1_abs,
-          inst->bits3.da16.src1_negate,
-          inst->bits3.da16.src1_swz_x,
-          inst->bits3.da16.src1_swz_y,
-          inst->bits3.da16.src1_swz_z,
-          inst->bits3.da16.src1_swz_w);
+  if (GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file) == GEN_IMMEDIATE_VALUE)
+    return imm(file, GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_type, bits2.da1.src1_reg_type),
+               inst);
+  else if (ACCESS_MODE(inst) == GEN_ALIGN_1) {
+    if (GEN_BITS_FIELD(inst, bits3.da1.src1_address_mode) == GEN_ADDRESS_DIRECT) {
+      return src_da1(file,
+                     GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_type, bits2.da1.src1_reg_type),
+                     GEN_BITS_FIELD2(inst, bits1.da1.src1_reg_file, bits2.da1.src1_reg_file),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_vert_stride),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_width),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_horiz_stride),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_reg_nr),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_subreg_nr),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_abs),
+                     GEN_BITS_FIELD(inst, bits3.da1.src1_negate));
+    } else {
+      return src_ia1(file,
+                     GEN_BITS_FIELD2(inst, bits1.ia1.src1_reg_type, bits2.ia1.src1_reg_type),
+                     GEN_BITS_FIELD2(inst, bits1.ia1.src1_reg_file, bits2.ia1.src1_reg_file),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_indirect_offset),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_subreg_nr),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_negate),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_abs),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_address_mode),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_horiz_stride),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_width),
+                     GEN_BITS_FIELD(inst, bits3.ia1.src1_vert_stride));
     }
-    else
-    {
-      string (file, "Indirect align16 address mode not supported");
+  } else {
+    if (GEN_BITS_FIELD(inst, bits3.da16.src1_address_mode) == GEN_ADDRESS_DIRECT) {
+      return src_da16(file,
+                      GEN_BITS_FIELD2(inst, bits1.da16.src1_reg_type, bits2.da16.src1_reg_type),
+                      GEN_BITS_FIELD2(inst, bits1.da16.src1_reg_file, bits2.da16.src1_reg_file),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_vert_stride),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_reg_nr),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_subreg_nr),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_abs),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_negate),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_swz_x),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_swz_y),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_swz_z),
+                      GEN_BITS_FIELD(inst, bits3.da16.src1_swz_w));
+    } else {
+      string(file, "Indirect align16 address mode not supported");
       return 1;
     }
   }
@@ -1061,242 +1097,233 @@ static const int esize[6] = {
   [5] = 32,
 };
 
-static int qtr_ctrl(FILE *file, const union GenNativeInstruction *inst)
+static int qtr_ctrl(FILE *file, const void* inst)
 {
-  int qtr_ctl = inst->header.quarter_control;
-  int exec_size = esize[inst->header.execution_size];
+  int qtr_ctl = QUARTER_CONTROL(inst);
+  int exec_size = esize[EXECUTION_SIZE(inst)];
 
   if (exec_size == 8) {
     switch (qtr_ctl) {
       case 0:
-        string (file, " 1Q");
+        string(file, " 1Q");
         break;
       case 1:
-        string (file, " 2Q");
+        string(file, " 2Q");
         break;
       case 2:
-        string (file, " 3Q");
+        string(file, " 3Q");
         break;
       case 3:
-        string (file, " 4Q");
+        string(file, " 4Q");
         break;
     }
-  } else if (exec_size == 16){
+  } else if (exec_size == 16) {
     if (qtr_ctl < 2)
-      string (file, " 1H");
+      string(file, " 1H");
     else
-      string (file, " 2H");
+      string(file, " 2H");
   }
   return 0;
 }
 
-int gen_disasm (FILE *file, const void *opaque_insn, uint32_t deviceID, uint32_t compacted)
+int gen_disasm (FILE *file, const void *inst, uint32_t deviceID, uint32_t compacted)
 {
-  const union GenNativeInstruction *inst = (const union GenNativeInstruction *) opaque_insn;
-  int	err = 0;
+  int err = 0;
   int space = 0;
-  int gen = 70;
   if (IS_IVYBRIDGE(deviceID)) {
-    gen = 70;
+    gen_version = 70;
   } else if (IS_HASWELL(deviceID)) {
-    gen = 75;
+    gen_version = 75;
+  } else if (IS_BROADWELL(deviceID)) {
+    gen_version = 80;
   }
 
-  if (inst->header.predicate_control) {
-    string (file, "(");
-    err |= control (file, "predicate inverse", pred_inv, inst->header.predicate_inverse, NULL);
-    format (file, "f%d", inst->bits2.da1.flag_reg_nr);
-    if (inst->bits2.da1.flag_sub_reg_nr)
-      format (file, ".%d", inst->bits2.da1.flag_sub_reg_nr);
-    if (inst->header.access_mode == GEN_ALIGN_1)
-      err |= control (file, "predicate control align1", pred_ctrl_align1,
-          inst->header.predicate_control, NULL);
+  if (PRED_CTRL(inst)) {
+    string(file, "(");
+    err |= control(file, "predicate inverse", pred_inv, PRED_INV(inst), NULL);
+    format(file, "f%d", FLAG_REG_NR(inst));
+    if (FLAG_SUB_REG_NR(inst))
+      format(file, ".%d", FLAG_SUB_REG_NR(inst));
+    if (ACCESS_MODE(inst) == GEN_ALIGN_1)
+      err |= control(file, "predicate control align1", pred_ctrl_align1,
+                     PRED_CTRL(inst), NULL);
     else
-      err |= control (file, "predicate control align16", pred_ctrl_align16,
-          inst->header.predicate_control, NULL);
-    string (file, ") ");
+      err |= control(file, "predicate control align16", pred_ctrl_align16,
+                     PRED_CTRL(inst), NULL);
+    string(file, ") ");
   }
 
-  err |= print_opcode (file, inst->header.opcode);
-  err |= control (file, "saturate", saturate, inst->header.saturate, NULL);
-  err |= control (file, "debug control", debug_ctrl, inst->header.debug_control, NULL);
-
-  if (inst->header.opcode == GEN_OPCODE_MATH) {
-    string (file, " ");
-    err |= control (file, "function", math_function,
-        inst->header.destreg_or_condmod, NULL);
-  } else if (inst->header.opcode != GEN_OPCODE_SEND &&
-      inst->header.opcode != GEN_OPCODE_SENDC) {
-    err |= control (file, "conditional modifier", conditional_modifier,
-                    inst->header.destreg_or_condmod, NULL);
-    if (inst->header.destreg_or_condmod)
-      err |= flag_reg (file,
-                       inst->bits2.da1.flag_reg_nr,
-                       inst->bits2.da1.flag_sub_reg_nr);
-  }
+  err |= print_opcode(file, OPCODE(inst));
+  err |= control(file, "saturate", saturate, SATURATE(inst), NULL);
+  err |= control(file, "debug control", debug_ctrl, DEBUG_CONTROL(inst), NULL);
+
+  if (OPCODE(inst) == GEN_OPCODE_MATH) {
+    string(file, " ");
+    if (gen_version < 80) {
+      err |= control(file, "function", math_function_gen7,
+                     MATH_FUNCTION(inst), &space);
+    } else {
+      err |= control(file, "function", math_function_gen8,
+                     MATH_FUNCTION(inst), &space);
+    }
 
-  if (inst->header.opcode != GEN_OPCODE_NOP) {
-    string (file, "(");
-    err |= control (file, "execution size", exec_size, inst->header.execution_size, NULL);
-    string (file, ")");
+  } else if (OPCODE(inst) != GEN_OPCODE_SEND &&
+             OPCODE(inst) != GEN_OPCODE_SENDC) {
+    err |= control(file, "conditional modifier", conditional_modifier,
+                   COND_DST_OR_MODIFIER(inst), NULL);
+    if (COND_DST_OR_MODIFIER(inst))
+      err |= flag_reg(file,
+                      FLAG_REG_NR(inst),
+                      FLAG_SUB_REG_NR(inst));
   }
 
-  if (inst->header.opcode == GEN_OPCODE_SEND && gen < 60)
-    format (file, " %d", inst->header.destreg_or_condmod);
+  if (OPCODE(inst) != GEN_OPCODE_NOP) {
+    string(file, "(");
+    err |= control(file, "execution size", exec_size, EXECUTION_SIZE(inst), NULL);
+    string(file, ")");
+  }
 
-  if (opcode[inst->header.opcode].nsrc == 3) {
-    pad (file, 16);
-    err |= dest_3src (file, inst);
+  if (opcode[OPCODE(inst)].nsrc == 3) {
+    pad(file, 16);
+    err |= dest_3src(file, inst);
 
-    pad (file, 32);
-    err |= src0_3src (file, inst);
+    pad(file, 32);
+    err |= src0_3src(file, inst);
 
-    pad (file, 48);
-    err |= src1_3src (file, inst);
+    pad(file, 48);
+    err |= src1_3src(file, inst);
 
-    pad (file, 64);
-    err |= src2_3src (file, inst);
+    pad(file, 64);
+    err |= src2_3src(file, inst);
   } else {
-    if (opcode[inst->header.opcode].ndst > 0) {
-      pad (file, 16);
-      err |= dest (file, inst);
-    } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_IF ||
-          inst->header.opcode == GEN_OPCODE_ELSE ||
-          inst->header.opcode == GEN_OPCODE_ENDIF ||
-          inst->header.opcode == GEN_OPCODE_WHILE ||
-          inst->header.opcode == GEN_OPCODE_BRD ||
-          inst->header.opcode == GEN_OPCODE_JMPI)) {
-      format(file, " %d", (int16_t)inst->bits3.gen7_branch.jip);
-    } else if (gen >= 60 && (inst->header.opcode == GEN_OPCODE_BREAK ||
-          inst->header.opcode == GEN_OPCODE_CONTINUE ||
-          inst->header.opcode == GEN_OPCODE_HALT ||
-          inst->header.opcode == GEN_OPCODE_BRC)) {
-      format (file, " %d %d", inst->bits3.gen7_branch.jip, inst->bits3.gen7_branch.uip);
+    if (opcode[OPCODE(inst)].ndst > 0) {
+      pad(file, 16);
+      err |= dest(file, inst);
+    } else if (OPCODE(inst) == GEN_OPCODE_IF ||
+               OPCODE(inst) == GEN_OPCODE_ELSE ||
+               OPCODE(inst) == GEN_OPCODE_ENDIF ||
+               OPCODE(inst) == GEN_OPCODE_WHILE ||
+               OPCODE(inst) == GEN_OPCODE_BRD ||
+               OPCODE(inst) == GEN_OPCODE_JMPI) {
+      format(file, " %d", (int16_t)BRANCH_JIP(inst));
+    } else if (OPCODE(inst) == GEN_OPCODE_BREAK ||
+               OPCODE(inst) == GEN_OPCODE_CONTINUE ||
+               OPCODE(inst) == GEN_OPCODE_HALT ||
+               OPCODE(inst) == GEN_OPCODE_BRC) {
+      format(file, " %d %d", BRANCH_JIP(inst), BRANCH_UIP(inst));
     }/* else if (inst->header.opcode == GEN_OPCODE_JMPI) {
-      format (file, " %d", inst->bits3.d);
+      format(file, " %d", inst->bits3.d);
     }*/
 
-    if (opcode[inst->header.opcode].nsrc > 0) {
-      pad (file, 32);
-      err |= src0 (file, inst);
+    if (opcode[OPCODE(inst)].nsrc > 0) {
+      pad(file, 32);
+      err |= src0(file, inst);
     }
-    if (opcode[inst->header.opcode].nsrc > 1) {
-      pad (file, 48);
-      err |= src1 (file, inst);
+    if (opcode[OPCODE(inst)].nsrc > 1) {
+      pad(file, 48);
+      err |= src1(file, inst);
     }
   }
 
-  if (inst->header.opcode == GEN_OPCODE_SEND ||
-      inst->header.opcode == GEN_OPCODE_SENDC) {
-    enum GenMessageTarget target = inst->header.destreg_or_condmod;
+  if (OPCODE(inst) == GEN_OPCODE_SEND ||
+      OPCODE(inst) == GEN_OPCODE_SENDC) {
+    enum GenMessageTarget target = COND_DST_OR_MODIFIER(inst);
 
-    newline (file);
-    pad (file, 16);
+    newline(file);
+    pad(file, 16);
     space = 0;
 
-    if(gen == 75) {
-      err |= control (file, "target function", target_function_gen75,
-             target, &space);
+    if(gen_version >= 75) {
+      err |= control(file, "target function", target_function_gen75,
+                     target, &space);
     } else {
-      err |= control (file, "target function", target_function_gen6,
-             target, &space);
+      err |= control(file, "target function", target_function_gen7,
+                     target, &space);
     }
 
     switch (target) {
-      case GEN_SFID_MATH:
-        err |= control (file, "math function", math_function,
-            inst->bits3.math_gen5.function, &space);
-        err |= control (file, "math saturate", math_saturate,
-            inst->bits3.math_gen5.saturate, &space);
-        err |= control (file, "math signed", math_signed,
-            inst->bits3.math_gen5.int_type, &space);
-        err |= control (file, "math scalar", math_scalar,
-            inst->bits3.math_gen5.data_type, &space);
-        err |= control (file, "math precision", math_precision,
-            inst->bits3.math_gen5.precision, &space);
-        break;
       case GEN_SFID_SAMPLER:
-        format (file, " (%d, %d, %d, %d)",
-                inst->bits3.sampler_gen7.bti,
-                inst->bits3.sampler_gen7.sampler,
-                inst->bits3.sampler_gen7.msg_type,
-                inst->bits3.sampler_gen7.simd_mode);
+        format(file, " (%d, %d, %d, %d)",
+               SAMPLE_BTI(inst),
+               SAMPLER(inst),
+               SAMPLER_MSG_TYPE(inst),
+               SAMPLER_SIMD_MODE(inst));
         break;
-      case GEN_SFID_DATAPORT_DATA_CACHE:
-        if(inst->bits3.gen7_untyped_rw.category == 0) {
-          format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
-                  inst->bits3.gen7_untyped_rw.bti,
-                  inst->bits3.gen7_untyped_rw.rgba,
-                  data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
-                  data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
-                  data_port_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+      case GEN_SFID_DATAPORT_DATA:
+        if(UNTYPED_RW_CATEGORY(inst) == 0) {
+          format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+                 UNTYPED_RW_BTI(inst),
+                 UNTYPED_RW_RGBA(inst),
+                 data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+                 data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+                 data_port_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
         } else {
-          format (file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
-                  inst->bits3.gen7_scratch_rw.offset,
-                  data_port_scratch_block_size[inst->bits3.gen7_scratch_rw.block_size],
-                  data_port_scratch_invalidate[inst->bits3.gen7_scratch_rw.invalidate_after_read],
-                  data_port_scratch_channel_mode[inst->bits3.gen7_scratch_rw.channel_mode],
-                  data_port_scratch_msg_type[inst->bits3.gen7_scratch_rw.msg_type]);
+          format(file, " (addr: %d, blocks: %s, %s, mode: %s, %s)",
+                 SCRATCH_RW_OFFSET(inst),
+                 data_port_scratch_block_size[SCRATCH_RW_BLOCK_SIZE(inst)],
+                 data_port_scratch_invalidate[SCRATCH_RW_INVALIDATE_AFTER_READ(inst)],
+                 data_port_scratch_channel_mode[SCRATCH_RW_CHANNEL_MODE(inst)],
+                 data_port_scratch_msg_type[SCRATCH_RW_MSG_TYPE(inst)]);
         }
         break;
-      case GEN_SFID_DATAPORT1_DATA_CACHE:
-        format (file, " (bti: %d, rgba: %d, %s, %s, %s)",
-                inst->bits3.gen7_untyped_rw.bti,
-                inst->bits3.gen7_untyped_rw.rgba,
-                data_port_data_cache_simd_mode[inst->bits3.gen7_untyped_rw.simd_mode],
-                data_port_data_cache_category[inst->bits3.gen7_untyped_rw.category],
-                data_port1_data_cache_msg_type[inst->bits3.gen7_untyped_rw.msg_type]);
+      case GEN_SFID_DATAPORT1_DATA:
+        format(file, " (bti: %d, rgba: %d, %s, %s, %s)",
+               UNTYPED_RW_BTI(inst),
+               UNTYPED_RW_RGBA(inst),
+               data_port_data_cache_simd_mode[UNTYPED_RW_SIMD_MODE(inst)],
+               data_port_data_cache_category[UNTYPED_RW_CATEGORY(inst)],
+               data_port1_data_cache_msg_type[UNTYPED_RW_MSG_TYPE(inst)]);
         break;
-      case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
-        format (file, " (bti: %d, %s)",
-                inst->bits3.gen7_dword_rw.bti,
-                data_port_data_cache_msg_type[inst->bits3.gen7_dword_rw.msg_type]);
+      case GEN_SFID_DATAPORT_CONSTANT:
+        format(file, " (bti: %d, %s)",
+               DWORD_RW_BTI(inst),
+               data_port_data_cache_msg_type[DWORD_RW_MSG_TYPE(inst)]);
         break;
       case GEN_SFID_MESSAGE_GATEWAY:
-        format (file, " (subfunc: %s, notify: %d, ackreq: %d)",
-            gateway_sub_function[inst->bits3.gen7_msg_gw.subfunc],
-            inst->bits3.gen7_msg_gw.notify,
-            inst->bits3.gen7_msg_gw.ackreq);
+        format(file, " (subfunc: %s, notify: %d, ackreq: %d)",
+               gateway_sub_function[MSG_GW_SUBFUNC(inst)],
+               MSG_GW_NOTIFY(inst),
+               MSG_GW_ACKREQ(inst));
         break;
 
       default:
-        format (file, "unsupported target %d", target);
+        format(file, "unsupported target %d", target);
         break;
     }
     if (space)
-      string (file, " ");
-    format (file, "mlen %d", inst->bits3.generic_gen5.msg_length);
-    format (file, " rlen %d", inst->bits3.generic_gen5.response_length);
+      string(file, " ");
+    format(file, "mlen %d", GENERIC_MSG_LENGTH(inst));
+    format(file, " rlen %d", GENERIC_RESPONSE_LENGTH(inst));
   }
-  pad (file, 64);
-  if (inst->header.opcode != GEN_OPCODE_NOP) {
-    string (file, "{");
+  pad(file, 64);
+  if (OPCODE(inst) != GEN_OPCODE_NOP) {
+    string(file, "{");
     space = 1;
-    err |= control(file, "access mode", access_mode, inst->header.access_mode, &space);
-    if (gen >= 60)
-      err |= control (file, "write enable control", wectrl, inst->header.mask_control, &space);
-    else
-      err |= control (file, "mask control", mask_ctrl, inst->header.mask_control, &space);
-    err |= control (file, "dependency control", dep_ctrl, inst->header.dependency_control, &space);
-
-    err |= qtr_ctrl (file, inst);
-    err |= control (file, "thread control", thread_ctrl, inst->header.thread_control, &space);
-    if (gen >= 60)
-      err |= control (file, "acc write control", accwr, inst->header.acc_wr_control, &space);
-    if (inst->header.opcode == GEN_OPCODE_SEND ||
-        inst->header.opcode == GEN_OPCODE_SENDC)
-      err |= control (file, "end of thread", end_of_thread,
-          inst->bits3.generic_gen5.end_of_thread, &space);
+    err |= control(file, "access mode", access_mode, ACCESS_MODE(inst), &space);
+    err |= control(file, "write enable control", wectrl, MASK_CONTROL(inst), &space);
+    err |= control(file, "dependency control", dep_ctrl, DEPENDENCY_CONTROL(inst), &space);
+
+    err |= qtr_ctrl(file, inst);
+    if (gen_version < 80) {
+      err |= control(file, "thread control", thread_ctrl_gen7, THREAD_CONTROL(inst), &space);
+    } else {
+      err |= control(file, "thread control", thread_ctrl_gen8, THREAD_CONTROL(inst), &space);
+    }
+    err |= control(file, "acc write control", accwr, ACC_WR_CONTROL(inst), &space);
+    if (OPCODE(inst) == GEN_OPCODE_SEND ||
+        OPCODE(inst) == GEN_OPCODE_SENDC)
+      err |= control(file, "end of thread", end_of_thread,
+                     END_OF_THREAD(inst), &space);
 
     if(compacted) {
       string(file, " Compacted");
     }
     if (space)
-      string (file, " ");
-    string (file, "}");
+      string(file, " ");
+    string(file, "}");
   }
-  string (file, ";");
-  newline (file);
+  string(file, ";");
+  newline(file);
   return err;
 }
 
diff --git a/backend/src/backend/gen/gen_mesa_disasm.h b/backend/src/backend/gen/gen_mesa_disasm.h
index ae007a4..c9e3d3c 100644
--- a/backend/src/backend/gen/gen_mesa_disasm.h
+++ b/backend/src/backend/gen/gen_mesa_disasm.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/backend/gen75_context.cpp b/backend/src/backend/gen75_context.cpp
index da0db85..1f9591e 100644
--- a/backend/src/backend/gen75_context.cpp
+++ b/backend/src/backend/gen75_context.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen75_context.hpp
index 6f62b02..19fadb2 100644
--- a/backend/src/backend/gen75_context.hpp
+++ b/backend/src/backend/gen75_context.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/backend/gen75_encoder.cpp b/backend/src/backend/gen75_encoder.cpp
index 69d2de0..c77ce4d 100644
--- a/backend/src/backend/gen75_encoder.cpp
+++ b/backend/src/backend/gen75_encoder.cpp
@@ -38,27 +38,28 @@ static const uint32_t untypedRWMask[] = {
 namespace gbe
 {
   void Gen75Encoder::setHeader(GenNativeInstruction *insn) {
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
     if (this->curr.execWidth == 8)
-      insn->header.execution_size = GEN_WIDTH_8;
+      gen7_insn->header.execution_size = GEN_WIDTH_8;
     else if (this->curr.execWidth == 16)
-      insn->header.execution_size = GEN_WIDTH_16;
+      gen7_insn->header.execution_size = GEN_WIDTH_16;
     else if (this->curr.execWidth == 1)
-      insn->header.execution_size = GEN_WIDTH_1;
+      gen7_insn->header.execution_size = GEN_WIDTH_1;
     else if (this->curr.execWidth == 4)
-      insn->header.execution_size = GEN_WIDTH_4;
+      gen7_insn->header.execution_size = GEN_WIDTH_4;
     else
       NOT_IMPLEMENTED;
-    insn->header.acc_wr_control = this->curr.accWrEnable;
-    insn->header.quarter_control = this->curr.quarterControl;
-    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
-    insn->header.mask_control = this->curr.noMask;
-    insn->bits2.ia1.flag_reg_nr = this->curr.flag;
-    insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    gen7_insn->header.acc_wr_control = this->curr.accWrEnable;
+    gen7_insn->header.quarter_control = this->curr.quarterControl;
+    gen7_insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
+    gen7_insn->header.mask_control = this->curr.noMask;
+    gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag;
+    gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
     if (this->curr.predicate != GEN_PREDICATE_NONE) {
-      insn->header.predicate_control = this->curr.predicate;
-      insn->header.predicate_inverse = this->curr.inversePredicate;
+      gen7_insn->header.predicate_control = this->curr.predicate;
+      gen7_insn->header.predicate_inverse = this->curr.inversePredicate;
     }
-    insn->header.saturate = this->curr.saturate;
+    gen7_insn->header.saturate = this->curr.saturate;
   }
 
   void Gen75Encoder::setDPUntypedRW(GenNativeInstruction *insn,
@@ -68,15 +69,16 @@ namespace gbe
                                     uint32_t msg_length,
                                     uint32_t response_length)
   {
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
-    insn->bits3.gen7_untyped_rw.msg_type = msg_type;
-    insn->bits3.gen7_untyped_rw.bti = bti;
-    insn->bits3.gen7_untyped_rw.rgba = rgba;
+    gen7_insn->bits3.gen7_untyped_rw.msg_type = msg_type;
+    gen7_insn->bits3.gen7_untyped_rw.bti = bti;
+    gen7_insn->bits3.gen7_untyped_rw.rgba = rgba;
     if (curr.execWidth == 8)
-      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+      gen7_insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
     else if (curr.execWidth == 16)
-      insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+      gen7_insn->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
     else
       NOT_SUPPORTED;
   }
@@ -84,17 +86,19 @@ namespace gbe
   void Gen75Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                           unsigned char msg_type, uint32_t msg_length, bool header_present)
   {
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
-    insn->bits3.gen7_typed_rw.bti = bti;
-    insn->bits3.gen7_typed_rw.msg_type = msg_type;
+    gen7_insn->bits3.gen7_typed_rw.bti = bti;
+    gen7_insn->bits3.gen7_typed_rw.msg_type = msg_type;
 
     /* Always using the low 8 slots here. */
-    insn->bits3.gen7_typed_rw.slot = 1;
+    gen7_insn->bits3.gen7_typed_rw.slot = 1;
   }
 
   void Gen75Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
     GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
     uint32_t msg_length = 0;
     uint32_t response_length = 0;
 
@@ -112,17 +116,17 @@ namespace gbe
     this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
 
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA_CACHE;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT1_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
-    insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
-    insn->bits3.gen7_atomic_op.bti = bti;
-    insn->bits3.gen7_atomic_op.return_data = 1;
-    insn->bits3.gen7_atomic_op.aop_type = function;
+    gen7_insn->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
+    gen7_insn->bits3.gen7_atomic_op.bti = bti;
+    gen7_insn->bits3.gen7_atomic_op.return_data = 1;
+    gen7_insn->bits3.gen7_atomic_op.aop_type = function;
 
     if (this->curr.execWidth == 8)
-      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+      gen7_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
     else if (this->curr.execWidth == 16)
-      insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+      gen7_insn->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
     else
       NOT_SUPPORTED;
   }
@@ -207,8 +211,9 @@ namespace gbe
     pop();
   }
 
-  void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+  void Gen75Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) {
     GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
     int w = curr.execWidth;
     GenRegister r0;
     r0 = GenRegister::h2(r);
@@ -246,24 +251,33 @@ namespace gbe
     alu2(this, GEN_OPCODE_JMPI, GenRegister::ip(), GenRegister::ip(), src);
   }
 
-  void Gen75Encoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+  void Gen75Encoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
     GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
     GBE_ASSERT(insnID < this->store.size());
     GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
                insn.header.opcode == GEN_OPCODE_BRD  ||
                insn.header.opcode == GEN_OPCODE_ENDIF ||
                insn.header.opcode == GEN_OPCODE_IF ||
-               insn.header.opcode == GEN_OPCODE_BRC);
+               insn.header.opcode == GEN_OPCODE_BRC ||
+               insn.header.opcode == GEN_OPCODE_WHILE ||
+               insn.header.opcode == GEN_OPCODE_ELSE);
 
-    if (insn.header.opcode == GEN_OPCODE_IF) {
-      this->setSrc1(&insn, GenRegister::immd(jumpDistance));
-      return;
+    if( insn.header.opcode == GEN_OPCODE_WHILE ){
+      // If this WHILE instruction jumps back to an ELSE instruction,
+      // add 2 to jip so that it targets the instruction following the ELSE.
+      GenNativeInstruction & insn_else = *(GenNativeInstruction *)&this->store[insnID+jip];
+      if(insn_else.header.opcode == GEN_OPCODE_ELSE){
+        jip += 2;
+      }
     }
+
+    if (insn.header.opcode != GEN_OPCODE_JMPI)
+      this->setSrc1(&insn, GenRegister::immd((jip & 0xffff) | (uip<<16)));
     else if (insn.header.opcode == GEN_OPCODE_JMPI) {
-      //jumpDistance'unit is Qword, and the HSW's offset of jmpi is in byte, so multi 8
-      jumpDistance = (jumpDistance - 2) * 8;
+      // jip is in Qword units, but HSW's JMPI offset is in bytes, so multiply by 8
+      jip = (jip - 2) * 8;
+      this->setSrc1(&insn, GenRegister::immd(jip));
     }
-
-    this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+    return;
   }
 } /* End of the name space. */
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen75_encoder.hpp
index c10dac9..9545157 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen75_encoder.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -23,12 +23,14 @@
 #define __GBE_GEN75_ENCODER_HPP__
 
 #include "backend/gen_encoder.hpp"
+#include "backend/gen7_encoder.hpp"
+
 
 namespace gbe
 {
   /* This class is used to implement the HSW
      specific logic for encoder. */
-  class Gen75Encoder : public GenEncoder
+  class Gen75Encoder : public Gen7Encoder
   {
   public:
     /*! exec width of the double data type */    
@@ -36,12 +38,12 @@ namespace gbe
     virtual ~Gen75Encoder(void) { }
 
     Gen75Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
-         : GenEncoder(simdWidth, gen, deviceID) { }
+         : Gen7Encoder(simdWidth, gen, deviceID) { }
 
     /*! Jump indexed instruction */
     virtual void JMPI(GenRegister src, bool longjmp = false);
     /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
-    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+    virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
     /*! Get double/long exec width */
     virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
diff --git a/backend/src/backend/gen7_encoder.cpp b/backend/src/backend/gen7_encoder.cpp
new file mode 100644
index 0000000..ecf5b39
--- /dev/null
+++ b/backend/src/backend/gen7_encoder.cpp
@@ -0,0 +1,244 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "backend/gen7_encoder.hpp"
+
+
+namespace gbe
+{
+  void Gen7Encoder::setHeader(GenNativeInstruction *insn) { // copy the current encoder state (this->curr) into the Gen7 header fields of insn
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+    if (this->curr.execWidth == 8) // translate the numeric exec width into its GEN_WIDTH_* encoding
+      gen7_insn->header.execution_size = GEN_WIDTH_8;
+    else if (this->curr.execWidth == 16)
+      gen7_insn->header.execution_size = GEN_WIDTH_16;
+    else if (this->curr.execWidth == 4)
+      gen7_insn->header.execution_size = GEN_WIDTH_4;
+    else if (this->curr.execWidth == 1)
+      gen7_insn->header.execution_size = GEN_WIDTH_1;
+    else
+      NOT_IMPLEMENTED; // only exec widths 1, 4, 8 and 16 are handled
+    gen7_insn->header.acc_wr_control = this->curr.accWrEnable;
+    gen7_insn->header.quarter_control = this->curr.quarterControl;
+    gen7_insn->bits1.ia1.nib_ctrl = this->curr.nibControl; // nib_ctrl lives in bits1, written through its ia1 view
+    gen7_insn->header.mask_control = this->curr.noMask;
+    gen7_insn->bits2.ia1.flag_reg_nr = this->curr.flag; // the flag register selection lives in bits2, written through its ia1 view
+    gen7_insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
+    if (this->curr.predicate != GEN_PREDICATE_NONE) { // the predicate fields are left untouched when no predicate is set
+      gen7_insn->header.predicate_control = this->curr.predicate;
+      gen7_insn->header.predicate_inverse = this->curr.inversePredicate;
+    }
+    gen7_insn->header.saturate = this->curr.saturate;
+  }
+
+  void Gen7Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) { // encode dest as the destination operand, using the da1 view of bits1
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+    if (dest.file != GEN_ARCHITECTURE_REGISTER_FILE)
+      assert(dest.nr < 128); // the register-number bound is only checked outside the architecture register file
+
+    gen7_insn->bits1.da1.dest_reg_file = dest.file;
+    gen7_insn->bits1.da1.dest_reg_type = dest.type;
+    gen7_insn->bits1.da1.dest_address_mode = dest.address_mode;
+    gen7_insn->bits1.da1.dest_reg_nr = dest.nr;
+    gen7_insn->bits1.da1.dest_subreg_nr = dest.subnr;
+    if (dest.hstride == GEN_HORIZONTAL_STRIDE_0) { // a zero stride is replaced by one picked from the element type (dest is a by-value copy)
+      if (dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B)
+        dest.hstride = GEN_HORIZONTAL_STRIDE_4; // byte types
+      else if (dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W)
+        dest.hstride = GEN_HORIZONTAL_STRIDE_2; // word types
+      else
+        dest.hstride = GEN_HORIZONTAL_STRIDE_1; // every other type
+    }
+    gen7_insn->bits1.da1.dest_horiz_stride = dest.hstride;
+  }
+
+  void Gen7Encoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) { // encode reg as source operand 0 of insn
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+    if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128); // the register-number bound is only checked outside the architecture register file
+
+    if (reg.address_mode == GEN_ADDRESS_DIRECT) { // direct addressing: use the da1/da16 views
+      gen7_insn->bits1.da1.src0_reg_file = reg.file;
+      gen7_insn->bits1.da1.src0_reg_type = reg.type;
+      gen7_insn->bits2.da1.src0_abs = reg.absolute;
+      gen7_insn->bits2.da1.src0_negate = reg.negation;
+      gen7_insn->bits2.da1.src0_address_mode = reg.address_mode;
+      if (reg.file == GEN_IMMEDIATE_VALUE) {
+        gen7_insn->bits3.ud = reg.value.ud; // the immediate value occupies all of bits3
+
+        /* Required to set some fields in src1 as well: */
+        gen7_insn->bits1.da1.src1_reg_file = 0; /* arf */
+        gen7_insn->bits1.da1.src1_reg_type = reg.type;
+      }
+      else {
+        if (gen7_insn->header.access_mode == GEN_ALIGN_1) {
+          gen7_insn->bits2.da1.src0_subreg_nr = reg.subnr;
+          gen7_insn->bits2.da1.src0_reg_nr = reg.nr;
+        } else { // any other access mode uses the da16 layout, whose sub-register field is one bit wide
+          gen7_insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+          gen7_insn->bits2.da16.src0_reg_nr = reg.nr;
+        }
+
+        if (reg.width == GEN_WIDTH_1 &&
+            gen7_insn->header.execution_size == GEN_WIDTH_1) { // width-1 source in a width-1 instruction: force all strides to 0
+          gen7_insn->bits2.da1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+          gen7_insn->bits2.da1.src0_width = GEN_WIDTH_1;
+          gen7_insn->bits2.da1.src0_vert_stride = GEN_VERTICAL_STRIDE_0;
+        }
+        else { // otherwise the region is taken from the register description as is
+          gen7_insn->bits2.da1.src0_horiz_stride = reg.hstride;
+          gen7_insn->bits2.da1.src0_width = reg.width;
+          gen7_insn->bits2.da1.src0_vert_stride = reg.vstride;
+        }
+      }
+    } else { // indirect addressing: fixed GRF file, zero sub-register and offset, no source modifiers
+      gen7_insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
+      gen7_insn->bits1.ia1.src0_reg_type = reg.type;
+      gen7_insn->bits2.ia1.src0_subreg_nr = 0;
+      gen7_insn->bits2.ia1.src0_indirect_offset = 0;
+      gen7_insn->bits2.ia1.src0_abs = 0;
+      gen7_insn->bits2.ia1.src0_negate = 0;
+      gen7_insn->bits2.ia1.src0_address_mode = reg.address_mode;
+      gen7_insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+      gen7_insn->bits2.ia1.src0_width = GEN_WIDTH_1;
+      gen7_insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+    }
+  }
+
+  void Gen7Encoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) { // encode reg as source operand 1 of insn
+    Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+    assert(reg.nr < 128);
+    assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0); // an architecture-file register is only accepted with number 0
+
+    gen7_insn->bits1.da1.src1_reg_file = reg.file;
+    gen7_insn->bits1.da1.src1_reg_type = reg.type;
+    gen7_insn->bits3.da1.src1_abs = reg.absolute;
+    gen7_insn->bits3.da1.src1_negate = reg.negation;
+
+    assert(gen7_insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE); // src0 must not already be an immediate: setSrc0 stores an immediate in bits3 too
+
+    if (reg.file == GEN_IMMEDIATE_VALUE)
+      gen7_insn->bits3.ud = reg.value.ud; // overwrites all of bits3, including the abs/negate bits written above
+    else {
+      assert (reg.address_mode == GEN_ADDRESS_DIRECT); // only direct addressing is handled for source 1
+      if (gen7_insn->header.access_mode == GEN_ALIGN_1) {
+        gen7_insn->bits3.da1.src1_subreg_nr = reg.subnr;
+        gen7_insn->bits3.da1.src1_reg_nr = reg.nr;
+      } else { // any other access mode uses the da16 layout, whose sub-register field is one bit wide
+        gen7_insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+        gen7_insn->bits3.da16.src1_reg_nr = reg.nr;
+      }
+
+      if (reg.width == GEN_WIDTH_1 &&
+          gen7_insn->header.execution_size == GEN_WIDTH_1) { // width-1 source in a width-1 instruction: force all strides to 0
+        gen7_insn->bits3.da1.src1_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+        gen7_insn->bits3.da1.src1_width = GEN_WIDTH_1;
+        gen7_insn->bits3.da1.src1_vert_stride = GEN_VERTICAL_STRIDE_0;
+      } else { // otherwise the region is taken from the register description as is
+        gen7_insn->bits3.da1.src1_horiz_stride = reg.hstride;
+        gen7_insn->bits3.da1.src1_width = reg.width;
+        gen7_insn->bits3.da1.src1_vert_stride = reg.vstride;
+      }
+    }
+  }
+
+#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+
+  void Gen7Encoder::alu3(uint32_t opcode, // emit a three-source instruction (da3src layout); all operands must be direct GRF floats
+                              GenRegister dest,
+                              GenRegister src0,
+                              GenRegister src1,
+                              GenRegister src2)
+  {
+     GenNativeInstruction *insn = this->next(opcode);
+     Gen7NativeInstruction *gen7_insn = &insn->gen7_insn;
+
+     assert(dest.file == GEN_GENERAL_REGISTER_FILE);
+     assert(dest.nr < 128);
+     assert(dest.address_mode == GEN_ADDRESS_DIRECT);
+     assert(dest.type == GEN_TYPE_F); // compare, do not assign: the check must match the ones on src0/src1/src2
+     gen7_insn->bits1.da3src.dest_reg_file = 0;
+     gen7_insn->bits1.da3src.dest_reg_nr = dest.nr;
+     gen7_insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
+     gen7_insn->bits1.da3src.dest_writemask = 0xf; // all four writemask bits set
+     this->setHeader(insn);
+     gen7_insn->header.access_mode = GEN_ALIGN_16; // set after setHeader(), which does not write access_mode
+     gen7_insn->header.execution_size = GEN_WIDTH_8; // overrides setHeader(): always width 8, width 16 is split below
+
+     assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src0.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src0.nr < 128);
+     assert(src0.type == GEN_TYPE_F);
+     gen7_insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
+     gen7_insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
+     gen7_insn->bits2.da3src.src0_reg_nr = src0.nr;
+     gen7_insn->bits1.da3src.src0_abs = src0.absolute;
+     gen7_insn->bits1.da3src.src0_negate = src0.negation;
+     gen7_insn->bits2.da3src.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0; // rep_ctrl is set for a zero vertical stride
+
+     assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src1.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src1.nr < 128);
+     assert(src1.type == GEN_TYPE_F);
+     gen7_insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
+     gen7_insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3; // src1's sub-register number is split: low two bits in bits2 ...
+     gen7_insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2; // ... and the remaining high bit in bits3
+     gen7_insn->bits2.da3src.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
+     gen7_insn->bits3.da3src.src1_reg_nr = src1.nr;
+     gen7_insn->bits1.da3src.src1_abs = src1.absolute;
+     gen7_insn->bits1.da3src.src1_negate = src1.negation;
+
+     assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src2.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src2.nr < 128);
+     assert(src2.type == GEN_TYPE_F);
+     gen7_insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
+     gen7_insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
+     gen7_insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
+     gen7_insn->bits3.da3src.src2_reg_nr = src2.nr;
+     gen7_insn->bits1.da3src.src2_abs = src2.absolute;
+     gen7_insn->bits1.da3src.src2_negate = src2.negation;
+
+     // Emit second half of the instruction: a copy of the first with quarter control Q2 and registers advanced by one
+     if (this->curr.execWidth == 16) {
+      GenNativeInstruction q1Insn = *insn; // snapshot taken before next() hands out the second slot
+      insn = this->next(opcode);
+      *insn = q1Insn;
+      gen7_insn = &insn->gen7_insn;
+      gen7_insn->header.quarter_control = GEN_COMPRESSION_Q2;
+      gen7_insn->bits1.da3src.dest_reg_nr++;
+      if (gen7_insn->bits2.da3src.src0_rep_ctrl == 0) // sources with rep_ctrl set keep their register number
+        gen7_insn->bits2.da3src.src0_reg_nr++;
+      if (gen7_insn->bits2.da3src.src1_rep_ctrl == 0)
+        gen7_insn->bits3.da3src.src1_reg_nr++;
+      if (gen7_insn->bits3.da3src.src2_rep_ctrl == 0)
+        gen7_insn->bits3.da3src.src2_reg_nr++;
+     }
+  }
+
+#undef NO_SWIZZLE
+}
diff --git a/backend/src/backend/gen7_encoder.hpp b/backend/src/backend/gen7_encoder.hpp
new file mode 100644
index 0000000..f009263
--- /dev/null
+++ b/backend/src/backend/gen7_encoder.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen7_encoder.hpp
+ */
+#ifndef __GBE_GEN7_ENCODER_HPP__
+#define __GBE_GEN7_ENCODER_HPP__
+
+#include "backend/gen_encoder.hpp"
+
+namespace gbe
+{
+  /* This class is used to implement the Gen7
+     specific logic for encoder. */
+  class Gen7Encoder : public GenEncoder /* Gen75Encoder derives from this class */
+  {
+  public:
+    /*! gen7 exec width of the double data type */
+    #define GEN7_DOUBLE_EXEC_WIDTH  8
+    virtual ~Gen7Encoder(void) { }
+
+    Gen7Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
+         : GenEncoder(simdWidth, gen, deviceID) { } /* all three arguments are forwarded to GenEncoder */
+
+    /*! Get double/long exec width */
+    virtual int getDoubleExecWidth(void) { return GEN7_DOUBLE_EXEC_WIDTH; }
+    virtual void setHeader(GenNativeInstruction *insn); /*!< fill the header fields from the current encoder state */
+    virtual void setDst(GenNativeInstruction *insn, GenRegister dest); /*!< encode the destination operand */
+    virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg); /*!< encode source operand 0 (register or immediate) */
+    virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg); /*!< encode source operand 1 (register or immediate) */
+    virtual void alu3(uint32_t opcode, GenRegister dst, /*!< emit a three-source instruction */
+                       GenRegister src0, GenRegister src1, GenRegister src2);
+  };
+}
+#endif /* __GBE_GEN7_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen7_instruction.hpp b/backend/src/backend/gen7_instruction.hpp
new file mode 100644
index 0000000..51f342b
--- /dev/null
+++ b/backend/src/backend/gen7_instruction.hpp
@@ -0,0 +1,525 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+
+#ifndef __GEN7_INSTRUCTION_HPP__
+#define __GEN7_INSTRUCTION_HPP__
+
+union Gen7NativeInstruction /* bit-level layout of one Gen7 instruction: header, bits1, bits2 and bits3, 32 bits each */
+{
+  struct {
+    struct {
+      uint32_t opcode:7;
+      uint32_t pad:1;
+      uint32_t access_mode:1;
+      uint32_t mask_control:1;
+      uint32_t dependency_control:2;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t destreg_or_condmod:4;
+      uint32_t acc_wr_control:1;
+      uint32_t cmpt_control:1;
+      uint32_t debug_control:1;
+      uint32_t saturate:1;
+    } header; /* first 32-bit group: opcode and execution controls */
+
+    union { /* second 32-bit group: register files/types plus the destination, one view per addressing form */
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_subreg_nr:5;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da1;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;        /* 0x00000c00 */
+        uint32_t src1_reg_type:3;        /* 0x00007000 */
+        uint32_t nib_ctrl:1;
+        int dest_indirect_offset:10;        /* offset against the deref'd address reg */
+        uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia1;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t nib_ctrl:1;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:1;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da16;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2, src1_reg_type:3, nib_ctrl:1; /* src1 fields added: without them this view had only 27 bits and nib_ctrl was not at bit 15 as in da1/ia1/da16 */
+        uint32_t dest_writemask:4;
+        int dest_indirect_offset:6;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia16;
+
+      struct {
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:3;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:3;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:3;
+        uint32_t pad:1;
+        int jump_count:16;
+      } branch_gen6;
+
+      struct {
+        uint32_t dest_reg_file:1;
+        uint32_t flag_subreg_num:1;
+        uint32_t pad0:2;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src2_abs:1;
+        uint32_t src2_negate:1;
+        uint32_t pad1:7;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_reg_nr:8;
+      } da3src;
+    } bits1;
+
+    union { /* third 32-bit group: source 0, or the first part of the three-source operands */
+      struct {
+        uint32_t src0_subreg_nr:5;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } da1;
+
+      struct {
+        int src0_indirect_offset:10;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } ia1;
+
+      struct {
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        uint32_t src0_subreg_nr:1;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } da16;
+
+      struct {
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        int src0_indirect_offset:6;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t pad:5;
+      } ia16;
+
+      struct {
+        uint32_t src0_rep_ctrl:1;
+        uint32_t src0_swizzle:8;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_reg_nr:8;
+        uint32_t pad0:1;
+        uint32_t src1_rep_ctrl:1;
+        uint32_t src1_swizzle:8;
+        uint32_t src1_subreg_nr_low:2;
+      } da3src;
+    } bits2;
+
+    union { /* fourth 32-bit group: source 1, an immediate, a message descriptor or branch offsets */
+      struct {
+        uint32_t src1_subreg_nr:5;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad0:7;
+      } da1;
+
+      struct {
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        uint32_t src1_subreg_nr:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } da16;
+
+      struct {
+        int  src1_indirect_offset:10;
+        uint32_t src1_subreg_nr:3;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad1:7;
+      } ia1;
+
+      struct {
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        int  src1_indirect_offset:6;
+        uint32_t src1_subreg_nr:3;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t pad0:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } ia16;
+
+      struct {
+        uint32_t function_control:19;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } generic_gen5;
+
+      struct {
+        uint32_t sub_function_id:3;
+        uint32_t pad0:11;
+        uint32_t ack_req:1;
+        uint32_t notify:2;
+        uint32_t pad1:2;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } msg_gateway;
+
+      struct {
+        uint32_t opcode:1;
+        uint32_t request:1;
+        uint32_t pad0:2;
+        uint32_t resource:1;
+        uint32_t pad1:14;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } spawner_gen5;
+
+      /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
+      struct {
+        uint32_t function:4;
+        uint32_t int_type:1;
+        uint32_t precision:1;
+        uint32_t saturate:1;
+        uint32_t data_type:1;
+        uint32_t snapshot:1;
+        uint32_t pad0:10;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } math_gen5;
+
+      struct {
+        uint32_t bti:8;
+        uint32_t sampler:4;
+        uint32_t msg_type:5;
+        uint32_t simd_mode:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } sampler_gen7;
+
+      /**
+       * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
+       *
+       * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
+       **/
+      struct {
+        uint32_t bti:8;
+        uint32_t msg_control:5;
+        uint32_t msg_type:3;
+        uint32_t pad0:3;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } gen6_dp_sampler_const_cache;
+
+      /*! Data port untyped read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t rgba:4;
+        uint32_t simd_mode:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_untyped_rw;
+
+      /*! Data port byte scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t simd_mode:1;
+        uint32_t ignored0:1;
+        uint32_t data_size:2;
+        uint32_t ignored1:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_byte_rw;
+
+      /*! Data port Scratch Read/ write */
+      struct {
+        uint32_t offset:12;
+        uint32_t block_size:2;
+        uint32_t ignored0:1;
+        uint32_t invalidate_after_read:1;
+        uint32_t channel_mode:1;
+        uint32_t msg_type:1;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_scratch_rw;
+
+      /*! Data port OBlock read / write */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:3;
+        uint32_t ignored:2;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_oblock_rw;
+
+      /*! Data port dword scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:2;
+        uint32_t ignored0:3;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t ignored1:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_dword_rw;
+
+      /*! Data port typed read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t chan_mask:4;
+        uint32_t slot:2;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_typed_rw;
+
+      /*! Memory fence */
+      struct {
+        uint32_t bti:8;
+        uint32_t pad:5;
+        uint32_t commit_enable:1;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_memory_fence;
+
+      /*! atomic messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t aop_type:4;
+        uint32_t simd_mode:1;
+        uint32_t return_data:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_atomic_op;
+
+      struct {
+        uint32_t src1_subreg_nr_high:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t pad0:1;
+        uint32_t src2_rep_ctrl:1;
+        uint32_t src2_swizzle:8;
+        uint32_t src2_subreg_nr:3;
+        uint32_t src2_reg_nr:8;
+        uint32_t pad1:2;
+      } da3src;
+
+      /*! Message gateway */
+      struct {
+        uint32_t subfunc:3;
+        uint32_t pad:11;
+        uint32_t ackreq:1;
+        uint32_t notify:2;
+        uint32_t pad2:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_msg_gw;
+
+      struct {
+        uint32_t jip:16;
+        uint32_t uip:16;
+      } gen7_branch;
+
+      int d; /* raw views of the whole group, used for immediates */
+      uint32_t ud;
+      float f;
+    } bits3;
+  };
+};
+#endif
diff --git a/backend/src/backend/gen8_context.cpp b/backend/src/backend/gen8_context.cpp
new file mode 100644
index 0000000..6e138e8
--- /dev/null
+++ b/backend/src/backend/gen8_context.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+/**
+ * \file gen8_context.cpp
+ */
+
+#include "backend/gen8_context.hpp"
+#include "backend/gen8_encoder.hpp"
+#include "backend/gen_program.hpp"
+#include "backend/gen_defs.hpp"
+#include "backend/gen_encoder.hpp"
+#include "backend/gen_insn_selection.hpp"
+#include "backend/gen_insn_scheduling.hpp"
+#include "backend/gen_reg_allocation.hpp"
+#include "sys/cvar.hpp"
+#include "ir/function.hpp"
+#include "ir/value.hpp"
+#include <cstring>
+
+namespace gbe
+{
+  void Gen8Context::emitSLMOffset(void) {
+    /* Intentionally empty: this implementation emits nothing. */
+  }
+
+  void Gen8Context::allocSLMOffsetCurbe(void) {
+    /* Intentionally empty: this implementation allocates nothing. */
+  }
+
+  uint32_t Gen8Context::alignScratchSize(uint32_t size){
+    /* 0 stays 0; any other size is rounded up by doubling, starting from 1024. */
+    uint32_t aligned = 1024;
+    while (aligned < size)
+      aligned <<= 1;
+    return size == 0 ? 0 : aligned;
+  }
+
+  void Gen8Context::newSelection(void) {
+    sel = GBE_NEW(Selection8, *this); /* store a freshly created Selection8 built from this context */
+  }
+}
diff --git a/backend/src/backend/gen75_context.hpp b/backend/src/backend/gen8_context.hpp
similarity index 67%
copy from backend/src/backend/gen75_context.hpp
copy to backend/src/backend/gen8_context.hpp
index 6f62b02..925e080 100644
--- a/backend/src/backend/gen75_context.hpp
+++ b/backend/src/backend/gen8_context.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -17,40 +17,40 @@
  */
 
 /**
- * \file gen75_context.hpp
+ * \file gen8_context.hpp
  */
-#ifndef __GBE_GEN75_CONTEXT_HPP__
-#define __GBE_GEN75_CONTEXT_HPP__
+#ifndef __GBE_GEN8_CONTEXT_HPP__
+#define __GBE_GEN8_CONTEXT_HPP__
 
 #include "backend/gen_context.hpp"
-#include "backend/gen75_encoder.hpp"
+#include "backend/gen8_encoder.hpp"
 
 namespace gbe
 {
   /* This class is used to implement the HSW
      specific logic for context. */
-  class Gen75Context : public GenContext
+  class Gen8Context : public GenContext
   {
   public:
-    virtual ~Gen75Context(void) { }
-    Gen75Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
+    virtual ~Gen8Context(void) { }
+    Gen8Context(const ir::Unit &unit, const std::string &name, uint32_t deviceID, bool relaxMath = false)
             : GenContext(unit, name, deviceID, relaxMath) {
     };
     /*! device's max srcatch buffer size */
-    #define GEN75_SCRATCH_SIZE  (2 * KB * KB)
-    /*! Emit the per-lane stack pointer computation */
-    virtual void emitStackPointer(void);
+    #define GEN8_SCRATCH_SIZE  (2 * KB * KB)
     /*! Align the scratch size to the device's scratch unit size */
     virtual uint32_t alignScratchSize(uint32_t size);
     /*! Get the device's max srcatch size */
     virtual uint32_t getScratchSize(void) {
       //Because the allocate is use uint16_t, so clamp it, need refine
-      return std::min(GEN75_SCRATCH_SIZE, 0x7fff);
+      return std::min(GEN8_SCRATCH_SIZE, 0x7fff);
     }
+    /*! Get the pointer argument size for curbe alloc */
+    virtual uint32_t getPointerSize(void) { return 8; }
 
   protected:
     virtual GenEncoder* generateEncoder(void) {
-      return GBE_NEW(Gen75Encoder, this->simdWidth, 75, deviceID);
+      return GBE_NEW(Gen8Encoder, this->simdWidth, 8, deviceID);
     }
 
   private:
@@ -59,4 +59,4 @@ namespace gbe
     virtual void newSelection(void);
   };
 }
-#endif /* __GBE_GEN75_CONTEXT_HPP__ */
+#endif /* __GBE_GEN8_CONTEXT_HPP__ */
diff --git a/backend/src/backend/gen8_encoder.cpp b/backend/src/backend/gen8_encoder.cpp
new file mode 100644
index 0000000..ae2d4eb
--- /dev/null
+++ b/backend/src/backend/gen8_encoder.cpp
@@ -0,0 +1,485 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+
+#include "backend/gen8_encoder.hpp"
+
+/* Channel masks handed to setDPUntypedRW() as its rgba argument; the
+   table is indexed with elemNum by UNTYPED_READ and UNTYPED_WRITE. */
+static const uint32_t untypedRWMask[] = {
+  GEN_UNTYPED_RED|GEN_UNTYPED_GREEN|GEN_UNTYPED_BLUE|GEN_UNTYPED_ALPHA, /* [0] */
+  GEN_UNTYPED_GREEN|GEN_UNTYPED_BLUE|GEN_UNTYPED_ALPHA,                 /* [1] */
+  GEN_UNTYPED_BLUE|GEN_UNTYPED_ALPHA,                                   /* [2] */
+  GEN_UNTYPED_ALPHA,                                                    /* [3] */
+  0                                                                     /* [4] */
+};
+
+namespace gbe
+{
+  /*! Copy the current encoder state (this->curr) into the header and the
+   *  flag/mask fields of insn. Predicate fields are only written when a
+   *  predicate is active. */
+  void Gen8Encoder::setHeader(GenNativeInstruction *insn) {
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+
+    // Translate the numeric exec width into its hardware encoding.
+    switch (this->curr.execWidth) {
+      case 8:
+        native->header.execution_size = GEN_WIDTH_8;
+        break;
+      case 16:
+        native->header.execution_size = GEN_WIDTH_16;
+        break;
+      case 1:
+        native->header.execution_size = GEN_WIDTH_1;
+        break;
+      case 4:
+        native->header.execution_size = GEN_WIDTH_4;
+        break;
+      default:
+        NOT_IMPLEMENTED;
+        break;
+    }
+
+    native->header.acc_wr_control = this->curr.accWrEnable;
+    native->header.quarter_control = this->curr.quarterControl;
+    native->header.nib_ctrl = this->curr.nibControl;
+
+    native->bits1.ia1.mask_control = this->curr.noMask;
+    native->bits1.ia1.flag_reg_nr = this->curr.flag;
+    native->bits1.ia1.flag_sub_reg_nr = this->curr.subFlag;
+
+    const bool predicated = this->curr.predicate != GEN_PREDICATE_NONE;
+    if (predicated) {
+      native->header.predicate_control = this->curr.predicate;
+      native->header.predicate_inverse = this->curr.inversePredicate;
+    }
+
+    native->header.saturate = this->curr.saturate;
+  }
+
+  /*! Fill the message descriptor of insn for an untyped read/write sent to
+   *  GEN_SFID_DATAPORT1_DATA.
+   *  bti             binding table index
+   *  rgba            channel mask
+   *  msg_type        data port message type
+   *  msg_length      message length passed to setMessageDescriptor
+   *  response_length response length passed to setMessageDescriptor
+   *  The SIMD mode is derived from the current exec width (8 or 16). */
+  void Gen8Encoder::setDPUntypedRW(GenNativeInstruction *insn,
+                                    uint32_t bti,
+                                    uint32_t rgba,
+                                    uint32_t msg_type,
+                                    uint32_t msg_length,
+                                    uint32_t response_length)
+  {
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    const GenMessageTarget target = GEN_SFID_DATAPORT1_DATA;
+
+    // The generic descriptor is written first, then the untyped fields.
+    setMessageDescriptor(insn, target, msg_length, response_length);
+    native->bits3.gen7_untyped_rw.msg_type = msg_type;
+    native->bits3.gen7_untyped_rw.bti = bti;
+    native->bits3.gen7_untyped_rw.rgba = rgba;
+
+    switch (curr.execWidth) {
+      case 8:
+        native->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD8;
+        break;
+      case 16:
+        native->bits3.gen7_untyped_rw.simd_mode = GEN_UNTYPED_SIMD16;
+        break;
+      default:
+        NOT_SUPPORTED;
+        break;
+    }
+  }
+
+  /*! Fill the message descriptor of insn for a typed write sent to
+   *  GEN_SFID_DATAPORT1_DATA. The response length is always 0. */
+  void Gen8Encoder::setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
+                                          unsigned char msg_type, uint32_t msg_length, bool header_present)
+  {
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    const GenMessageTarget target = GEN_SFID_DATAPORT1_DATA;
+
+    setMessageDescriptor(insn, target, msg_length, 0, header_present);
+
+    native->bits3.gen7_typed_rw.bti = bti;
+    native->bits3.gen7_typed_rw.msg_type = msg_type;
+    /* Always using the low 8 slots here. */
+    native->bits3.gen7_typed_rw.slot = 1;
+  }
+
+  /*! Half to float conversion, emitted as a MOV from src0 viewed as HF
+   *  into dest viewed as F. */
+  void Gen8Encoder::F16TO32(GenRegister dest, GenRegister src0) {
+    const GenRegister dstAsFloat = GenRegister::retype(dest, GEN_TYPE_F);
+    const GenRegister srcAsHalf = GenRegister::retype(src0, GEN_TYPE_HF);
+    MOV(dstAsFloat, srcAsHalf);
+  }
+
+  /*! Float to half conversion, emitted as a MOV from src0 viewed as F
+   *  into dest viewed as HF. */
+  void Gen8Encoder::F32TO16(GenRegister dest, GenRegister src0) {
+    const GenRegister dstAsHalf = GenRegister::retype(dest, GEN_TYPE_HF);
+    const GenRegister srcAsFloat = GenRegister::retype(src0, GEN_TYPE_F);
+    MOV(dstAsHalf, srcAsFloat);
+  }
+
+  /*! Emit an untyped atomic operation as a SEND to GEN_SFID_DATAPORT1_DATA.
+   *  dst      register receiving the returned data
+   *  function atomic operation, stored in the aop_type field
+   *  src      first register of the message payload
+   *  bti      binding table index
+   *  srcNum   number of sources; the message length is srcNum registers
+   *           at exec width 8 and twice that at exec width 16 */
+  void Gen8Encoder::ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    uint32_t payloadRegs = 0;
+    uint32_t returnRegs = 0;
+
+    switch (this->curr.execWidth) {
+      case 8:
+        payloadRegs = srcNum;
+        returnRegs = 1;
+        break;
+      case 16:
+        payloadRegs = 2 * srcNum;
+        returnRegs = 2;
+        break;
+      default:
+        NOT_IMPLEMENTED;
+        break;
+    }
+
+    this->setHeader(insn);
+    this->setDst(insn, GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+
+    const GenMessageTarget target = GEN_SFID_DATAPORT1_DATA;
+    setMessageDescriptor(insn, target, payloadRegs, returnRegs);
+    native->bits3.gen7_atomic_op.msg_type = GEN75_P1_UNTYPED_ATOMIC_OP;
+    native->bits3.gen7_atomic_op.bti = bti;
+    native->bits3.gen7_atomic_op.return_data = 1;
+    native->bits3.gen7_atomic_op.aop_type = function;
+
+    switch (this->curr.execWidth) {
+      case 8:
+        native->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD8;
+        break;
+      case 16:
+        native->bits3.gen7_atomic_op.simd_mode = GEN_ATOMIC_SIMD16;
+        break;
+      default:
+        NOT_SUPPORTED;
+        break;
+    }
+  }
+
+  /*! Emit an untyped surface read as a SEND.
+   *  dst     register receiving the response
+   *  src     register holding the message payload
+   *  bti     binding table index
+   *  elemNum number of elements to read, 1 to 4; it also selects the
+   *          channel mask from untypedRWMask
+   *  At exec width 8 the message is 1 register and the response elemNum
+   *  registers; both are doubled at exec width 16. */
+  void Gen8Encoder::UNTYPED_READ(GenRegister dst, GenRegister src, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    // Both bounds must hold: elemNum indexes the 5-entry untypedRWMask table.
+    assert(elemNum >= 1 && elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    if (this->curr.execWidth == 8) {
+      msg_length = 1;
+      response_length = elemNum;
+    } else if (this->curr.execWidth == 16) {
+      msg_length = 2;
+      response_length = 2 * elemNum;
+    } else
+      NOT_IMPLEMENTED;
+
+    this->setHeader(insn);
+    this->setDst(insn,  GenRegister::uw16grf(dst.nr, 0));
+    this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_READ,
+                   msg_length,
+                   response_length);
+  }
+
+  /*! Emit an untyped surface write as a SEND with a null destination.
+   *  msg     first register of the message payload
+   *  bti     binding table index
+   *  elemNum number of elements to write, 1 to 4; it also selects the
+   *          channel mask from untypedRWMask
+   *  The message is (1 + elemNum) registers at exec width 8 and twice that
+   *  at exec width 16; the response length is always 0. */
+  void Gen8Encoder::UNTYPED_WRITE(GenRegister msg, uint32_t bti, uint32_t elemNum) {
+    GenNativeInstruction *insn = this->next(GEN_OPCODE_SEND);
+    // Both bounds must hold: elemNum indexes the 5-entry untypedRWMask table.
+    assert(elemNum >= 1 && elemNum <= 4);
+    uint32_t msg_length = 0;
+    uint32_t response_length = 0;
+    this->setHeader(insn);
+    if (this->curr.execWidth == 8) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UD));
+      msg_length = 1 + elemNum;
+    } else if (this->curr.execWidth == 16) {
+      this->setDst(insn, GenRegister::retype(GenRegister::null(), GEN_TYPE_UW));
+      msg_length = 2 * (1 + elemNum);
+    }
+    else
+      NOT_IMPLEMENTED;
+    this->setSrc0(insn, GenRegister::ud8grf(msg.nr, 0));
+    this->setSrc1(insn, GenRegister::immud(0));
+    setDPUntypedRW(insn,
+                   bti,
+                   untypedRWMask[elemNum],
+                   GEN75_P1_UNTYPED_SURFACE_WRITE,
+                   msg_length,
+                   response_length);
+  }
+
+  void Gen8Encoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) { // Load the double constant `value` into dest, staging its bits in tmp.
+    union { double d; unsigned u[2]; } u; // view the double as two unsigned words (assumes 32-bit unsigned -- TODO confirm)
+    u.d = value;
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_UD);
+    push(); // first scope: width-1, unpredicated, no-mask moves
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.execWidth = 1;
+    MOV(r, GenRegister::immud(u.u[0])); // first word of the constant into r
+    MOV(GenRegister::suboffset(r, 1), GenRegister::immud(u.u[1])); // second word into suboffset(r, 1)
+    pop();
+    r.type = GEN_TYPE_DF; // from here r is read as a DF with a <0;1,0> region
+    r.vstride = GEN_VERTICAL_STRIDE_0;
+    r.width = GEN_WIDTH_1;
+    r.hstride = GEN_HORIZONTAL_STRIDE_0;
+    push(); // second scope: copy the staged constant into dest
+    uint32_t width = curr.execWidth; // remember the exec width before forcing it to 8 below
+    curr.execWidth = 8;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    curr.quarterControl = GEN_COMPRESSION_Q1;
+    MOV(dest, r);
+    if (width == 16) { // 16-wide: emit a second move under Q2 quarter control
+      curr.quarterControl = GEN_COMPRESSION_Q2;
+      MOV(GenRegister::offset(dest, 2), r); // destination advanced with offset(dest, 2)
+    }
+    pop();
+  }
+
+  void Gen8Encoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) { // Move between F and DF registers in 4-wide pieces, staging data through tmp.
+    GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F)); // only F -> DF or DF -> F is accepted
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
+    int w = curr.execWidth; // exec width on entry; decides whether the second half below is emitted
+    GenRegister r0;
+    r0 = GenRegister::h2(r); // h2() view of tmp -- presumably a stride-2 region; confirm in GenRegister
+    push();
+    curr.execWidth = 4;
+    curr.predicate = GEN_PREDICATE_NONE;
+    curr.noMask = 1;
+    MOV(r0, src0); // stage the first 4 elements of src0 in tmp with noMask set
+    MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 4)); // stage elements 4-7
+    curr.noMask = 0; // the moves into dest below run with noMask cleared
+    curr.quarterControl = 0;
+    curr.nibControl = 0;
+    MOV(dest, r0); // elements 0-3 into dest (quarter 0, nibble 0)
+    curr.nibControl = 1;
+    MOV(GenRegister::suboffset(dest, 4), GenRegister::suboffset(r0, 4)); // elements 4-7 (quarter 0, nibble 1)
+    pop();
+    if (w == 16) { // 16-wide: repeat the same sequence for elements 8-15
+      push();
+      curr.execWidth = 4;
+      curr.predicate = GEN_PREDICATE_NONE;
+      curr.noMask = 1;
+      MOV(r0, GenRegister::suboffset(src0, 8)); // stage elements 8-11
+      MOV(GenRegister::suboffset(r0, 4), GenRegister::suboffset(src0, 12)); // stage elements 12-15
+      curr.noMask = 0;
+      curr.quarterControl = 1;
+      curr.nibControl = 0;
+      MOV(GenRegister::suboffset(dest, 8), r0); // elements 8-11 into dest (quarter 1, nibble 0)
+      curr.nibControl = 1;
+      MOV(GenRegister::suboffset(dest, 12), GenRegister::suboffset(r0, 4)); // elements 12-15 (quarter 1, nibble 1)
+      pop();
+    }
+  }
+
+  /*! Emit a JMPI whose destination and first source are the ip register
+   *  and whose second source is src. The longjmp argument is not used. */
+  void Gen8Encoder::JMPI(GenRegister src, bool longjmp) {
+    const GenRegister ipReg = GenRegister::ip();
+    alu2(this, GEN_OPCODE_JMPI, ipReg, ipReg, src);
+  }
+
+  /*! Patch the branch instruction stored at index insnID with its jump
+   *  distances.
+   *  insnID index of the instruction in this->store
+   *  jip    jump distance; used by every accepted opcode
+   *  uip    second jump distance; only written for IF and ELSE, and ELSE
+   *         overrides it with jip
+   *  IF and ELSE get both distances written into their branch fields. All
+   *  other accepted opcodes get jip stored as the immediate of source 1.
+   *  Distances are multiplied by 8 when encoded. */
+  void Gen8Encoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
+    // Validate the index before the store is accessed, not after.
+    GBE_ASSERT(insnID < this->store.size());
+    GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
+    GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
+               insn.header.opcode == GEN_OPCODE_BRD  ||
+               insn.header.opcode == GEN_OPCODE_ENDIF ||
+               insn.header.opcode == GEN_OPCODE_IF ||
+               insn.header.opcode == GEN_OPCODE_BRC ||
+               insn.header.opcode == GEN_OPCODE_WHILE ||
+               insn.header.opcode == GEN_OPCODE_ELSE);
+
+    if( insn.header.opcode == GEN_OPCODE_WHILE ) {
+      // If this WHILE instruction jumps back to an ELSE instruction, the
+      // distance is extended so that it reaches the instruction after it.
+      // The look-back target must itself lie inside the store.
+      GBE_ASSERT(insnID + jip < this->store.size());
+      GenNativeInstruction & insn_else = *(GenNativeInstruction *)&this->store[insnID+jip];
+      if(insn_else.header.opcode == GEN_OPCODE_ELSE) {
+        jip += 2;
+      }
+    }
+
+    if(insn.header.opcode == GEN_OPCODE_ELSE)
+      uip = jip;
+
+    if (insn.header.opcode == GEN_OPCODE_IF ||
+        insn.header.opcode == GEN_OPCODE_ELSE) {
+      Gen8NativeInstruction *gen8_insn = &insn.gen8_insn;
+      this->setSrc0(&insn, GenRegister::immud(0));
+      gen8_insn->bits2.gen8_branch.uip = uip*8;
+      gen8_insn->bits3.gen8_branch.jip = jip*8;
+      return;
+    }
+    else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+      // The jump distance is counted in Qwords while the jmpi offset is in
+      // bytes, hence the multiplication by 8 below. The distance is first
+      // reduced by 2 Qwords (presumably the size of the JMPI itself --
+      // TODO confirm).
+      jip = (jip - 2);
+    }
+
+    this->setSrc1(&insn, GenRegister::immd(jip*8));
+  }
+
+  /*! Encode dest as the destination operand of insn. A horizontal stride
+   *  of 0 is replaced by a stride chosen from the register type: 4 for
+   *  byte types, 2 for word types and 1 for everything else. */
+  void Gen8Encoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    assert(dest.file == GEN_ARCHITECTURE_REGISTER_FILE || dest.nr < 128);
+
+    // Work out the stride that will be encoded.
+    uint32_t hstride = dest.hstride;
+    if (hstride == GEN_HORIZONTAL_STRIDE_0) {
+      const bool byteType = dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B;
+      const bool wordType = dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W;
+      if (byteType)
+        hstride = GEN_HORIZONTAL_STRIDE_4;
+      else if (wordType)
+        hstride = GEN_HORIZONTAL_STRIDE_2;
+      else
+        hstride = GEN_HORIZONTAL_STRIDE_1;
+    }
+
+    native->bits1.da1.dest_reg_file = dest.file;
+    native->bits1.da1.dest_reg_type = dest.type;
+    native->bits1.da1.dest_address_mode = dest.address_mode;
+    native->bits1.da1.dest_reg_nr = dest.nr;
+    native->bits1.da1.dest_subreg_nr = dest.subnr;
+    native->bits1.da1.dest_horiz_stride = hstride;
+  }
+
+  /*! Encode reg as source 0 of insn. Three cases are handled: an
+   *  indirectly addressed register, an immediate value and a directly
+   *  addressed register. */
+  void Gen8Encoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) {
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    assert(reg.file == GEN_ARCHITECTURE_REGISTER_FILE || reg.nr < 128);
+
+    // Indirect addressing: fixed region, zero offset, no source modifiers.
+    if (reg.address_mode != GEN_ADDRESS_DIRECT) {
+      native->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
+      native->bits1.ia1.src0_reg_type = reg.type;
+      native->bits2.ia1.src0_subreg_nr = 0;
+      native->bits2.ia1.src0_indirect_offset = 0;
+      native->bits2.ia1.src0_abs = 0;
+      native->bits2.ia1.src0_negate = 0;
+      native->bits2.ia1.src0_address_mode = reg.address_mode;
+      native->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
+      native->bits2.ia1.src0_width = GEN_WIDTH_1;
+      native->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+      return;
+    }
+
+    // Fields common to immediates and directly addressed registers.
+    native->bits1.da1.src0_reg_file = reg.file;
+    native->bits1.da1.src0_reg_type = reg.type;
+    native->bits2.da1.src0_abs = reg.absolute;
+    native->bits2.da1.src0_negate = reg.negation;
+    native->bits2.da1.src0_address_mode = reg.address_mode;
+
+    if (reg.file == GEN_IMMEDIATE_VALUE) {
+      native->bits3.ud = reg.value.ud;
+
+      /* Required to set some fields in src1 as well: */
+      native->bits2.da1.src1_reg_file = 0; /* arf */
+      native->bits2.da1.src1_reg_type = reg.type;
+      return;
+    }
+
+    // Register number and sub-register, encoded per access mode.
+    if (native->header.access_mode == GEN_ALIGN_1) {
+      native->bits2.da1.src0_subreg_nr = reg.subnr;
+      native->bits2.da1.src0_reg_nr = reg.nr;
+    } else {
+      native->bits2.da16.src0_subreg_nr = reg.subnr / 16;
+      native->bits2.da16.src0_reg_nr = reg.nr;
+    }
+
+    // A width-1 register in a width-1 instruction gets a <0;1,0> region;
+    // otherwise the region described by reg is used.
+    uint32_t hstride = reg.hstride;
+    uint32_t width = reg.width;
+    uint32_t vstride = reg.vstride;
+    if (reg.width == GEN_WIDTH_1 &&
+        native->header.execution_size == GEN_WIDTH_1) {
+      hstride = GEN_HORIZONTAL_STRIDE_0;
+      width = GEN_WIDTH_1;
+      vstride = GEN_VERTICAL_STRIDE_0;
+    }
+    native->bits2.da1.src0_horiz_stride = hstride;
+    native->bits2.da1.src0_width = width;
+    native->bits2.da1.src0_vert_stride = vstride;
+  }
+
+  /*! Encode reg as source 1 of insn. Source 0 must not be an immediate;
+   *  reg is either an immediate or a directly addressed register. */
+  void Gen8Encoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
+    Gen8NativeInstruction *native = &insn->gen8_insn;
+    assert(reg.nr < 128);
+    assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
+
+    native->bits2.da1.src1_reg_file = reg.file;
+    native->bits2.da1.src1_reg_type = reg.type;
+    native->bits3.da1.src1_abs = reg.absolute;
+    native->bits3.da1.src1_negate = reg.negation;
+
+    assert(native->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
+
+    // An immediate takes the whole of bits3.
+    if (reg.file == GEN_IMMEDIATE_VALUE) {
+      native->bits3.ud = reg.value.ud;
+      return;
+    }
+
+    assert (reg.address_mode == GEN_ADDRESS_DIRECT);
+
+    // Register number and sub-register, encoded per access mode.
+    if (native->header.access_mode == GEN_ALIGN_1) {
+      native->bits3.da1.src1_subreg_nr = reg.subnr;
+      native->bits3.da1.src1_reg_nr = reg.nr;
+    } else {
+      native->bits3.da16.src1_subreg_nr = reg.subnr / 16;
+      native->bits3.da16.src1_reg_nr = reg.nr;
+    }
+
+    // A width-1 register in a width-1 instruction gets a <0;1,0> region;
+    // otherwise the region described by reg is used.
+    uint32_t hstride = reg.hstride;
+    uint32_t width = reg.width;
+    uint32_t vstride = reg.vstride;
+    if (reg.width == GEN_WIDTH_1 &&
+        native->header.execution_size == GEN_WIDTH_1) {
+      hstride = GEN_HORIZONTAL_STRIDE_0;
+      width = GEN_WIDTH_1;
+      vstride = GEN_VERTICAL_STRIDE_0;
+    }
+    native->bits3.da1.src1_horiz_stride = hstride;
+    native->bits3.da1.src1_width = width;
+    native->bits3.da1.src1_vert_stride = vstride;
+  }
+
+/* Swizzle value selecting components 0, 1, 2, 3 in order (two bits each). */
+#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+
+  /*! Emit a three-source instruction in Align16 mode.
+   *  opcode         instruction opcode
+   *  dest           destination; must be a directly addressed float GRF
+   *  src0/src1/src2 sources; each must be a directly addressed float GRF
+   *  The instruction is encoded with execution size 8. When the current
+   *  exec width is 16, a second copy is emitted under Q2 quarter control
+   *  with the destination register advanced by one, and each source
+   *  register advanced by one unless its rep_ctrl bit is set. */
+  void Gen8Encoder::alu3(uint32_t opcode,
+                              GenRegister dest,
+                              GenRegister src0,
+                              GenRegister src1,
+                              GenRegister src2)
+  {
+     GenNativeInstruction *insn = this->next(opcode);
+     Gen8NativeInstruction *gen8_insn = &insn->gen8_insn;
+
+     assert(dest.file == GEN_GENERAL_REGISTER_FILE);
+     assert(dest.nr < 128);
+     assert(dest.address_mode == GEN_ADDRESS_DIRECT);
+     // Compare, do not assign: "=" made this check always pass.
+     assert(dest.type == GEN_TYPE_F);
+     //gen8_insn->bits1.da3src.dest_reg_file = 0;
+     gen8_insn->bits1.da3src.dest_reg_nr = dest.nr;
+     gen8_insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
+     gen8_insn->bits1.da3src.dest_writemask = 0xf;
+     this->setHeader(insn);
+     // These two header fields override what setHeader() just wrote.
+     gen8_insn->header.access_mode = GEN_ALIGN_16;
+     gen8_insn->header.execution_size = GEN_WIDTH_8;
+
+     assert(src0.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src0.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src0.nr < 128);
+     assert(src0.type == GEN_TYPE_F);
+     gen8_insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
+     gen8_insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
+     gen8_insn->bits2.da3src.src0_reg_nr = src0.nr;
+     gen8_insn->bits1.da3src.src0_abs = src0.absolute;
+     gen8_insn->bits1.da3src.src0_negate = src0.negation;
+     // rep_ctrl is set when the source has a vertical stride of 0.
+     gen8_insn->bits2.da3src.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;
+
+     assert(src1.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src1.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src1.nr < 128);
+     assert(src1.type == GEN_TYPE_F);
+     gen8_insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
+     // The src1 sub-register number is split across bits2 and bits3.
+     gen8_insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
+     gen8_insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
+     gen8_insn->bits2.da3src.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
+     gen8_insn->bits3.da3src.src1_reg_nr = src1.nr;
+     gen8_insn->bits1.da3src.src1_abs = src1.absolute;
+     gen8_insn->bits1.da3src.src1_negate = src1.negation;
+
+     assert(src2.file == GEN_GENERAL_REGISTER_FILE);
+     assert(src2.address_mode == GEN_ADDRESS_DIRECT);
+     assert(src2.nr < 128);
+     assert(src2.type == GEN_TYPE_F);
+     gen8_insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
+     gen8_insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
+     gen8_insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
+     gen8_insn->bits3.da3src.src2_reg_nr = src2.nr;
+     gen8_insn->bits1.da3src.src2_abs = src2.absolute;
+     gen8_insn->bits1.da3src.src2_negate = src2.negation;
+
+     // Emit second half of the instruction
+     if (this->curr.execWidth == 16) {
+      // Copy the first instruction before next() is called again.
+      GenNativeInstruction q1Insn = *insn;
+      insn = this->next(opcode);
+      *insn = q1Insn;
+      gen8_insn = &insn->gen8_insn;
+      gen8_insn->header.quarter_control = GEN_COMPRESSION_Q2;
+      gen8_insn->bits1.da3src.dest_reg_nr++;
+      if (gen8_insn->bits2.da3src.src0_rep_ctrl == 0)
+        gen8_insn->bits2.da3src.src0_reg_nr++;
+      if (gen8_insn->bits2.da3src.src1_rep_ctrl == 0)
+        gen8_insn->bits3.da3src.src1_reg_nr++;
+      if (gen8_insn->bits3.da3src.src2_rep_ctrl == 0)
+        gen8_insn->bits3.da3src.src2_reg_nr++;
+     }
+  }
+} /* End of the name space. */
diff --git a/backend/src/backend/gen75_encoder.hpp b/backend/src/backend/gen8_encoder.hpp
similarity index 64%
copy from backend/src/backend/gen75_encoder.hpp
copy to backend/src/backend/gen8_encoder.hpp
index c10dac9..e0d934f 100644
--- a/backend/src/backend/gen75_encoder.hpp
+++ b/backend/src/backend/gen8_encoder.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -17,10 +17,10 @@
  */
 
 /**
- * \file gen75_context.hpp
+ * \file gen8_encoder.hpp
  */
-#ifndef __GBE_GEN75_ENCODER_HPP__
-#define __GBE_GEN75_ENCODER_HPP__
+#ifndef __GBE_GEN8_ENCODER_HPP__
+#define __GBE_GEN8_ENCODER_HPP__
 
 #include "backend/gen_encoder.hpp"
 
@@ -28,22 +28,24 @@ namespace gbe
 {
   /* This class is used to implement the HSW
      specific logic for encoder. */
-  class Gen75Encoder : public GenEncoder
+  class Gen8Encoder : public GenEncoder
   {
   public:
-    /*! exec width of the double data type */    
-    #define GEN75_DOUBLE_EXEC_WIDTH  4
-    virtual ~Gen75Encoder(void) { }
+    /*! exec width of the double data type */
+    #define GEN8_DOUBLE_EXEC_WIDTH  4
+    virtual ~Gen8Encoder(void) { }
 
-    Gen75Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
+    Gen8Encoder(uint32_t simdWidth, uint32_t gen, uint32_t deviceID)
          : GenEncoder(simdWidth, gen, deviceID) { }
 
     /*! Jump indexed instruction */
     virtual void JMPI(GenRegister src, bool longjmp = false);
     /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
-    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+    virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
     /*! Get double/long exec width */
-    virtual int getDoubleExecWidth(void) { return GEN75_DOUBLE_EXEC_WIDTH; }
+    virtual int getDoubleExecWidth(void) { return GEN8_DOUBLE_EXEC_WIDTH; }
+    virtual void F16TO32(GenRegister dest, GenRegister src0);
+    virtual void F32TO16(GenRegister dest, GenRegister src0);
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     virtual void ATOMIC(GenRegister dst, uint32_t function, GenRegister src, uint32_t bti, uint32_t srcNum);
@@ -55,6 +57,12 @@ namespace gbe
     virtual void setTypedWriteMessage(GenNativeInstruction *insn, unsigned char bti,
                                       unsigned char msg_type, uint32_t msg_length,
                                       bool header_present);
+    virtual void setDst(GenNativeInstruction *insn, GenRegister dest);
+    virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg);
+    virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg);
+    virtual bool disableCompact() { return true; }
+    virtual void alu3(uint32_t opcode, GenRegister dst,
+                       GenRegister src0, GenRegister src1, GenRegister src2);
   };
 }
-#endif /* __GBE_GEN75_ENCODER_HPP__ */
+#endif /* __GBE_GEN8_ENCODER_HPP__ */
diff --git a/backend/src/backend/gen8_instruction.hpp b/backend/src/backend/gen8_instruction.hpp
new file mode 100644
index 0000000..8981fe7
--- /dev/null
+++ b/backend/src/backend/gen8_instruction.hpp
@@ -0,0 +1,529 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Rong Yang <rong.r.yang at intel.com>
+ */
+
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keith at tungstengraphics.com>
+  */
+
+#ifndef __GEN8_INSTRUCTION_HPP__
+#define __GEN8_INSTRUCTION_HPP__
+
+union Gen8NativeInstruction
+{
+  struct {
+    struct {
+      uint32_t opcode:7;
+      uint32_t pad:1;
+      uint32_t access_mode:1;
+      uint32_t dependency_control:2;
+      uint32_t nib_ctrl:1;
+      uint32_t quarter_control:2;
+      uint32_t thread_control:2;
+      uint32_t predicate_control:4;
+      uint32_t predicate_inverse:1;
+      uint32_t execution_size:3;
+      uint32_t destreg_or_condmod:4;
+      uint32_t acc_wr_control:1;
+      uint32_t cmpt_control:1;
+      uint32_t debug_control:1;
+      uint32_t saturate:1;
+    } header;
+
+    union {
+      struct {
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t mask_control:1;
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:4;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:4;
+        uint32_t pad:1;
+        uint32_t dest_subreg_nr:5;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da1;
+
+      struct {
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t mask_control:1;
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:4;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:4;
+        int dest_indirect_offset_9:1;        /* offset against the deref'd address reg bit9 */
+        int dest_indirect_offset:9;        /* offset against the deref'd address reg bit0-8 */
+        uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia1;
+
+      struct {
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t mask_control:1;
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:4;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:4;
+        uint32_t pad:1;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:1;
+        uint32_t dest_reg_nr:8;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } da16;
+
+      struct {
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t mask_control:1;
+        uint32_t dest_reg_file:2;
+        uint32_t dest_reg_type:4;
+        uint32_t src0_reg_file:2;
+        uint32_t src0_reg_type:4;
+        int dest_indirect_offset_9:1;        /* offset against the deref'd address reg bit9 */
+        uint32_t dest_writemask:4;
+        int dest_indirect_offset:5;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_horiz_stride:2;
+        uint32_t dest_address_mode:1;
+      } ia16;
+
+      struct {
+        uint32_t flag_sub_reg_nr:1;
+        uint32_t flag_reg_nr:1;
+        uint32_t mask_control:1;
+        uint32_t src1_type:1;
+        uint32_t src2_type:1;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src2_abs:1;
+        uint32_t src2_negate:1;
+        uint32_t src_type:3;
+        uint32_t dest_type:3;
+        uint32_t dest_writemask:4;
+        uint32_t dest_subreg_nr:3;
+        uint32_t dest_reg_nr:8;
+      } da3src;
+    }bits1;
+
+    union { /* bits2: alternative views of one 32-bit instruction dword (src0 operand layouts plus src1 file/type, or a branch offset); every struct below declares exactly 32 bits */
+      struct { /* src0 addressed by register number/sub-register, with region description (horiz stride, width, vert stride) */
+        uint32_t src0_subreg_nr:5;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t src1_reg_file:2; /* src1's file and type are stored in this dword, next to the src0 fields */
+        uint32_t src1_reg_type:4;
+        uint32_t pad:1;
+      } da1;
+
+      struct { /* same as da1 but src0 is given as an offset plus a 4-bit sub-register instead of reg_nr (presumably address-register indirect -- TODO confirm) */
+        int src0_indirect_offset:9; /* signed 9-bit field; the extra bit lives in src0_indirect_offset_9 below */
+        uint32_t src0_subreg_nr:4;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_horiz_stride:2;
+        uint32_t src0_width:3;
+        uint32_t src0_vert_stride:4;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:4;
+        uint32_t src0_indirect_offset_9:1; /* takes the bit that da1 leaves as pad; NOTE(review): unsigned here, but the dest/src1 counterparts are declared int */
+      } ia1;
+
+      struct { /* src0 with a 4-component swizzle (x,y in the low bits, z,w after the address mode) instead of horiz stride/width */
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        uint32_t src0_subreg_nr:1;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:4;
+        uint32_t pad:1;
+      } da16;
+
+      struct { /* swizzled src0 given as offset + sub-register; mirrors da16 with reg_nr replaced by the offset fields */
+        uint32_t src0_swz_x:2;
+        uint32_t src0_swz_y:2;
+        int src0_indirect_offset:5;
+        uint32_t src0_subreg_nr:4;
+        uint32_t src0_abs:1;
+        uint32_t src0_negate:1;
+        uint32_t src0_address_mode:1;
+        uint32_t src0_swz_z:2;
+        uint32_t src0_swz_w:2;
+        uint32_t pad0:1;
+        uint32_t src0_vert_stride:4;
+        uint32_t src1_reg_file:2;
+        uint32_t src1_reg_type:4;
+        uint32_t src0_indirect_offset_9:1;
+      } ia16;
+
+      struct { /* three-source form: all of src0 plus the first part of src1; src1 continues in bits3.da3src */
+        uint32_t src0_rep_ctrl:1;
+        uint32_t src0_swizzle:8;
+        uint32_t src0_subreg_nr:3;
+        uint32_t src0_reg_nr:8;
+        uint32_t src0_subreg_nr_w:1;
+        uint32_t src1_rep_ctrl:1;
+        uint32_t src1_swizzle:8;
+        uint32_t src1_subreg_nr_low:2; /* low two bits only; bits3.da3src declares src1_subreg_nr_high:1 */
+      } da3src;
+
+      struct { /* whole dword used as one 32-bit branch field */
+        uint32_t uip:32; /* bits3.gen8_branch holds the matching jip:32 */
+      } gen8_branch;
+    } bits2;
+
+    union { /* bits3: alternative views of one 32-bit instruction dword -- src1 operand layouts, message descriptors, a branch offset, or a raw value; every struct below declares exactly 32 bits */
+      struct { /* src1 by register number/sub-register; same widths as the src0 fields of bits2.da1, with the last 7 bits left as pad0 */
+        uint32_t src1_subreg_nr:5;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad0:7;
+      } da1;
+
+      struct { /* swizzled src1 by register number; same widths as the src0 fields of bits2.da16 */
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        uint32_t src1_subreg_nr:1;
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        uint32_t pad2:7;
+      } da16;
+
+      struct { /* src1 given as offset + 4-bit sub-register instead of reg_nr (presumably address-register indirect -- TODO confirm) */
+        int  src1_indirect_offset:9;
+        uint32_t src1_subreg_nr:4;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_horiz_stride:2;
+        uint32_t src1_width:3;
+        uint32_t src1_vert_stride:4;
+        int  src1_indirect_offset_9:1; /* NOTE(review): signed 1-bit field; bits2 declares src0_indirect_offset_9 as uint32_t -- confirm which is intended */
+        uint32_t pad1:6;
+      } ia1;
+
+      struct { /* swizzled src1 given as offset + sub-register; mirrors da16 above with reg_nr replaced by the offset fields */
+        uint32_t src1_swz_x:2;
+        uint32_t src1_swz_y:2;
+        int  src1_indirect_offset:5;
+        uint32_t src1_subreg_nr:4;
+        uint32_t src1_abs:1;
+        uint32_t src1_negate:1;
+        uint32_t src1_address_mode:1;
+        uint32_t src1_swz_z:2;
+        uint32_t src1_swz_w:2;
+        uint32_t pad1:1;
+        uint32_t src1_vert_stride:4;
+        int  src1_indirect_offset_9:1; /* NOTE(review): signed 1-bit field, see the same note on ia1 */
+        uint32_t pad2:6;
+      } ia16;
+
+      struct { /* generic message view: 19 function-specific bits, then a 13-bit tail (header flag, response_length:5, msg_length:4, 2 pad bits, end_of_thread) that every message struct below repeats with the same widths */
+        uint32_t function_control:19;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } generic_gen5;
+
+      struct { /* NOTE(review): field widths are identical to gen7_msg_gw further down, only the field names differ -- possible duplicate */
+        uint32_t sub_function_id:3;
+        uint32_t pad0:11;
+        uint32_t ack_req:1;
+        uint32_t notify:2;
+        uint32_t pad1:2;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } msg_gateway;
+
+      struct { /* presumably the thread-spawner message, judging by the name -- TODO confirm; 19 function bits + common tail */
+        uint32_t opcode:1;
+        uint32_t request:1;
+        uint32_t pad0:2;
+        uint32_t resource:1;
+        uint32_t pad1:14;
+        uint32_t header:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } spawner_gen5;
+
+      /** Ironlake PRM, Volume 4 Part 1, Section 6.1.1.1 */
+      struct {
+        uint32_t function:4;
+        uint32_t int_type:1;
+        uint32_t precision:1;
+        uint32_t saturate:1;
+        uint32_t data_type:1;
+        uint32_t snapshot:1;
+        uint32_t pad0:10;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } math_gen5;
+
+      struct { /* sampler message: bti, sampler, msg_type and simd_mode fill the 19 function bits, followed by the common tail */
+        uint32_t bti:8;
+        uint32_t sampler:4;
+        uint32_t msg_type:5;
+        uint32_t simd_mode:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } sampler_gen7;
+
+      /**
+       * Message for the Sandybridge Sampler Cache or Constant Cache Data Port.
+       *
+       * See the Sandybridge PRM, Volume 4 Part 1, Section 3.9.2.1.1.
+       **/
+      struct {
+        uint32_t bti:8;
+        uint32_t msg_control:5;
+        uint32_t msg_type:3;
+        uint32_t pad0:3;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad1:2;
+        uint32_t end_of_thread:1;
+      } gen6_dp_sampler_const_cache;
+
+      /*! Data port untyped read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t rgba:4;
+        uint32_t simd_mode:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_untyped_rw;
+
+      /*! Data port byte scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t simd_mode:1;
+        uint32_t ignored0:1;
+        uint32_t data_size:2;
+        uint32_t ignored1:2;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_byte_rw;
+
+      /*! Data port scratch read / write */
+      struct {
+        uint32_t offset:12; /* this view has no bti field: the low bits hold a 12-bit offset instead */
+        uint32_t block_size:2;
+        uint32_t ignored0:1;
+        uint32_t invalidate_after_read:1;
+        uint32_t channel_mode:1;
+        uint32_t msg_type:1;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_scratch_rw;
+
+      /*! Data port OBlock read / write */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:3;
+        uint32_t ignored:2;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_oblock_rw;
+
+      /*! Data port dword scatter / gather */
+      struct {
+        uint32_t bti:8;
+        uint32_t block_size:2;
+        uint32_t ignored0:3;
+        uint32_t invalidate_after_read:1;
+        uint32_t msg_type:4;
+        uint32_t ignored1:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad2:2;
+        uint32_t end_of_thread:1;
+      } gen7_dword_rw;
+
+      /*! Data port typed read / write messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t chan_mask:4;
+        uint32_t slot:2;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_typed_rw;
+
+      /*! Memory fence */
+      struct {
+        uint32_t bti:8;
+        uint32_t pad:5;
+        uint32_t commit_enable:1;
+        uint32_t msg_type:4;
+        uint32_t pad2:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_memory_fence;
+
+      /*! atomic messages */
+      struct {
+        uint32_t bti:8;
+        uint32_t aop_type:4;
+        uint32_t simd_mode:1;
+        uint32_t return_data:1;
+        uint32_t msg_type:4;
+        uint32_t category:1;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_atomic_op;
+
+      struct { /* three-source form, continued from bits2.da3src: the rest of src1, then all of src2 */
+        uint32_t src1_subreg_nr_high:1; /* high bit; bits2.da3src declares src1_subreg_nr_low:2 */
+        uint32_t src1_reg_nr:8;
+        uint32_t src1_subreg_nr_w:1;
+        uint32_t src2_rep_ctrl:1;
+        uint32_t src2_swizzle:8;
+        uint32_t src2_subreg_nr:3;
+        uint32_t src2_reg_nr:8;
+        uint32_t src2_subreg_nr_w:1;
+        uint32_t pad:1;
+      } da3src;
+
+      /*! Message gateway */
+      struct {
+        uint32_t subfunc:3;
+        uint32_t pad:11;
+        uint32_t ackreq:1;
+        uint32_t notify:2;
+        uint32_t pad2:2;
+        uint32_t header_present:1;
+        uint32_t response_length:5;
+        uint32_t msg_length:4;
+        uint32_t pad3:2;
+        uint32_t end_of_thread:1;
+      } gen7_msg_gw;
+
+      struct { /* whole dword used as one 32-bit branch field */
+        uint32_t jip:32; /* bits2.gen8_branch holds the matching uip:32 */
+      } gen8_branch;
+
+      int d; /* d / ud / f: the dword as a plain signed, unsigned or float value (presumably an immediate operand -- TODO confirm) */
+      uint32_t ud;
+      float f;
+    } bits3;
+  };
+};
+#endif
diff --git a/backend/src/backend/gen_context.cpp b/backend/src/backend/gen_context.cpp
index 4f697ef..a473451 100644
--- a/backend/src/backend/gen_context.cpp
+++ b/backend/src/backend/gen_context.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -50,6 +50,7 @@ namespace gbe
     this->sel = NULL;
     this->ra = NULL;
     this->ifEndifFix = false;
+    this->regSpillTick = 0;
   }
 
   GenContext::~GenContext(void) {
@@ -72,6 +73,7 @@ namespace gbe
     this->branchPos3.clear();
     this->labelPos.clear();
     this->errCode = NO_ERROR;
+    this->regSpillTick = 0;
   }
 
   void GenContext::newSelection(void) {
@@ -113,7 +115,7 @@ namespace gbe
       const LabelIndex label = pair.first;
       const int32_t insnID = pair.second;
       const int32_t targetID = labelPos.find(label)->second;
-      p->patchJMPI(insnID, (targetID - insnID));
+      p->patchJMPI(insnID, (targetID - insnID), 0);
     }
     for (auto pair : branchPos3) {
       const LabelPair labelPair = pair.first;
@@ -126,7 +128,7 @@ namespace gbe
         errCode = OUT_OF_RANGE_IF_ENDIF; 
         return false;
       }
-      p->patchJMPI(insnID, (((uip - insnID)) << 16) | ((jip - insnID)));
+      p->patchJMPI(insnID, jip - insnID, uip - insnID);
     }
     return true;
   }
@@ -200,8 +202,10 @@ namespace gbe
     const GenRegister src = ra->genReg(insn.src(0));
     switch (insn.opcode) {
       case SEL_OP_MOV: p->MOV(dst, src, insn.extra.function); break;
+      case SEL_OP_READ_ARF: p->MOV(dst, src); break;
       case SEL_OP_FBH: p->FBH(dst, src); break;
       case SEL_OP_FBL: p->FBL(dst, src); break;
+      case SEL_OP_CBIT: p->CBIT(dst, src); break;
       case SEL_OP_NOT: p->NOT(dst, src); break;
       case SEL_OP_RNDD: p->RNDD(dst, src); break;
       case SEL_OP_RNDU: p->RNDU(dst, src); break;
@@ -241,6 +245,27 @@ namespace gbe
           p->IF(src);
         }
         break;
+      case SEL_OP_ELSE:
+        {
+          insertJumpPos(insn);
+          /*
+          const ir::LabelIndex label(insn.index), label1(insn.index);
+          const LabelPair labelPair(label, label1);
+          const GenRegister src = ra->genReg(insn.src(0));
+          this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));*/
+          p->ELSE(src);
+        }
+        break;
+      case SEL_OP_WHILE:
+        {
+          /*const ir::LabelIndex label0(insn.index), label1(insn.index1);
+          const LabelPair labelPair(label0, label1);
+          const GenRegister src = ra->genReg(insn.src(0));
+          this->branchPos3.push_back(std::make_pair(labelPair, p->store.size()));*/
+          insertJumpPos(insn);
+          p->WHILE(src);
+        }
+        break;
       default: NOT_IMPLEMENTED;
     }
   }
@@ -322,7 +347,8 @@ namespace gbe
           p->push();
           p->curr.predicate = GEN_PREDICATE_NONE;
           p->curr.noMask = 1;
-          p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0, src1);
+          p->MUL(GenRegister::retype(GenRegister::acc(), GEN_TYPE_UD), src0,
+                     GenRegister::h2(GenRegister::retype(src1, GEN_TYPE_UW)));
           p->curr.accWrEnable = 1;
           p->MACH(tmp, src0, src1);
           p->pop();
@@ -416,8 +442,6 @@ namespace gbe
       case SEL_OP_ADD:  p->ADD(dst, src0, src1); break;
       case SEL_OP_MUL:  p->MUL(dst, src0, src1); break;
       case SEL_OP_MACH: p->MACH(dst, src0, src1); break;
-      case SEL_OP_UPSAMPLE_SHORT: p->UPSAMPLE_SHORT(dst, src0, src1); break;
-      case SEL_OP_UPSAMPLE_INT: p->UPSAMPLE_INT(dst, src0, src1); break;
       case SEL_OP_UPSAMPLE_LONG:
         {
           GenRegister xdst = GenRegister::retype(dst, GEN_TYPE_UL),
@@ -739,14 +763,14 @@ namespace gbe
         p->SHL(c, e, a);
         p->SHL(d, f, a);
         p->OR(e, d, b);
-        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        setFlag(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, e);
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
-        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        setFlag(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
@@ -767,14 +791,14 @@ namespace gbe
         p->SHR(c, f, a);
         p->SHR(d, e, a);
         p->OR(e, d, b);
-        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        setFlag(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
         p->SEL(d, d, e);
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
-        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        setFlag(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
@@ -796,7 +820,7 @@ namespace gbe
         p->ASR(c, f, a);
         p->SHR(d, e, a);
         p->OR(e, d, b);
-        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        setFlag(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
@@ -804,7 +828,7 @@ namespace gbe
         p->curr.predicate = GEN_PREDICATE_NONE;
         p->AND(a, a, GenRegister::immud(32));
         p->ASR(f, f, GenRegister::immd(31));
-        p->MOV(flagReg, GenRegister::immuw(0xFFFF));
+        setFlag(flagReg, GenRegister::immuw(0xFFFF));
         p->curr.predicate = GEN_PREDICATE_NORMAL;
         p->curr.useFlag(flagReg.flag_nr(), flagReg.flag_subnr());
         p->CMP(GEN_CONDITIONAL_Z, a, zero);
@@ -818,6 +842,14 @@ namespace gbe
         NOT_IMPLEMENTED;
     }
   }
+  void GenContext::setFlag(GenRegister flagReg, GenRegister src) { /* Emit a MOV of src into flagReg under the temporary encoder state set below; callers in this file pass immuw(0xFFFF) */
+    p->push(); /* paired with p->pop() below; presumably saves p->curr so the overrides stay local to this MOV -- confirm in the encoder */
+    p->curr.noMask = 1; /* emit the MOV with noMask set */
+    p->curr.execWidth = 1; /* a single-channel MOV, regardless of the width in effect at the call site */
+    p->curr.predicate = GEN_PREDICATE_NONE; /* clear any predicate inherited from the caller's state */
+    p->MOV(flagReg, src);
+    p->pop();
+  }
 
   void GenContext::saveFlag(GenRegister dest, int flag, int subFlag) {
     p->push();
@@ -904,7 +936,7 @@ namespace gbe
       p->SHL(high, low, tmp);
       p->MOV(low, GenRegister::immud(0));
 
-      p->patchJMPI(jip1, (p->n_instruction() - jip1) );
+      p->patchJMPI(jip1, (p->n_instruction() - jip1), 0);
       p->curr.predicate = GEN_PREDICATE_NONE;
       p->CMP(GEN_CONDITIONAL_LE, exp, GenRegister::immud(31));  //update dst where high != 0
       p->curr.predicate = GEN_PREDICATE_NORMAL;
@@ -918,7 +950,7 @@ namespace gbe
       p->CMP(GEN_CONDITIONAL_EQ, high, GenRegister::immud(0x80000000));
       p->CMP(GEN_CONDITIONAL_EQ, low, GenRegister::immud(0x0));
       p->AND(dst_ud, dst_ud, GenRegister::immud(0xfffffffe));
-      p->patchJMPI(jip0, (p->n_instruction() - jip0));
+      p->patchJMPI(jip0, (p->n_instruction() - jip0), 0);
 
     p->pop();
 
@@ -1238,7 +1270,7 @@ namespace gbe
     p->push();
     p->curr.execWidth = 8;
     for(int i = 0; i < execWidth; i += 8) {
-      p->MUL(acc, src0, src1);
+      p->MUL(acc, src0, GenRegister::h2(GenRegister::retype(src1, GEN_TYPE_UW)));
       p->curr.accWrEnable = 1;
       p->MACH(high, src0, src1);
       p->curr.accWrEnable = 0;
@@ -1403,7 +1435,7 @@ namespace gbe
       p->curr.noMask = 1;
       jip0 = p->n_instruction();
       p->JMPI(zero);
-      p->patchJMPI(jip0, distance);
+      p->patchJMPI(jip0, distance, 0);
       p->pop();
       // end of loop
     }
@@ -1680,7 +1712,7 @@ namespace gbe
   void GenContext::emitUnpackByteInstruction(const SelectionInstruction &insn) {
     const GenRegister src = ra->genReg(insn.src(0));
     for(uint32_t i = 0; i < insn.dstNum; i++) {
-      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.dstNum, i));
+      p->MOV(ra->genReg(insn.dst(i)), GenRegister::splitReg(src, insn.extra.elem, i));
     }
   }
 
@@ -1689,12 +1721,12 @@ namespace gbe
     p->push();
     if(simdWidth == 8) {
       for(uint32_t i = 0; i < insn.srcNum; i++)
-        p->MOV(GenRegister::splitReg(dst, insn.srcNum, i), ra->genReg(insn.src(i)));
+        p->MOV(GenRegister::splitReg(dst, insn.extra.elem, i), ra->genReg(insn.src(i)));
     } else {
       // when destination expands two registers, the source must span two registers.
       p->curr.execWidth = 8;
       for(uint32_t i = 0; i < insn.srcNum; i++) {
-        GenRegister dsti = GenRegister::splitReg(dst, insn.srcNum, i);
+        GenRegister dsti = GenRegister::splitReg(dst, insn.extra.elem, i);
         GenRegister src = ra->genReg(insn.src(i));
 
         p->curr.quarterControl = 0;
@@ -1770,7 +1802,7 @@ namespace gbe
   }
 
   void GenContext::buildPatchList(void) {
-    const uint32_t ptrSize = unit.getPointerSize() == ir::POINTER_32_BITS ? 4u : 8u;
+    const uint32_t ptrSize = this->getPointerSize();
     kernel->curbeSize = 0u;
     auto &stackUse = dag->getUse(ir::ocl::stackptr);
 
@@ -1792,12 +1824,13 @@ namespace gbe
       // For pointers and values, we have nothing to do. We just push the values
       if (arg.type == ir::FunctionArgument::GLOBAL_POINTER ||
           arg.type == ir::FunctionArgument::LOCAL_POINTER ||
-          arg.type == ir::FunctionArgument::CONSTANT_POINTER ||
-          arg.type == ir::FunctionArgument::VALUE ||
+          arg.type == ir::FunctionArgument::CONSTANT_POINTER)
+        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, ptrSize, ptrSize));
+      if (arg.type == ir::FunctionArgument::VALUE ||
           arg.type == ir::FunctionArgument::STRUCTURE ||
           arg.type == ir::FunctionArgument::IMAGE ||
           arg.type == ir::FunctionArgument::SAMPLER)
-        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, ptrSize));
+        this->insertCurbeReg(arg.reg, this->newCurbeEntry(GBE_CURBE_KERNEL_ARGUMENT, argID, arg.size, arg.size));
     }
 
     // Go over all the instructions and find the special register we need
@@ -1879,13 +1912,17 @@ namespace gbe
       std::cout << genKernel->getName() << "'s disassemble begin:" << std::endl;
       ir::LabelIndex curLabel = (ir::LabelIndex)0;
       GenCompactInstruction * pCom = NULL;
-      GenNativeInstruction insn;
+      GenInstruction insn[2];
       std::cout << "  L0:" << std::endl;
       for (uint32_t insnID = 0; insnID < genKernel->insnNum; ) {
         if (labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID &&
             curLabel < this->getFunction().labelNum()) {
           std::cout << "  L" << curLabel + 1 << ":" << std::endl;
           curLabel = (ir::LabelIndex)(curLabel + 1);
+          while(labelPos.find((ir::LabelIndex)(curLabel + 1))->second == insnID) {
+            std::cout << "  L" << curLabel + 1 << ":" << std::endl;
+            curLabel = (ir::LabelIndex)(curLabel + 1);
+          }
         }
         std::cout << "    (" << std::setw(8) << insnID << ")  ";
         pCom = (GenCompactInstruction*)&p->store[insnID];
diff --git a/backend/src/backend/gen_context.hpp b/backend/src/backend/gen_context.hpp
index 02c83d0..41489a0 100644
--- a/backend/src/backend/gen_context.hpp
+++ b/backend/src/backend/gen_context.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -26,7 +26,7 @@
 #define __GBE_GEN_CONTEXT_HPP__
 
 #include "backend/context.hpp"
-#include "backend/gen_encoder.hpp"
+#include "backend/gen7_encoder.hpp"
 #include "backend/program.h"
 #include "backend/gen_register.hpp"
 #include "ir/function.hpp"
@@ -76,6 +76,8 @@ namespace gbe
     virtual uint32_t alignScratchSize(uint32_t size);
     /*! Get the device's max srcatch size */
     virtual uint32_t getScratchSize(void) { return GEN7_SCRATCH_SIZE; }
+    /*! Size in bytes of a pointer kernel argument, used when allocating its curbe entry; this base implementation returns 4 and is virtual so it can be overridden */
+    virtual uint32_t getPointerSize(void) { return 4; }
     /*! Function we emit code for */
     INLINE const ir::Function &getFunction(void) const { return fn; }
     /*! Simd width chosen for the current function */
@@ -116,6 +118,7 @@ namespace gbe
     void I64FullAdd(GenRegister high1, GenRegister low1, GenRegister high2, GenRegister low2);
     void I32FullMult(GenRegister high, GenRegister low, GenRegister src0, GenRegister src1);
     void I64FullMult(GenRegister dst1, GenRegister dst2, GenRegister dst3, GenRegister dst4, GenRegister x_high, GenRegister x_low, GenRegister y_high, GenRegister y_low);
+    void setFlag(GenRegister flag, GenRegister src);
     void saveFlag(GenRegister dest, int flag, int subFlag);
     void UnsignedI64ToFloat(GenRegister dst, GenRegister high, GenRegister low, GenRegister exp, GenRegister mantissa, GenRegister tmp, GenRegister flag);
 
@@ -192,13 +195,13 @@ namespace gbe
     uint32_t reservedSpillRegs;
     bool limitRegisterPressure;
     bool relaxMath;
-    const bool getIFENDIFFix(void) const { return ifEndifFix; }
+    bool getIFENDIFFix(void) const { return ifEndifFix; }
     void setIFENDIFFix(bool fix) { ifEndifFix = fix; }
-    const CompileErrorCode getErrCode() { return errCode; }
+    CompileErrorCode getErrCode() { return errCode; }
 
   protected:
     virtual GenEncoder* generateEncoder(void) {
-      return GBE_NEW(GenEncoder, this->simdWidth, 7, deviceID);
+      return GBE_NEW(Gen7Encoder, this->simdWidth, 7, deviceID);
     }
     /*! allocate a new curbe register and insert to curbe pool. */
     void allocCurbeReg(ir::Register reg, gbe_curbe_type value, uint32_t subValue = 0);
@@ -206,6 +209,7 @@ namespace gbe
   private:
     CompileErrorCode errCode;
     bool ifEndifFix;
+    uint32_t regSpillTick;
     /*! Build the curbe patch list for the given kernel */
     void buildPatchList(void);
     /*! Calc the group's slm offset from R0.0, to work around HSW SLM bug*/
diff --git a/backend/src/backend/gen_defs.hpp b/backend/src/backend/gen_defs.hpp
index f0da50a..cd6b7c8 100644
--- a/backend/src/backend/gen_defs.hpp
+++ b/backend/src/backend/gen_defs.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -52,6 +52,8 @@
 #define __GEN_DEFS_HPP__
 
 #include <stdint.h>
+#include "backend/gen7_instruction.hpp"
+#include "backend/gen8_instruction.hpp"
 
 /////////////////////////////////////////////////////////////////////////////
 // Gen EU defines
@@ -159,6 +161,7 @@ enum opcode {
   GEN_OPCODE_LZD = 74,
   GEN_OPCODE_FBH = 75,
   GEN_OPCODE_FBL = 76,
+  GEN_OPCODE_CBIT = 77,
   GEN_OPCODE_ADDC = 78,
   GEN_OPCODE_SUBB = 79,
   GEN_OPCODE_SAD2 = 80,
@@ -199,18 +202,18 @@ enum GenAtomicOpCode {
 /*! Gen SFID */
 enum GenMessageTarget {
   GEN_SFID_NULL                     = 0,
-  GEN_SFID_MATH                     = 1,
+  GEN_SFID_RESERVED                 = 1,
   GEN_SFID_SAMPLER                  = 2,
   GEN_SFID_MESSAGE_GATEWAY          = 3,
-  GEN_SFID_DATAPORT_READ            = 4,
-  GEN_SFID_DATAPORT_WRITE           = 5,
+  GEN_SFID_DATAPORT_SAMPLER         = 4,
+  GEN_SFID_DATAPORT_RENDER          = 5,
   GEN_SFID_URB                      = 6,
   GEN_SFID_THREAD_SPAWNER           = 7,
-  GEN6_SFID_DATAPORT_SAMPLER_CACHE  = 4,
-  GEN6_SFID_DATAPORT_RENDER_CACHE   = 5,
-  GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
-  GEN_SFID_DATAPORT_DATA_CACHE      = 10,
-  GEN_SFID_DATAPORT1_DATA_CACHE     = 12,
+  GEN_SFID_VIDEO_MOTION_EST         = 8,
+  GEN_SFID_DATAPORT_CONSTANT        = 9,
+  GEN_SFID_DATAPORT_DATA            = 10,
+  GEN_SFID_PIXEL_INTERPOLATOR       = 11,
+  GEN_SFID_DATAPORT1_DATA           = 12, /* New for HSW and BDW. */
 };
 
 #define GEN_PREDICATE_NONE                    0
@@ -243,12 +246,12 @@ enum GenMessageTarget {
 #define GEN_TYPE_UB  4
 #define GEN_TYPE_B   5
 #define GEN_TYPE_VF  5 /* packed float vector, immediates only? */
-#define GEN_TYPE_HF  6
 #define GEN_TYPE_V   6 /* packed int vector, immediates only, uword dest only */
 #define GEN_TYPE_DF  6
 #define GEN_TYPE_F   7
 #define GEN_TYPE_UL  8
 #define GEN_TYPE_L   9
+#define GEN_TYPE_HF  10
 
 #define GEN_ARF_NULL                  0x00
 #define GEN_ARF_ADDRESS               0x10
@@ -261,6 +264,7 @@ enum GenMessageTarget {
 #define GEN_ARF_CONTROL               0x80
 #define GEN_ARF_NOTIFICATION_COUNT    0x90
 #define GEN_ARF_IP                    0xA0
+#define GEN_ARF_TM                    0xC0
 
 #define GEN_MRF_COMPR4   (1 << 7)
 
@@ -428,6 +432,8 @@ enum GenMessageTarget {
 #define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
 #define GEN_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
 #define GEN_MATH_FUNCTION_INT_DIV_REMAINDER                13
+#define GEN8_MATH_FUNCTION_INVM                            14
+#define GEN8_MATH_FUNCTION_RSQRTM                          15
 
 #define GEN_MATH_INTEGER_UNSIGNED     0
 #define GEN_MATH_INTEGER_SIGNED       1
@@ -500,13 +506,16 @@ union GenNativeInstruction
     struct GenInstruction low;
     struct GenInstruction high;
   };
+  union Gen7NativeInstruction gen7_insn;
+  union Gen8NativeInstruction gen8_insn;
+
+  //Gen7 & Gen8 common field
   struct {
     struct {
       uint32_t opcode:7;
       uint32_t pad:1;
       uint32_t access_mode:1;
-      uint32_t mask_control:1;
-      uint32_t dependency_control:2;
+      uint32_t pad1:3;
       uint32_t quarter_control:2;
       uint32_t thread_control:2;
       uint32_t predicate_control:4;
@@ -519,222 +528,16 @@ union GenNativeInstruction
       uint32_t saturate:1;
     } header;
 
-    union {
-      struct {
-        uint32_t dest_reg_file:2;
-        uint32_t dest_reg_type:3;
-        uint32_t src0_reg_file:2;
-        uint32_t src0_reg_type:3;
-        uint32_t src1_reg_file:2;
-        uint32_t src1_reg_type:3;
-        uint32_t nib_ctrl:1;
-        uint32_t dest_subreg_nr:5;
-        uint32_t dest_reg_nr:8;
-        uint32_t dest_horiz_stride:2;
-        uint32_t dest_address_mode:1;
-      } da1;
-
-      struct {
-        uint32_t dest_reg_file:2;
-        uint32_t dest_reg_type:3;
-        uint32_t src0_reg_file:2;
-        uint32_t src0_reg_type:3;
-        uint32_t src1_reg_file:2;        /* 0x00000c00 */
-        uint32_t src1_reg_type:3;        /* 0x00007000 */
-        uint32_t nib_ctrl:1;
-        int dest_indirect_offset:10;        /* offset against the deref'd address reg */
-        uint32_t dest_subreg_nr:3; /* subnr for the address reg a0.x */
-        uint32_t dest_horiz_stride:2;
-        uint32_t dest_address_mode:1;
-      } ia1;
-
-      struct {
-        uint32_t dest_reg_file:2;
-        uint32_t dest_reg_type:3;
-        uint32_t src0_reg_file:2;
-        uint32_t src0_reg_type:3;
-        uint32_t src1_reg_file:2;
-        uint32_t src1_reg_type:3;
-        uint32_t nib_ctrl:1;
-        uint32_t dest_writemask:4;
-        uint32_t dest_subreg_nr:1;
-        uint32_t dest_reg_nr:8;
-        uint32_t dest_horiz_stride:2;
-        uint32_t dest_address_mode:1;
-      } da16;
-
-      struct {
-        uint32_t dest_reg_file:2;
-        uint32_t dest_reg_type:3;
-        uint32_t src0_reg_file:2;
-        uint32_t src0_reg_type:3;
-        uint32_t nib_ctrl:1;
-        uint32_t dest_writemask:4;
-        int dest_indirect_offset:6;
-        uint32_t dest_subreg_nr:3;
-        uint32_t dest_horiz_stride:2;
-        uint32_t dest_address_mode:1;
-      } ia16;
-
-      struct {
-        uint32_t dest_reg_file:2;
-        uint32_t dest_reg_type:3;
-        uint32_t src0_reg_file:2;
-        uint32_t src0_reg_type:3;
-        uint32_t src1_reg_file:2;
-        uint32_t src1_reg_type:3;
-        uint32_t pad:1;
-        int jump_count:16;
-      } branch_gen6;
-
-      struct {
-        uint32_t dest_reg_file:1;
-        uint32_t flag_subreg_num:1;
-        uint32_t pad0:2;
-        uint32_t src0_abs:1;
-        uint32_t src0_negate:1;
-        uint32_t src1_abs:1;
-        uint32_t src1_negate:1;
-        uint32_t src2_abs:1;
-        uint32_t src2_negate:1;
-        uint32_t pad1:7;
-        uint32_t dest_writemask:4;
-        uint32_t dest_subreg_nr:3;
-        uint32_t dest_reg_nr:8;
-      } da3src;
+    struct {
+      uint32_t pad1:32;
     } bits1;
 
-    union {
-      struct {
-        uint32_t src0_subreg_nr:5;
-        uint32_t src0_reg_nr:8;
-        uint32_t src0_abs:1;
-        uint32_t src0_negate:1;
-        uint32_t src0_address_mode:1;
-        uint32_t src0_horiz_stride:2;
-        uint32_t src0_width:3;
-        uint32_t src0_vert_stride:4;
-        uint32_t flag_sub_reg_nr:1;
-        uint32_t flag_reg_nr:1;
-        uint32_t pad:5;
-      } da1;
-
-      struct {
-        int src0_indirect_offset:10;
-        uint32_t src0_subreg_nr:3;
-        uint32_t src0_abs:1;
-        uint32_t src0_negate:1;
-        uint32_t src0_address_mode:1;
-        uint32_t src0_horiz_stride:2;
-        uint32_t src0_width:3;
-        uint32_t src0_vert_stride:4;
-        uint32_t flag_sub_reg_nr:1;
-        uint32_t flag_reg_nr:1;
-        uint32_t pad:5;
-      } ia1;
-
-      struct {
-        uint32_t src0_swz_x:2;
-        uint32_t src0_swz_y:2;
-        uint32_t src0_subreg_nr:1;
-        uint32_t src0_reg_nr:8;
-        uint32_t src0_abs:1;
-        uint32_t src0_negate:1;
-        uint32_t src0_address_mode:1;
-        uint32_t src0_swz_z:2;
-        uint32_t src0_swz_w:2;
-        uint32_t pad0:1;
-        uint32_t src0_vert_stride:4;
-        uint32_t flag_sub_reg_nr:1;
-        uint32_t flag_reg_nr:1;
-        uint32_t pad:5;
-      } da16;
-
-      struct {
-        uint32_t src0_swz_x:2;
-        uint32_t src0_swz_y:2;
-        int src0_indirect_offset:6;
-        uint32_t src0_subreg_nr:3;
-        uint32_t src0_abs:1;
-        uint32_t src0_negate:1;
-        uint32_t src0_address_mode:1;
-        uint32_t src0_swz_z:2;
-        uint32_t src0_swz_w:2;
-        uint32_t pad0:1;
-        uint32_t src0_vert_stride:4;
-        uint32_t flag_sub_reg_nr:1;
-        uint32_t flag_reg_nr:1;
-        uint32_t pad:5;
-      } ia16;
-
-      struct {
-        uint32_t src0_rep_ctrl:1;
-        uint32_t src0_swizzle:8;
-        uint32_t src0_subreg_nr:3;
-        uint32_t src0_reg_nr:8;
-        uint32_t pad0:1;
-        uint32_t src1_rep_ctrl:1;
-        uint32_t src1_swizzle:8;
-        uint32_t src1_subreg_nr_low:2;
-      } da3src;
+    struct {
+      uint32_t pad2:32;
     } bits2;
 
     union {
       struct {
-        uint32_t src1_subreg_nr:5;
-        uint32_t src1_reg_nr:8;
-        uint32_t src1_abs:1;
-        uint32_t src1_negate:1;
-        uint32_t src1_address_mode:1;
-        uint32_t src1_horiz_stride:2;
-        uint32_t src1_width:3;
-        uint32_t src1_vert_stride:4;
-        uint32_t pad0:7;
-      } da1;
-
-      struct {
-        uint32_t src1_swz_x:2;
-        uint32_t src1_swz_y:2;
-        uint32_t src1_subreg_nr:1;
-        uint32_t src1_reg_nr:8;
-        uint32_t src1_abs:1;
-        uint32_t src1_negate:1;
-        uint32_t src1_address_mode:1;
-        uint32_t src1_swz_z:2;
-        uint32_t src1_swz_w:2;
-        uint32_t pad1:1;
-        uint32_t src1_vert_stride:4;
-        uint32_t pad2:7;
-      } da16;
-
-      struct {
-        int  src1_indirect_offset:10;
-        uint32_t src1_subreg_nr:3;
-        uint32_t src1_abs:1;
-        uint32_t src1_negate:1;
-        uint32_t src1_address_mode:1;
-        uint32_t src1_horiz_stride:2;
-        uint32_t src1_width:3;
-        uint32_t src1_vert_stride:4;
-        uint32_t pad1:7;
-      } ia1;
-
-      struct {
-        uint32_t src1_swz_x:2;
-        uint32_t src1_swz_y:2;
-        int  src1_indirect_offset:6;
-        uint32_t src1_subreg_nr:3;
-        uint32_t src1_abs:1;
-        uint32_t src1_negate:1;
-        uint32_t pad0:1;
-        uint32_t src1_swz_z:2;
-        uint32_t src1_swz_w:2;
-        uint32_t pad1:1;
-        uint32_t src1_vert_stride:4;
-        uint32_t pad2:7;
-      } ia16;
-
-      struct {
         uint32_t function_control:19;
         uint32_t header_present:1;
         uint32_t response_length:5;
@@ -933,17 +736,6 @@ union GenNativeInstruction
         uint32_t end_of_thread:1;
       } gen7_atomic_op;
 
-      struct {
-        uint32_t src1_subreg_nr_high:1;
-        uint32_t src1_reg_nr:8;
-        uint32_t pad0:1;
-        uint32_t src2_rep_ctrl:1;
-        uint32_t src2_swizzle:8;
-        uint32_t src2_subreg_nr:3;
-        uint32_t src2_reg_nr:8;
-        uint32_t pad1:2;
-      } da3src;
-
       /*! Message gateway */
       struct {
         uint32_t subfunc:3;
@@ -959,9 +751,8 @@ union GenNativeInstruction
       } gen7_msg_gw;
 
       struct {
-        uint32_t jip:16;
-        uint32_t uip:16;
-      } gen7_branch;
+        uint32_t jip:32;
+      } gen8_branch;
 
       int d;
       uint32_t ud;
diff --git a/backend/src/backend/gen_encoder.cpp b/backend/src/backend/gen_encoder.cpp
index 182752a..e4df109 100644
--- a/backend/src/backend/gen_encoder.cpp
+++ b/backend/src/backend/gen_encoder.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -115,7 +115,7 @@ namespace gbe
                                         unsigned char msg_type, uint32_t msg_length,
                                         bool header_present)
   {
-    const GenMessageTarget sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_RENDER;
     setMessageDescriptor(insn, sfid, msg_length, 0, header_present);
     insn->bits3.gen7_typed_rw.bti = bti;
     insn->bits3.gen7_typed_rw.msg_type = msg_type;
@@ -125,7 +125,7 @@ namespace gbe
                                   uint32_t rgba, uint32_t msg_type,
                                   uint32_t msg_length, uint32_t response_length)
   {
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_untyped_rw.msg_type = msg_type;
     insn->bits3.gen7_untyped_rw.bti = bti;
@@ -146,7 +146,7 @@ namespace gbe
                                      uint32_t msg_length,
                                      uint32_t response_length)
   {
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_byte_rw.msg_type = msg_type;
     insn->bits3.gen7_byte_rw.bti = bti;
@@ -167,7 +167,7 @@ namespace gbe
                           uint32_t msg_length,
                           uint32_t response_length)
   {
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
     assert(size == 2 || size == 4);
     insn->bits3.gen7_oblock_rw.msg_type = msg_type;
@@ -208,7 +208,7 @@ namespace gbe
     // message causes a hang at unit test case compiler_global_constant.
     // We workaround it to use DATA CACHE instead.
     const GenMessageTarget sfid = (p->deviceID == PCI_CHIP_BAYTRAIL_T) ?
-                                 GEN_SFID_DATAPORT_DATA_CACHE : GEN6_SFID_DATAPORT_CONSTANT_CACHE;
+                                 GEN_SFID_DATAPORT_DATA : GEN_SFID_DATAPORT_CONSTANT;
     p->setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_dword_rw.msg_type = msg_type;
     insn->bits3.gen7_dword_rw.bti = bti;
@@ -241,139 +241,6 @@ namespace gbe
     curr = stack[--stateNum];
   }
 
-  void GenEncoder::setHeader(GenNativeInstruction *insn) {
-    if (this->curr.execWidth == 8)
-      insn->header.execution_size = GEN_WIDTH_8;
-    else if (this->curr.execWidth == 16)
-      insn->header.execution_size = GEN_WIDTH_16;
-    else if (this->curr.execWidth == 4)
-      insn->header.execution_size = GEN_WIDTH_4;
-    else if (this->curr.execWidth == 1)
-      insn->header.execution_size = GEN_WIDTH_1;
-    else
-      NOT_IMPLEMENTED;
-    insn->header.acc_wr_control = this->curr.accWrEnable;
-    insn->header.quarter_control = this->curr.quarterControl;
-    insn->bits1.ia1.nib_ctrl = this->curr.nibControl;
-    insn->header.mask_control = this->curr.noMask;
-    insn->bits2.ia1.flag_reg_nr = this->curr.flag;
-    insn->bits2.ia1.flag_sub_reg_nr = this->curr.subFlag;
-    if (this->curr.predicate != GEN_PREDICATE_NONE) {
-      insn->header.predicate_control = this->curr.predicate;
-      insn->header.predicate_inverse = this->curr.inversePredicate;
-    }
-    insn->header.saturate = this->curr.saturate;
-  }
-
-  void GenEncoder::setDst(GenNativeInstruction *insn, GenRegister dest) {
-     if (dest.file != GEN_ARCHITECTURE_REGISTER_FILE)
-        assert(dest.nr < 128);
-
-     insn->bits1.da1.dest_reg_file = dest.file;
-     insn->bits1.da1.dest_reg_type = dest.type;
-     insn->bits1.da1.dest_address_mode = dest.address_mode;
-     insn->bits1.da1.dest_reg_nr = dest.nr;
-     insn->bits1.da1.dest_subreg_nr = dest.subnr;
-     if (dest.hstride == GEN_HORIZONTAL_STRIDE_0) {
-       if (dest.type == GEN_TYPE_UB || dest.type == GEN_TYPE_B)
-         dest.hstride = GEN_HORIZONTAL_STRIDE_4;
-       else if (dest.type == GEN_TYPE_UW || dest.type == GEN_TYPE_W)
-         dest.hstride = GEN_HORIZONTAL_STRIDE_2;
-       else
-         dest.hstride = GEN_HORIZONTAL_STRIDE_1;
-     }
-     insn->bits1.da1.dest_horiz_stride = dest.hstride;
-  }
-
-  void GenEncoder::setSrc0(GenNativeInstruction *insn, GenRegister reg) {
-     if (reg.file != GEN_ARCHITECTURE_REGISTER_FILE)
-        assert(reg.nr < 128);
-
-     if (reg.address_mode == GEN_ADDRESS_DIRECT) {
-       insn->bits1.da1.src0_reg_file = reg.file;
-       insn->bits1.da1.src0_reg_type = reg.type;
-       insn->bits2.da1.src0_abs = reg.absolute;
-       insn->bits2.da1.src0_negate = reg.negation;
-       insn->bits2.da1.src0_address_mode = reg.address_mode;
-
-       if (reg.file == GEN_IMMEDIATE_VALUE) {
-          insn->bits3.ud = reg.value.ud;
-
-          /* Required to set some fields in src1 as well: */
-          insn->bits1.da1.src1_reg_file = 0; /* arf */
-          insn->bits1.da1.src1_reg_type = reg.type;
-       }
-       else {
-         if (insn->header.access_mode == GEN_ALIGN_1) {
-           insn->bits2.da1.src0_subreg_nr = reg.subnr;
-           insn->bits2.da1.src0_reg_nr = reg.nr;
-         } else {
-           insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
-           insn->bits2.da16.src0_reg_nr = reg.nr;
-         }
-
-         if (reg.width == GEN_WIDTH_1 &&
-             insn->header.execution_size == GEN_WIDTH_1) {
-           insn->bits2.da1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-           insn->bits2.da1.src0_width = GEN_WIDTH_1;
-           insn->bits2.da1.src0_vert_stride = GEN_VERTICAL_STRIDE_0;
-         }
-         else {
-           insn->bits2.da1.src0_horiz_stride = reg.hstride;
-           insn->bits2.da1.src0_width = reg.width;
-           insn->bits2.da1.src0_vert_stride = reg.vstride;
-         }
-       }
-    } else {
-       insn->bits1.ia1.src0_reg_file = GEN_GENERAL_REGISTER_FILE;
-       insn->bits1.ia1.src0_reg_type = reg.type;
-       insn->bits2.ia1.src0_subreg_nr = 0;
-       insn->bits2.ia1.src0_indirect_offset = 0;
-       insn->bits2.ia1.src0_abs = 0;
-       insn->bits2.ia1.src0_negate = 0;
-       insn->bits2.ia1.src0_address_mode = reg.address_mode;
-       insn->bits2.ia1.src0_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-       insn->bits2.ia1.src0_width = GEN_WIDTH_1;
-       insn->bits2.ia1.src0_vert_stride = GEN_VERTICAL_STRIDE_ONE_DIMENSIONAL;
-    }
-  }
-
-  void GenEncoder::setSrc1(GenNativeInstruction *insn, GenRegister reg) {
-     assert(reg.nr < 128);
-     assert(reg.file != GEN_ARCHITECTURE_REGISTER_FILE || reg.nr == 0);
-
-     insn->bits1.da1.src1_reg_file = reg.file;
-     insn->bits1.da1.src1_reg_type = reg.type;
-     insn->bits3.da1.src1_abs = reg.absolute;
-     insn->bits3.da1.src1_negate = reg.negation;
-
-     assert(insn->bits1.da1.src0_reg_file != GEN_IMMEDIATE_VALUE);
-
-     if (reg.file == GEN_IMMEDIATE_VALUE)
-       insn->bits3.ud = reg.value.ud;
-     else {
-       assert (reg.address_mode == GEN_ADDRESS_DIRECT);
-       if (insn->header.access_mode == GEN_ALIGN_1) {
-         insn->bits3.da1.src1_subreg_nr = reg.subnr;
-         insn->bits3.da1.src1_reg_nr = reg.nr;
-       } else {
-         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
-         insn->bits3.da16.src1_reg_nr = reg.nr;
-       }
-
-       if (reg.width == GEN_WIDTH_1 &&
-           insn->header.execution_size == GEN_WIDTH_1) {
-         insn->bits3.da1.src1_horiz_stride = GEN_HORIZONTAL_STRIDE_0;
-         insn->bits3.da1.src1_width = GEN_WIDTH_1;
-         insn->bits3.da1.src1_vert_stride = GEN_VERTICAL_STRIDE_0;
-       } else {
-         insn->bits3.da1.src1_horiz_stride = reg.hstride;
-         insn->bits3.da1.src1_width = reg.width;
-         insn->bits3.da1.src1_vert_stride = reg.vstride;
-       }
-     }
-  }
-
   static const uint32_t untypedRWMask[] = {
     GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN|GEN_UNTYPED_RED,
     GEN_UNTYPED_ALPHA|GEN_UNTYPED_BLUE|GEN_UNTYPED_GREEN,
@@ -532,7 +399,7 @@ namespace gbe
     this->setSrc0(insn, GenRegister::ud8grf(src.nr, 0));
     this->setSrc1(insn, GenRegister::immud(0));
 
-    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+    const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
     setMessageDescriptor(insn, sfid, msg_length, response_length);
     insn->bits3.gen7_atomic_op.msg_type = GEN7_UNTYPED_ATOMIC_READ;
     insn->bits3.gen7_atomic_op.bti = bti;
@@ -695,83 +562,6 @@ namespace gbe
     }
   }
 
-#define NO_SWIZZLE ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-
-  static GenNativeInstruction *alu3(GenEncoder *p,
-                              uint32_t opcode,
-                              GenRegister dest,
-                              GenRegister src0,
-                              GenRegister src1,
-                              GenRegister src2)
-  {
-     GenNativeInstruction *insn = p->next(opcode);
-
-     assert(dest.file == GEN_GENERAL_REGISTER_FILE);
-     assert(dest.nr < 128);
-     assert(dest.address_mode == GEN_ADDRESS_DIRECT);
-     assert(dest.type = GEN_TYPE_F);
-     insn->bits1.da3src.dest_reg_file = 0;
-     insn->bits1.da3src.dest_reg_nr = dest.nr;
-     insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
-     insn->bits1.da3src.dest_writemask = 0xf;
-     p->setHeader(insn);
-     insn->header.access_mode = GEN_ALIGN_16;
-     insn->header.execution_size = GEN_WIDTH_8;
-
-     assert(src0.file == GEN_GENERAL_REGISTER_FILE);
-     assert(src0.address_mode == GEN_ADDRESS_DIRECT);
-     assert(src0.nr < 128);
-     assert(src0.type == GEN_TYPE_F);
-     insn->bits2.da3src.src0_swizzle = NO_SWIZZLE;
-     insn->bits2.da3src.src0_subreg_nr = src0.subnr / 4 ;
-     insn->bits2.da3src.src0_reg_nr = src0.nr;
-     insn->bits1.da3src.src0_abs = src0.absolute;
-     insn->bits1.da3src.src0_negate = src0.negation;
-     insn->bits2.da3src.src0_rep_ctrl = src0.vstride == GEN_VERTICAL_STRIDE_0;
-
-     assert(src1.file == GEN_GENERAL_REGISTER_FILE);
-     assert(src1.address_mode == GEN_ADDRESS_DIRECT);
-     assert(src1.nr < 128);
-     assert(src1.type == GEN_TYPE_F);
-     insn->bits2.da3src.src1_swizzle = NO_SWIZZLE;
-     insn->bits2.da3src.src1_subreg_nr_low = (src1.subnr / 4) & 0x3;
-     insn->bits3.da3src.src1_subreg_nr_high = (src1.subnr / 4) >> 2;
-     insn->bits2.da3src.src1_rep_ctrl = src1.vstride == GEN_VERTICAL_STRIDE_0;
-     insn->bits3.da3src.src1_reg_nr = src1.nr;
-     insn->bits1.da3src.src1_abs = src1.absolute;
-     insn->bits1.da3src.src1_negate = src1.negation;
-
-     assert(src2.file == GEN_GENERAL_REGISTER_FILE);
-     assert(src2.address_mode == GEN_ADDRESS_DIRECT);
-     assert(src2.nr < 128);
-     assert(src2.type == GEN_TYPE_F);
-     insn->bits3.da3src.src2_swizzle = NO_SWIZZLE;
-     insn->bits3.da3src.src2_subreg_nr = src2.subnr / 4;
-     insn->bits3.da3src.src2_rep_ctrl = src2.vstride == GEN_VERTICAL_STRIDE_0;
-     insn->bits3.da3src.src2_reg_nr = src2.nr;
-     insn->bits1.da3src.src2_abs = src2.absolute;
-     insn->bits1.da3src.src2_negate = src2.negation;
-
-     // Emit second half of the instruction
-     if (p->curr.execWidth == 16) {
-      GenNativeInstruction q1Insn = *insn;
-      insn = p->next(opcode);
-      *insn = q1Insn;
-      insn->header.quarter_control = GEN_COMPRESSION_Q2;
-      insn->bits1.da3src.dest_reg_nr++;
-      if (insn->bits2.da3src.src0_rep_ctrl == 0)
-        insn->bits2.da3src.src0_reg_nr++;
-      if (insn->bits2.da3src.src1_rep_ctrl == 0)
-        insn->bits3.da3src.src1_reg_nr++;
-      if (insn->bits3.da3src.src2_rep_ctrl == 0)
-        insn->bits3.da3src.src2_reg_nr++;
-     }
-
-     return insn;
-  }
-
-#undef NO_SWIZZLE
-
 #define ALU1(OP) \
   void GenEncoder::OP(GenRegister dest, GenRegister src0, uint32_t condition) { \
     alu1(this, GEN_OPCODE_##OP, dest, src0, condition); \
@@ -790,7 +580,7 @@ namespace gbe
 
 #define ALU3(OP) \
   void GenEncoder::OP(GenRegister dest, GenRegister src0, GenRegister src1, GenRegister src2) { \
-    alu3(this, GEN_OPCODE_##OP, dest, src0, src1, src2); \
+    this->alu3(GEN_OPCODE_##OP, dest, src0, src1, src2); \
   }
 
   void GenEncoder::LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value) {
@@ -822,38 +612,15 @@ namespace gbe
     pop();
   }
 
-  void GenEncoder::UPSAMPLE_SHORT(GenRegister dest, GenRegister src0, GenRegister src1) {
-    dest.type = GEN_TYPE_B;
-    dest.hstride = GEN_HORIZONTAL_STRIDE_2;
-    src0.type = GEN_TYPE_B;
-    src0.hstride = GEN_HORIZONTAL_STRIDE_2;
-    src1.type = GEN_TYPE_B;
-    src1.hstride = GEN_HORIZONTAL_STRIDE_2;
-    MOV(dest, src1);
-    dest.subnr ++;
-    MOV(dest, src0);
-  }
-
-  void GenEncoder::UPSAMPLE_INT(GenRegister dest, GenRegister src0, GenRegister src1) {
-    dest.type = GEN_TYPE_W;
-    dest.hstride = GEN_HORIZONTAL_STRIDE_2;
-    src0.type = GEN_TYPE_W;
-    src0.hstride = GEN_HORIZONTAL_STRIDE_2;
-    src1.type = GEN_TYPE_W;
-    src1.hstride = GEN_HORIZONTAL_STRIDE_2;
-    MOV(dest, src1);
-    dest.subnr += 2;
-    MOV(dest, src0);
-  }
-
   void GenEncoder::LOAD_INT64_IMM(GenRegister dest, int64_t value) {
     GenRegister u0 = GenRegister::immd((int)value), u1 = GenRegister::immd(value >> 32);
     MOV(dest.bottom_half(), u0);
     MOV(dest.top_half(this->simdWidth), u1);
   }
 
-  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister r) {
+  void GenEncoder::MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp) {
     GBE_ASSERT((src0.type == GEN_TYPE_F && dest.isdf()) || (src0.isdf() && dest.type == GEN_TYPE_F));
+    GenRegister r = GenRegister::retype(tmp, GEN_TYPE_F);
     int w = curr.execWidth;
     GenRegister r0;
     int factor = 1;
@@ -894,6 +661,14 @@ namespace gbe
     }
   }
 
+  void GenEncoder::F16TO32(GenRegister dest, GenRegister src0) {
+    alu1(this, GEN_OPCODE_F16TO32, dest, src0);
+  }
+
+  void GenEncoder::F32TO16(GenRegister dest, GenRegister src0) {
+    alu1(this, GEN_OPCODE_F32TO16, dest, src0);
+  }
+
   ALU1(MOV)
   ALU1(RNDZ)
   ALU1(RNDE)
@@ -901,8 +676,7 @@ namespace gbe
   ALU1(RNDU)
   ALU1(FBH)
   ALU1(FBL)
-  ALU1(F16TO32)
-  ALU1(F32TO16)
+  ALU1(CBIT)
   ALU2(SEL)
   ALU1(NOT)
   ALU2_MOD(AND)
@@ -1007,7 +781,7 @@ namespace gbe
     this->setHeader(insn);
     this->setDst(insn, dst);
     this->setSrc0(insn, dst);
-    setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA_CACHE, 1, 1, 1);
+    setMessageDescriptor(insn, GEN_SFID_DATAPORT_DATA, 1, 1, 1);
     insn->bits3.gen7_memory_fence.msg_type = GEN_MEM_FENCE;
     insn->bits3.gen7_memory_fence.commit_enable = 0x1;
   }
@@ -1024,29 +798,41 @@ namespace gbe
   }
 
   ALU2_BRA(IF)
+  ALU2_BRA(ELSE)
   ALU2_BRA(ENDIF)
+  ALU2_BRA(WHILE)
   ALU2_BRA(BRD)
   ALU2_BRA(BRC)
 
-  void GenEncoder::patchJMPI(uint32_t insnID, int32_t jumpDistance) {
+  void GenEncoder::patchJMPI(uint32_t insnID, int32_t jip, int32_t uip) {
     GenNativeInstruction &insn = *(GenNativeInstruction *)&this->store[insnID];
     GBE_ASSERT(insnID < this->store.size());
     GBE_ASSERT(insn.header.opcode == GEN_OPCODE_JMPI ||
                insn.header.opcode == GEN_OPCODE_BRD  ||
                insn.header.opcode == GEN_OPCODE_ENDIF ||
                insn.header.opcode == GEN_OPCODE_IF ||
-               insn.header.opcode == GEN_OPCODE_BRC);
-
-    if (insn.header.opcode != GEN_OPCODE_JMPI || (jumpDistance > -32769 && jumpDistance < 32768))  {
-           if (insn.header.opcode == GEN_OPCODE_IF) {
-             this->setSrc1(&insn, GenRegister::immd(jumpDistance));
-             return;
-           }
-           else if (insn.header.opcode == GEN_OPCODE_JMPI) {
-             jumpDistance = jumpDistance - 2;
-           }
-
-           this->setSrc1(&insn, GenRegister::immd(jumpDistance));
+               insn.header.opcode == GEN_OPCODE_BRC ||
+               insn.header.opcode == GEN_OPCODE_WHILE ||
+               insn.header.opcode == GEN_OPCODE_ELSE);
+
+    if( insn.header.opcode == GEN_OPCODE_WHILE ){
+      // If this WHILE instruction jumps back to an ELSE instruction,
+      // we need to add to the distance to go to the next instruction.
+      GenNativeInstruction & insn_else = *(GenNativeInstruction *)&this->store[insnID+jip];
+      if(insn_else.header.opcode == GEN_OPCODE_ELSE){
+        jip += 2;
+      }
+    }
+
+    if (insn.header.opcode != GEN_OPCODE_JMPI || (jip > -32769 && jip < 32768))  {
+      if (insn.header.opcode == GEN_OPCODE_IF) {
+        this->setSrc1(&insn, GenRegister::immd((jip & 0xffff) | uip<<16));
+        return;
+      } else if (insn.header.opcode == GEN_OPCODE_JMPI) {
+        jip = jip - 2;
+      } else if(insn.header.opcode == GEN_OPCODE_ENDIF)
+        jip += 2;
+       this->setSrc1(&insn, GenRegister::immd((jip & 0xffff) | uip<<16));
     } else if ( insn.header.predicate_control == GEN_PREDICATE_NONE ) {
       // For the conditional jump distance out of S15 range, we need to use an
       // inverted jmp followed by a add ip, ip, distance to implement.
@@ -1058,13 +844,11 @@ namespace gbe
       // for all the branching instruction. And need to adjust the distance
       // for those branch instruction's start point and end point contains
       // this instruction.
-      GenNativeInstruction *insn2 = (GenNativeInstruction *)&this->store[insnID+2];
-      GBE_ASSERT(insn2->header.opcode == GEN_OPCODE_NOP);
-      insn2 = insn2;
+      GBE_ASSERT(((GenNativeInstruction *)&this->store[insnID+2])->header.opcode == GEN_OPCODE_NOP);
       insn.header.opcode = GEN_OPCODE_ADD;
       this->setDst(&insn, GenRegister::ip());
       this->setSrc0(&insn, GenRegister::ip());
-      this->setSrc1(&insn, GenRegister::immd(jumpDistance * 8));
+      this->setSrc1(&insn, GenRegister::immd(jip * 8));
     } else {
       GenNativeInstruction &insn2 = *(GenNativeInstruction *)&this->store[insnID+2];
       insn.header.predicate_inverse ^= 1;
@@ -1075,7 +859,7 @@ namespace gbe
       insn2.header.opcode = GEN_OPCODE_ADD;
       this->setDst(&insn2, GenRegister::ip());
       this->setSrc0(&insn2, GenRegister::ip());
-      this->setSrc1(&insn2, GenRegister::immd((jumpDistance - 2) * 8));
+      this->setSrc1(&insn2, GenRegister::immd((jip - 2) * 8));
     }
   }
 
@@ -1261,7 +1045,7 @@ namespace gbe
                                    uint32_t msg_length,
                                    uint32_t response_length)
   {
-     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA_CACHE;
+     const GenMessageTarget sfid = GEN_SFID_DATAPORT_DATA;
      p->setMessageDescriptor(insn, sfid, msg_length, response_length, true);
      insn->bits3.gen7_scratch_rw.block_size = block_size;
      insn->bits3.gen7_scratch_rw.msg_type = msg_type;
diff --git a/backend/src/backend/gen_encoder.hpp b/backend/src/backend/gen_encoder.hpp
index d6e2b97..9343581 100644
--- a/backend/src/backend/gen_encoder.hpp
+++ b/backend/src/backend/gen_encoder.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -70,8 +70,6 @@ namespace gbe
     virtual ~GenEncoder(void) { }
     /*! Size of the stack (should be large enough) */
     enum { MAX_STATE_NUM = 16 };
-    /*! gen7 exec width of the double data type */
-    #define GEN7_DOUBLE_EXEC_WIDTH  8
     /*! Push the current instruction state */
     void push(void);
     /*! Pop the latest pushed state */
@@ -101,15 +99,12 @@ namespace gbe
     ALU1(MOV)
     ALU1(FBH)
     ALU1(FBL)
+    ALU1(CBIT)
     ALU2(SUBB)
-    ALU2(UPSAMPLE_SHORT)
-    ALU2(UPSAMPLE_INT)
     ALU1(RNDZ)
     ALU1(RNDE)
     ALU1(RNDD)
     ALU1(RNDU)
-    ALU1(F16TO32)
-    ALU1(F32TO16)
     ALU2(SEL)
     ALU1(NOT)
     ALU2_MOD(AND)
@@ -137,8 +132,11 @@ namespace gbe
 #undef ALU2
 #undef ALU2_MOD
 #undef ALU3
+
+    virtual void F16TO32(GenRegister dest, GenRegister src0);
+    virtual void F32TO16(GenRegister dest, GenRegister src0);
     /*! Get double/long exec width */
-    virtual int getDoubleExecWidth(void) { return GEN7_DOUBLE_EXEC_WIDTH; }
+    virtual int getDoubleExecWidth(void) = 0;
     virtual void MOV_DF(GenRegister dest, GenRegister src0, GenRegister tmp = GenRegister::null());
     virtual void LOAD_DF_IMM(GenRegister dest, GenRegister tmp, double value);
     void LOAD_INT64_IMM(GenRegister dest, int64_t value);
@@ -150,8 +148,12 @@ namespace gbe
     virtual void JMPI(GenRegister src, bool longjmp = false);
     /*! IF indexed instruction */
     void IF(GenRegister src);
+    /*! ELSE indexed instruction */
+    void ELSE(GenRegister src);
     /*! ENDIF indexed instruction */
     void ENDIF(GenRegister src);
+    /*! WHILE indexed instruction */
+    void WHILE(GenRegister src);
     /*! BRC indexed instruction */
     void BRC(GenRegister src);
     /*! BRD indexed instruction */
@@ -205,12 +207,11 @@ namespace gbe
     void MATH(GenRegister dst, uint32_t function, GenRegister src);
 
     /*! Patch JMPI/BRC/BRD (located at index insnID) with the given jump distance */
-    virtual void patchJMPI(uint32_t insnID, int32_t jumpDistance);
+    virtual void patchJMPI(uint32_t insnID, int32_t jip, int32_t uip);
 
     ////////////////////////////////////////////////////////////////////////
     // Helper functions to encode
     ////////////////////////////////////////////////////////////////////////
-    virtual void setHeader(GenNativeInstruction *insn);
     virtual void setDPUntypedRW(GenNativeInstruction *insn, uint32_t bti, uint32_t rgba,
                                 uint32_t msg_type, uint32_t msg_length,
                                 uint32_t response_length);
@@ -220,13 +221,18 @@ namespace gbe
     void setMessageDescriptor(GenNativeInstruction *inst, enum GenMessageTarget sfid,
                               unsigned msg_length, unsigned response_length,
                               bool header_present = false, bool end_of_thread = false);
-    void setDst(GenNativeInstruction *insn, GenRegister dest);
-    void setSrc0(GenNativeInstruction *insn, GenRegister reg);
-    void setSrc1(GenNativeInstruction *insn, GenRegister reg);
+    virtual void setHeader(GenNativeInstruction *insn) = 0;
+    virtual void setDst(GenNativeInstruction *insn, GenRegister dest) = 0;
+    virtual void setSrc0(GenNativeInstruction *insn, GenRegister reg) = 0;
+    virtual void setSrc1(GenNativeInstruction *insn, GenRegister reg) = 0;
     GenCompactInstruction *nextCompact(uint32_t opcode);
+    virtual bool disableCompact() { return false; }
     GenNativeInstruction *next(uint32_t opcode);
     uint32_t n_instruction(void) const { return store.size(); }
     GBE_CLASS(GenEncoder); //!< Use custom allocators
+
+    virtual void alu3(uint32_t opcode, GenRegister dst,
+                       GenRegister src0, GenRegister src1, GenRegister src2) = 0;
   };
 
   void alu1(GenEncoder *p, uint32_t opcode, GenRegister dst,
diff --git a/backend/src/backend/gen_insn_compact.cpp b/backend/src/backend/gen_insn_compact.cpp
index f19c364..d692fff 100644
--- a/backend/src/backend/gen_insn_compact.cpp
+++ b/backend/src/backend/gen_insn_compact.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -260,12 +260,14 @@ namespace gbe {
     uint32_t data;
   };
 
-  void decompactInstruction(GenCompactInstruction * p, GenNativeInstruction *pOut) {
+  void decompactInstruction(GenCompactInstruction * p, void *insn) {
+    Gen7NativeInstruction *pOut = (union Gen7NativeInstruction *) insn;
+    GenNativeInstruction *pNative = (union GenNativeInstruction *) insn;
 
-    memset(pOut, 0, sizeof(GenNativeInstruction));
+    memset(pOut, 0, sizeof(Gen7NativeInstruction));
     union ControlBits control_bits;
     control_bits.data = control_table[(uint32_t)p->bits1.control_index].bit_pattern;
-    pOut->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
+    pNative->low.low = (uint32_t)p->bits1.opcode | ((control_bits.data & 0xffff) << 8);
     pOut->header.destreg_or_condmod = p->bits1.destreg_or_condmod;
     pOut->header.saturate = control_bits.saturate;
     pOut->header.acc_wr_control = p->bits1.acc_wr_control;
@@ -279,7 +281,7 @@ namespace gbe {
     subreg_bits.data = subreg_table[(uint32_t)p->bits1.sub_reg_index].bit_pattern;
     src0_bits.data = srcreg_table[p->bits1.src0_index_lo | p->bits2.src0_index_hi << 2].bit_pattern;
 
-    pOut->low.high |= data_type_bits.data & 0x7fff;
+    pNative->low.high |= data_type_bits.data & 0x7fff;
     pOut->bits1.da1.dest_horiz_stride = data_type_bits.dest_horiz_stride;
     pOut->bits1.da1.dest_address_mode = data_type_bits.dest_address_mode;
     pOut->bits1.da1.dest_reg_nr = p->bits2.dest_reg_nr;
@@ -287,7 +289,7 @@ namespace gbe {
 
     pOut->bits2.da1.src0_subreg_nr = subreg_bits.src0_subreg_nr;
     pOut->bits2.da1.src0_reg_nr = p->bits2.src0_reg_nr;
-    pOut->high.low |= (src0_bits.data << 13);
+    pNative->high.low |= (src0_bits.data << 13);
     pOut->bits2.da1.flag_sub_reg_nr = control_bits.flag_sub_reg_nr;
     pOut->bits2.da1.flag_reg_nr = control_bits.flag_reg_nr;
 
@@ -299,7 +301,7 @@ namespace gbe {
       src1_bits.data = srcreg_table[p->bits2.src1_index].bit_pattern;
       pOut->bits3.da1.src1_subreg_nr = subreg_bits.src1_subreg_nr;
       pOut->bits3.da1.src1_reg_nr = p->bits2.src1_reg_nr;
-      pOut->high.high |= (src1_bits.data << 13);
+      pNative->high.high |= (src1_bits.data << 13);
     }
   }
 
@@ -438,6 +440,9 @@ namespace gbe {
   }
 
   bool compactAlu1(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src, uint32_t condition, bool split) {
+    if(p->disableCompact())
+      return false;
+
     if(split) {
       // TODO support it
       return false;
@@ -473,6 +478,9 @@ namespace gbe {
   }
 
   bool compactAlu2(GenEncoder *p, uint32_t opcode, GenRegister dst, GenRegister src0, GenRegister src1, uint32_t condition, bool split) {
+    if(p->disableCompact())
+      return false;
+
     if(split) {
       // TODO support it
       return false;
diff --git a/backend/src/backend/gen_insn_scheduling.cpp b/backend/src/backend/gen_insn_scheduling.cpp
index 106d608..5538a74 100644
--- a/backend/src/backend/gen_insn_scheduling.cpp
+++ b/backend/src/backend/gen_insn_scheduling.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -190,6 +190,10 @@ namespace gbe
     static const uint32_t MAX_FLAG_REGISTER = 8u;
     /*! Maximum number of *physical* accumulators registers */
     static const uint32_t MAX_ACC_REGISTER = 1u;
+    /*! Maximum number of *physical* tm registers */
+    static const uint32_t MAX_TM_REGISTER = 1u;
+    /*! Maximum number of *physical* arf registers */
+    static const uint32_t MAX_ARF_REGISTER = MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_TM_REGISTER;
     /*! Stores the last node that wrote to a register / memory ... */
     vector<ScheduleDAGNode*> nodes;
     /*! store nodes each node depends on */
@@ -237,12 +241,12 @@ namespace gbe
   {
     if (scheduler.policy == PRE_ALLOC) {
       this->grfNum = selection.getRegNum();
-      nodes.resize(grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
+      nodes.resize(grfNum + MAX_ARF_REGISTER + MAX_MEM_SYSTEM);
     } else {
       const uint32_t simdWidth = scheduler.ctx.getSimdWidth();
       GBE_ASSERT(simdWidth == 8 || simdWidth == 16);
       this->grfNum = simdWidth == 8 ? 128 : 64;
-      nodes.resize(grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER + MAX_MEM_SYSTEM);
+      nodes.resize(grfNum + MAX_ARF_REGISTER + MAX_MEM_SYSTEM);
     }
     insnNodes.resize(selection.getLargestBlockSize());
   }
@@ -327,6 +331,8 @@ namespace gbe
         } else if (file == GEN_ARF_ACCUMULATOR) {
           GBE_ASSERT(nr < MAX_ACC_REGISTER);
           return grfNum + MAX_FLAG_REGISTER + nr;
+        } else if (file == GEN_ARF_TM) {
+          return grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
         } else {
           NOT_SUPPORTED;
           return 0;
@@ -348,7 +354,7 @@ namespace gbe
   }
 
   uint32_t DependencyTracker::getIndex(uint32_t bti) const {
-    const uint32_t memDelta = grfNum + MAX_FLAG_REGISTER + MAX_ACC_REGISTER;
+    const uint32_t memDelta = grfNum + MAX_ARF_REGISTER;
     return bti == 0xfe ? memDelta + LOCAL_MEMORY : (bti == 0xff ? memDelta + SCRATCH_MEMORY : memDelta + GLOBAL_MEMORY);
   }
 
@@ -582,7 +588,8 @@ namespace gbe
     for (int32_t insnID = 0; insnID < insnNum; ++insnID) {
       ScheduleDAGNode *node = tracker.insnNodes[insnID];
       if (node->insn.isBranch() || node->insn.isLabel()
-          || node->insn.opcode == SEL_OP_EOT || node->insn.opcode == SEL_OP_IF
+          || node->insn.opcode == SEL_OP_EOT || node->insn.opcode == SEL_OP_IF || node->insn.opcode == SEL_OP_WHILE
+          || node->insn.opcode == SEL_OP_READ_ARF
           || node->insn.opcode == SEL_OP_BARRIER)
         tracker.makeBarrier(insnID, insnNum);
     }
diff --git a/backend/src/backend/gen_insn_scheduling.hpp b/backend/src/backend/gen_insn_scheduling.hpp
index 534557d..b8b60a6 100644
--- a/backend/src/backend/gen_insn_scheduling.hpp
+++ b/backend/src/backend/gen_insn_scheduling.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/backend/gen_insn_selection.cpp b/backend/src/backend/gen_insn_selection.cpp
index 96d3965..cd968c0 100644
--- a/backend/src/backend/gen_insn_selection.cpp
+++ b/backend/src/backend/gen_insn_selection.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -217,7 +217,7 @@ namespace gbe
   // SelectionBlock
   ///////////////////////////////////////////////////////////////////////////
 
-  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0){}
+  SelectionBlock::SelectionBlock(const ir::BasicBlock *bb) : bb(bb), isLargeBlock(false), endifLabel( (ir::LabelIndex) 0), removeSimpleIfEndif(false){}
 
   void SelectionBlock::append(ir::Register reg) { tmp.push_back(reg); }
 
@@ -245,7 +245,7 @@ namespace gbe
   public:
     INLINE SelectionDAG(const ir::Instruction &insn) :
       insn(insn), mergeable(0), childNum(insn.getSrcNum()), isRoot(0) {
-      GBE_ASSERT(insn.getSrcNum() < 127);
+      GBE_ASSERT(insn.getSrcNum() <= ir::Instruction::MAX_SRC_NUM);
       for (uint32_t childID = 0; childID < childNum; ++childID)
         this->child[childID] = NULL;
       computeBool = false;
@@ -343,6 +343,8 @@ namespace gbe
     /*! should add per thread offset to the local memory address when load/store/atomic */
     bool needPatchSLMAddr() const { return patchSLMAddr; }
     void setPatchSLMAddr(bool b) { patchSLMAddr = b; }
+    bool has32X32Mul() const { return bHas32X32Mul; }
+    void setHas32X32Mul(bool b) { bHas32X32Mul = b; }
     /*! indicate whether a register is a scalar/uniform register. */
     INLINE bool isScalarReg(const ir::Register &reg) const {
       const ir::RegisterData &regData = getRegisterData(reg);
@@ -403,6 +405,10 @@ namespace gbe
     uint32_t buildBasicBlockDAG(const ir::BasicBlock &bb);
     /*! Perform the selection on the basic block */
     void matchBasicBlock(const ir::BasicBlock &bb, uint32_t insnNum);
+    /*! a simple block can use predication instead of if/endif*/
+    bool isSimpleBlock(const ir::BasicBlock &bb, uint32_t insnNum);
+    /*! an instruction has a QWORD family src or dst operand. */
+    bool hasQWord(const ir::Instruction &insn);
     /*! A root instruction needs to be generated */
     bool isRoot(const ir::Instruction &insn) const;
 
@@ -458,6 +464,7 @@ namespace gbe
 #define I64Shift(OP) \
   INLINE void OP(Reg dst, Reg src0, Reg src1, GenRegister tmp[6]) { I64Shift(SEL_OP_##OP, dst, src0, src1, tmp); }
     ALU1(MOV)
+    ALU1(READ_ARF)
     ALU1WithTemp(MOV_DF)
     ALU1WithTemp(LOAD_DF_IMM)
     ALU1(LOAD_INT64_IMM)
@@ -492,10 +499,9 @@ namespace gbe
     ALU2WithTemp(MUL_HI)
     ALU1(FBH)
     ALU1(FBL)
+    ALU1(CBIT)
     ALU2WithTemp(HADD)
     ALU2WithTemp(RHADD)
-    ALU2(UPSAMPLE_SHORT)
-    ALU2(UPSAMPLE_INT)
     ALU2(UPSAMPLE_LONG)
     ALU1WithTemp(CONVI_TO_I64)
     ALU1WithTemp(CONVF_TO_I64)
@@ -539,8 +545,12 @@ namespace gbe
     int JMPI(Reg src, ir::LabelIndex target, ir::LabelIndex origin);
     /*! IF indexed instruction */
     void IF(Reg src, ir::LabelIndex jip, ir::LabelIndex uip);
+    /*! ELSE indexed instruction */
+    void ELSE(Reg src, ir::LabelIndex jip, ir::LabelIndex elseLabel);
     /*! ENDIF indexed instruction */
-    void ENDIF(Reg src, ir::LabelIndex jip);
+    void ENDIF(Reg src, ir::LabelIndex jip, ir::LabelIndex endifLabel = ir::LabelIndex(0));
+    /*! WHILE indexed instruction */
+    void WHILE(Reg src, ir::LabelIndex jip);
     /*! BRD indexed instruction */
     void BRD(Reg src, ir::LabelIndex jip);
     /*! BRC indexed instruction */
@@ -573,10 +583,10 @@ namespace gbe
     void BYTE_SCATTER(Reg addr, Reg src, uint32_t elemSize, uint32_t bti);
     /*! DWord scatter (for constant cache read) */
     void DWORD_GATHER(Reg dst, Reg addr, uint32_t bti);
-    /*! Unpack the uint to char4 */
-    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum);
-    /*! pack the char4 to uint */
-    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum);
+    /*! Unpack the uint to charN */
+    void UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum);
+    /*! pack the charN to uint */
+    void PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum);
     /*! Extended math function (2 arguments) */
     void MATH(Reg dst, uint32_t function, Reg src0, Reg src1);
     /*! Extended math function (1 argument) */
@@ -620,6 +630,7 @@ namespace gbe
     /*! Auxiliary label for if/endif. */ 
     uint16_t currAuxLabel;
     bool patchSLMAddr;
+    bool bHas32X32Mul;
     INLINE ir::LabelIndex newAuxLabel()
     {
       currAuxLabel++;
@@ -658,7 +669,8 @@ namespace gbe
     ctx(ctx), block(NULL),
     curr(ctx.getSimdWidth()), file(ctx.getFunction().getRegisterFile()),
     maxInsnNum(ctx.getFunction().getLargestBlockSize()), dagPool(maxInsnNum),
-    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()), patchSLMAddr(false)
+    stateNum(0), vectorNum(0), bwdCodeGeneration(false), currAuxLabel(ctx.getFunction().labelNum()),
+    patchSLMAddr(false), bHas32X32Mul(false)
   {
     const ir::Function &fn = ctx.getFunction();
     this->regNum = fn.regNum();
@@ -790,13 +802,8 @@ namespace gbe
           }
         }
 
-        if (poolOffset > ctx.reservedSpillRegs) {
-          if (GBE_DEBUG)
-            std::cerr << "Instruction (#" << (uint32_t)insn.opcode
-                      << ") src too large pooloffset "
-                      << (uint32_t)poolOffset << std::endl;
+        if (poolOffset > ctx.reservedSpillRegs)
           return false;
-        }
         // FIXME, to support post register allocation scheduling,
         // put all the reserved register to the spill/unspill's destination registers.
         // This is not the best way. We need to refine the spill/unspill instruction to
@@ -860,13 +867,8 @@ namespace gbe
           }
         }
 
-        if (poolOffset > ctx.reservedSpillRegs){
-          if (GBE_DEBUG)
-           std::cerr << "Instruction (#" << (uint32_t)insn.opcode
-                     << ") dst too large pooloffset "
-                     << (uint32_t)poolOffset << std::endl;
+        if (poolOffset > ctx.reservedSpillRegs)
           return false;
-        }
         while(!regSet.empty()) {
           struct RegSlot regSlot = regSet.back();
           regSet.pop_back();
@@ -917,6 +919,11 @@ namespace gbe
       SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
       mov->src(0) = GenRegister::retype(insn->src(regID), gr.type);
       mov->state = GenInstructionState(simdWidth);
+      if(this->block->removeSimpleIfEndif){
+        mov->state.predicate = GEN_PREDICATE_NORMAL;
+        mov->state.flag = 0;
+        mov->state.subFlag = 0;
+      }
       if (this->isScalarReg(insn->src(regID).reg()))
         mov->state.noMask = 1;
       mov->dst(0) = gr;
@@ -946,6 +953,11 @@ namespace gbe
       SelectionInstruction *mov = this->create(SEL_OP_MOV, 1, 1);
       mov->dst(0) = GenRegister::retype(insn->dst(regID), gr.type);
       mov->state = GenInstructionState(simdWidth);
+      if(this->block->removeSimpleIfEndif){
+        mov->state.predicate = GEN_PREDICATE_NORMAL;
+        mov->state.flag = 0;
+        mov->state.subFlag = 0;
+      }
       if (simdWidth == 1) {
         mov->state.noMask = 1;
         mov->src(0) = GenRegister::retype(GenRegister::vec1(GEN_GENERAL_REGISTER_FILE, gr.reg()), gr.type);
@@ -1041,14 +1053,31 @@ namespace gbe
     insn->index1 = uint16_t(uip);
   }
 
-  void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip) {
-    this->block->endifLabel = this->newAuxLabel();
+  void Selection::Opaque::ELSE(Reg src, ir::LabelIndex jip, ir::LabelIndex elseLabel) {
+    // Append a SEL_OP_ELSE instruction (no destination, one source) and emit the label elseLabel.
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_ELSE, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip); // jip is stored narrowed to 16 bits
+    this->LABEL(elseLabel);
+  }
+
+  void Selection::Opaque::ENDIF(Reg src, ir::LabelIndex jip, ir::LabelIndex endifLabel) { // NOTE(review): jip is not referenced in this body
+    if(endifLabel == 0) // 0 means "no label supplied": allocate a fresh auxiliary label
+      this->block->endifLabel = this->newAuxLabel();
+    else // otherwise reuse the label chosen by the caller
+      this->block->endifLabel = endifLabel;
     this->LABEL(this->block->endifLabel);
     SelectionInstruction *insn = this->appendInsn(SEL_OP_ENDIF, 0, 1);
     insn->src(0) = src;
     insn->index = uint16_t(this->block->endifLabel);
   }
 
+  void Selection::Opaque::WHILE(Reg src, ir::LabelIndex jip) { // Append a SEL_OP_WHILE instruction (no destination, one source).
+    SelectionInstruction *insn = this->appendInsn(SEL_OP_WHILE, 0, 1);
+    insn->src(0) = src;
+    insn->index = uint16_t(jip); // jip is stored narrowed to 16 bits
+  }
+
   void Selection::Opaque::CMP(uint32_t conditional, Reg src0, Reg src1, Reg dst) {
     SelectionInstruction *insn = this->appendInsn(SEL_OP_CMP, 1, 2);
     insn->src(0) = src0;
@@ -1242,16 +1271,18 @@ namespace gbe
     srcVector->reg = &insn->src(0);
   }
 
-  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemNum) {
+  void Selection::Opaque::UNPACK_BYTE(const GenRegister *dst, const GenRegister src, uint32_t elemSize, uint32_t elemNum) { // one source, elemNum destinations read from dst[]
     SelectionInstruction *insn = this->appendInsn(SEL_OP_UNPACK_BYTE, elemNum, 1);
     insn->src(0) = src;
+    insn->extra.elem = 4 / elemSize; // presumably elements per dword, with elemSize in bytes - TODO confirm; NOTE(review): elemSize == 0 would divide by zero
     for(uint32_t i = 0; i < elemNum; i++)
       insn->dst(i) = dst[i];
   }
-  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemNum) {
+  void Selection::Opaque::PACK_BYTE(const GenRegister dst, const GenRegister *src, uint32_t elemSize, uint32_t elemNum) { // one destination, elemNum sources read from src[]
     SelectionInstruction *insn = this->appendInsn(SEL_OP_PACK_BYTE, 1, elemNum);
     for(uint32_t i = 0; i < elemNum; i++)
       insn->src(i) = src[i];
+    insn->extra.elem = 4 / elemSize; // presumably elements per dword, with elemSize in bytes - TODO confirm; NOTE(review): elemSize == 0 would divide by zero
     insn->dst(0) = dst;
   }
 
@@ -1455,6 +1486,66 @@ namespace gbe
     return false;
   }
 
+  bool Selection::Opaque::hasQWord(const ir::Instruction &insn) { // true if any source or destination register of insn is in FAMILY_QWORD
+    for (uint32_t i = 0; i < insn.getSrcNum(); i++) { // check the sources first
+      const ir::Register reg = insn.getSrc(i);
+      if (getRegisterFamily(reg) == ir::FAMILY_QWORD)
+        return true;
+    }
+    for (uint32_t i = 0; i < insn.getDstNum(); i++) { // then the destinations
+      const ir::Register reg = insn.getDst(i);
+      if (getRegisterFamily(reg) == ir::FAMILY_QWORD)
+        return true;
+    }
+    return false; // no QWORD operand found
+  } 
+
+  bool Selection::Opaque::isSimpleBlock(const ir::BasicBlock &bb, uint32_t insnNum) {
+    // Returns true only if none of the disqualifying cases below is found in insnDAG[0 .. insnNum-1].
+    // FIXME: structured innermost if/else/endif blocks should be accepted as well.
+    if(bb.belongToStructure)
+      return false;
+
+    // FIXME: scalar registers should not be excluded; they just need some special handling.
+    for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
+      SelectionDAG &dag = *insnDAG[insnID];
+      const ir::Instruction& insn = dag.insn;
+      if ( (insn.getDstNum() && this->isScalarReg(insn.getDst(0)) == true) ||
+         insn.isMemberOf<ir::CompareInstruction>() ||
+         insn.isMemberOf<ir::SelectInstruction>() ||
+         insn.getOpcode() == ir::OP_SIMD_ANY ||
+         insn.getOpcode() == ir::OP_SIMD_ALL ||
+         insn.getOpcode() == ir::OP_ELSE)
+        return false;
+
+      // Most QWord (long) instructions introduce some CMP, or expand into
+      // more than 10 actual instructions at a later stage.
+      if (hasQWord(insn))
+        return false;
+
+      // An unaligned load may introduce a CMP instruction.
+      if ( insn.isMemberOf<ir::LoadInstruction>()) {
+        const ir::LoadInstruction &ld = ir::cast<ir::LoadInstruction>(insn);
+        if (!ld.isAligned())
+          return false;
+      }
+    }
+
+    // A predicated BRA with an external flag (child[0] == NULL) would generate an extra CMP instruction,
+    // so return false to keep the if/endif.
+    if((insnDAG[insnNum-1]->insn.isMemberOf<ir::BranchInstruction>())){ // NOTE(review): insnNum - 1 wraps if insnNum == 0 - confirm callers never pass 0
+      if (insnDAG[insnNum-1]->insn.getOpcode() == ir::OP_BRA) {
+        const ir::BranchInstruction &insn = ir::cast<ir::BranchInstruction>(insnDAG[insnNum-1]->insn);
+        if(insn.isPredicated() && insnDAG[insnNum-1]->child[0] == NULL){
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+
   uint32_t Selection::Opaque::buildBasicBlockDAG(const ir::BasicBlock &bb)
   {
     using namespace ir;
@@ -1503,9 +1594,10 @@ namespace gbe
           // Check whether this bool is used as a normal source
           // oprand other than BRA/SEL.
           if (getRegisterFamily(reg) == FAMILY_BOOL) {
-            if (insn.getOpcode() != OP_BRA &&
+            if ((insn.getOpcode() != OP_BRA &&
                  (insn.getOpcode() != OP_SEL ||
-                   (insn.getOpcode() == OP_SEL && srcID != 0)))
+                 (insn.getOpcode() == OP_SEL && srcID != 0))) ||
+               (isScalarReg(reg)))
               child->computeBool = true;
           }
           child->isUsed = true;
@@ -1534,10 +1626,16 @@ namespace gbe
   {
     // Bottom up code generation
     bool needEndif = this->block->hasBranch == false && !this->block->hasBarrier;
-
-    if(needEndif) {
-      const ir::BasicBlock *next = bb.getNextBlock();
-      this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+    needEndif = needEndif && bb.needEndif;
+    this->block->removeSimpleIfEndif = insnNum < 10 && isSimpleBlock(bb, insnNum);
+    if (needEndif && !this->block->removeSimpleIfEndif) {
+      if(!bb.needIf) // this basic block is the exit of a structure
+        this->ENDIF(GenRegister::immd(0), bb.endifLabel, bb.endifLabel);
+      else {
+        const ir::BasicBlock *next = bb.getNextBlock();
+        this->ENDIF(GenRegister::immd(0), next->getLabelIndex());
+        needEndif = false;
+      }
     }
 
     for (int32_t insnID = insnNum-1; insnID >= 0; --insnID) {
@@ -1551,6 +1649,13 @@ namespace gbe
 
         // Start a new code fragment
         this->startBackwardGeneration();
+
+        if(this->block->removeSimpleIfEndif){
+          this->push();
+            this->curr.predicate = GEN_PREDICATE_NORMAL;
+            this->curr.flag = 0;
+            this->curr.subFlag = 0;
+        }
         // If there is no branch at the end of this block.
 
         // Try all the patterns from best to worst
@@ -1560,6 +1665,13 @@ namespace gbe
           ++it;
         } while (it != end);
         GBE_ASSERT(it != end);
+
+        if(this->block->removeSimpleIfEndif){
+            this->curr.predicate = GEN_PREDICATE_NONE;
+            this->curr.flag = 0;
+            this->curr.subFlag = 0;
+          this->pop();
+        }
         // If we are in if/endif fix mode, and this block is
         // large enough, we need to insert endif/if pair to eliminate
         // the too long if/endif block.
@@ -1575,7 +1687,6 @@ namespace gbe
           this->pop();
           this->block->isLargeBlock = true;
         }
-
         // Output the code in the current basic block
         this->endBackwardGeneration();
       }
@@ -1639,6 +1750,10 @@ namespace gbe
     this->opaque->setPatchSLMAddr(true);
   }
 
+  Selection8::Selection8(GenContext &ctx) : Selection(ctx) { // same construction as Selection, plus the flag below
+    this->opaque->setHas32X32Mul(true); // makes Opaque::has32X32Mul() return true for this selection
+  }
+
   void Selection::Opaque::TYPED_WRITE(GenRegister *msgs, uint32_t msgNum,
                                       uint32_t bti, bool is3D) {
     uint32_t elemID = 0;
@@ -1833,7 +1948,7 @@ namespace gbe
     static ir::Type getType(const ir::Opcode opcode, const ir::Type insnType) {
       if (insnType == ir::TYPE_S64 || insnType == ir::TYPE_U64 || insnType == ir::TYPE_S8 || insnType == ir::TYPE_U8)
         return insnType;
-      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL)
+      if (opcode == ir::OP_FBH || opcode == ir::OP_FBL || opcode == ir::OP_CBIT)
         return ir::TYPE_U32;
       if (insnType == ir::TYPE_S16 || insnType == ir::TYPE_U16)
         return insnType;
@@ -1867,7 +1982,7 @@ namespace gbe
           case ir::OP_MOV:
             if (dst.isdf()) {
               ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
-              sel.MOV_DF(dst, src, sel.selReg(r));
+              sel.MOV_DF(dst, src, sel.selReg(r, ir::TYPE_U64));
             } else {
               sel.push();
                 auto dag = sel.regDAG[insn.getDst(0)];
@@ -1887,6 +2002,7 @@ namespace gbe
           case ir::OP_RNDZ: sel.RNDZ(dst, src); break;
           case ir::OP_FBH: sel.FBH(dst, src); break;
           case ir::OP_FBL: sel.FBL(dst, src); break;
+          case ir::OP_CBIT: sel.CBIT(dst, src); break;
           case ir::OP_COS: sel.MATH(dst, GEN_MATH_FUNCTION_COS, src); break;
           case ir::OP_SIN: sel.MATH(dst, GEN_MATH_FUNCTION_SIN, src); break;
           case ir::OP_LOG: sel.MATH(dst, GEN_MATH_FUNCTION_LOG, src); break;
@@ -2253,11 +2369,27 @@ namespace gbe
           break;
          }
         case OP_UPSAMPLE_SHORT:
-          sel.UPSAMPLE_SHORT(dst, src0, src1);
+        {
+          dst = GenRegister::retype(sel.unpacked_uw(dst.reg()), GEN_TYPE_B);
+          src0 = GenRegister::retype(sel.unpacked_uw(src0.reg()), GEN_TYPE_B);
+          src1 = GenRegister::retype(sel.unpacked_uw(src1.reg()), GEN_TYPE_B);
+          sel.MOV(dst, src1);
+          dst.subphysical = 1;
+          dst = dst.offset(dst, 0, typeSize(GEN_TYPE_B));
+          sel.MOV(dst, src0);
           break;
+        }
         case OP_UPSAMPLE_INT:
-          sel.UPSAMPLE_INT(dst, src0, src1);
+        {
+          dst = sel.unpacked_uw(dst.reg());
+          src0 = sel.unpacked_uw(src0.reg());
+          src1 = sel.unpacked_uw(src1.reg());
+          sel.MOV(dst, src1);
+          dst.subphysical = 1;
+          dst = dst.offset(dst, 0, typeSize(GEN_TYPE_W));
+          sel.MOV(dst, src0);
           break;
+        }
         case OP_UPSAMPLE_LONG:
           sel.UPSAMPLE_LONG(dst, src0, src1);
           break;
@@ -2416,18 +2548,29 @@ namespace gbe
       using namespace ir;
       const ir::BinaryInstruction &insn = cast<ir::BinaryInstruction>(dag.insn);
       const Type type = insn.getType();
-      if (type == TYPE_U32 || type == TYPE_S32) {
-        sel.push();
-          if (sel.isScalarReg(insn.getDst(0)) == true) {
-            sel.curr.execWidth = 1;
-            sel.curr.predicate = GEN_PREDICATE_NONE;
-            sel.curr.noMask = 1;
-          }
-        const uint32_t simdWidth = sel.curr.execWidth;
+      if (type != TYPE_U32 && type != TYPE_S32)
+        return false;
 
-        GenRegister dst  = sel.selReg(insn.getDst(0), type);
-        GenRegister src0 = sel.selReg(insn.getSrc(0), type);
-        GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+      GenRegister dst  = sel.selReg(insn.getDst(0), type);
+      GenRegister src0 = sel.selReg(insn.getSrc(0), type);
+      GenRegister src1 = sel.selReg(insn.getSrc(1), type);
+
+      sel.push();
+      if (sel.has32X32Mul()) {
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+        sel.MUL(dst, src0, src1);
+      } else {
+        if (sel.isScalarReg(insn.getDst(0)) == true) {
+          sel.curr.execWidth = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.noMask = 1;
+        }
+
+        const int simdWidth = sel.curr.execWidth;
 
         // Either left part of the 16-wide register or just a simd 8 register
         dst  = GenRegister::retype(dst,  GEN_TYPE_D);
@@ -2470,13 +2613,12 @@ namespace gbe
           } else
             sel.MOV(GenRegister::retype(GenRegister::next(dst), GEN_TYPE_F), GenRegister::acc());
         }
+      }
+      sel.pop();
 
-        sel.pop();
-        // All children are marked as root
-        markAllChildren(dag);
-        return true;
-      } else
-        return false;
+      // All children are marked as root
+      markAllChildren(dag);
+      return true;
     }
   };
 
@@ -2642,7 +2784,7 @@ namespace gbe
         case TYPE_S16: sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
         case TYPE_U8:  sel.MOV(dst, GenRegister::immuw(imm.getIntegerValue())); break;
         case TYPE_S8:  sel.MOV(dst, GenRegister::immw(imm.getIntegerValue())); break;
-        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD))); break;
+        case TYPE_DOUBLE: sel.LOAD_DF_IMM(dst, GenRegister::immdf(imm.getDoubleValue()), sel.selReg(sel.reg(FAMILY_QWORD), TYPE_U64)); break;
         case TYPE_S64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
         case TYPE_U64: sel.LOAD_INT64_IMM(dst, GenRegister::immint64(imm.getIntegerValue())); break;
         default: NOT_SUPPORTED;
@@ -2782,11 +2924,11 @@ namespace gbe
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
       GBE_ASSERT(bti.count == 1);
-      GenRegister dst[valueNum];
+      vector<GenRegister> dst(valueNum);
       GenRegister tmpAddr = getRelativeAddress(sel, addr, insn.getAddressSpace(), bti.bti[0]);
       for ( uint32_t dstID = 0; dstID < valueNum; ++dstID)
         dst[dstID] = sel.selReg(insn.getValue(dstID), ir::TYPE_U64);
-      sel.READ64(tmpAddr, dst, valueNum, bti.bti[0]);
+      sel.READ64(tmpAddr, dst.data(), valueNum, bti.bti[0]);
     }
 
     void readByteAsDWord(Selection::Opaque &sel,
@@ -2798,12 +2940,11 @@ namespace gbe
     {
       using namespace ir;
         Register tmpReg = sel.reg(FAMILY_DWORD, simdWidth == 1);
-        GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD, simdWidth == 1));
+        GenRegister tmpAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
         GenRegister tmpData = GenRegister::udxgrf(simdWidth, tmpReg);
         // Get dword aligned addr
         sel.push();
           if (simdWidth == 1) {
-            sel.curr.execWidth = 1;
             sel.curr.noMask = 1;
           }
           sel.AND(tmpAddr, GenRegister::retype(address,GEN_TYPE_UD), GenRegister::immud(0xfffffffc));
@@ -2827,11 +2968,96 @@ namespace gbe
         sel.pop();
     }
 
-    void emitByteGather(Selection::Opaque &sel,
-                        const ir::LoadInstruction &insn,
-                        const uint32_t elemSize,
-                        GenRegister address,
-                        ir::BTI bti) const
+    // Load path for a dword-aligned address: read whole dwords with readDWord, then unpack them into the destination values.
+    void emitAlignedByteGather(Selection::Opaque &sel,
+                               const ir::LoadInstruction &insn,
+                               const uint32_t elemSize, // NOTE(review): not referenced in this body; typeSize is used instead
+                               GenRegister address,
+                               ir::BTI bti) const
+    {
+      using namespace ir;
+      const uint32_t valueNum = insn.getValueNum();
+      const uint32_t simdWidth = sel.isScalarReg(insn.getValue(0)) ?
+                                 1 : sel.ctx.getSimdWidth(); // width 1 when the first loaded value is a scalar register
+      RegisterFamily family = getFamily(insn.getValueType());
+
+      vector<GenRegister> dst(valueNum);
+      const uint32_t typeSize = getFamilySize(family);
+
+      for(uint32_t i = 0; i < valueNum; i++)
+        dst[i] = sel.selReg(insn.getValue(i), getType(family));
+
+      uint32_t tmpRegNum = (typeSize*valueNum + 3) / 4; // typeSize*valueNum divided by 4, rounded up
+      vector<GenRegister> tmp(tmpRegNum);
+      vector<GenRegister> tmp2(tmpRegNum);
+      vector<Register> tmpReg(tmpRegNum);
+      for(uint32_t i = 0; i < tmpRegNum; i++) {
+        tmpReg[i] = sel.reg(FAMILY_DWORD);
+        tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, tmpReg[i]); // tmp and tmp2 describe the same temporaries
+      }
+
+      readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+
+      for(uint32_t i = 0; i < tmpRegNum; i++) {
+        unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+                               4/typeSize : (valueNum - i * (4 / typeSize)); // min(values still to unpack, 4/typeSize)
+        sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], typeSize, elemNum);
+      }
+    }
+
+    // Gather the effective data from the tmp vector into the effectData vector (tmp holds one more dword than effectData).
+    //  x x d0 d1 | d2 d3 d4 d5 | ... ==> d0 d1 d2 d3 | d4 d5 ...
+    void getEffectByteData(Selection::Opaque &sel,
+                           vector<GenRegister> &effectData,
+                           vector<GenRegister> &tmp,
+                           uint32_t effectDataNum,
+                           const GenRegister &address,
+                           uint32_t simdWidth) const
+    {
+      using namespace ir;
+      GBE_ASSERT(effectData.size() == effectDataNum);
+      GBE_ASSERT(tmp.size() == effectDataNum + 1);
+      sel.push();
+        Register alignedFlag = sel.reg(FAMILY_BOOL); // despite the name, set below when shiftH != 32
+        GenRegister shiftL = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        Register shiftHReg = sel.reg(FAMILY_DWORD);
+        GenRegister shiftH = GenRegister::udxgrf(simdWidth, shiftHReg);
+        sel.push();
+          if (simdWidth == 1)
+            sel.curr.noMask = 1;
+          sel.AND(shiftL, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(0x3)); // low two address bits
+          sel.SHL(shiftL, shiftL, GenRegister::immud(0x3)); // times 8
+          sel.ADD(shiftH, GenRegister::negate(shiftL), GenRegister::immud(32)); // shiftH = 32 - shiftL
+          sel.curr.physicalFlag = 0;
+          sel.curr.modFlag = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.curr.flagIndex = (uint16_t)alignedFlag;
+          sel.CMP(GEN_CONDITIONAL_NEQ, GenRegister::unpacked_uw(shiftHReg), GenRegister::immuw(32)); // compares shiftH against 32 with NEQ
+        sel.pop();
+
+        sel.curr.noMask = 1;
+        for(uint32_t i = 0; i < effectDataNum; i++) {
+          GenRegister tmpH = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+          GenRegister tmpL = effectData[i];
+          sel.SHR(tmpL, tmp[i], shiftL); // tmpL is the effectData[i] register descriptor
+          sel.push();
+            // tmpH only needs to be merged in when the address is not aligned.
+            sel.curr.modFlag = 0;
+            sel.curr.physicalFlag = 0;
+            sel.curr.flagIndex = (uint16_t)alignedFlag;
+            sel.curr.predicate = GEN_PREDICATE_NORMAL;
+            sel.SHL(tmpH, tmp[i + 1], shiftH);
+            sel.OR(effectData[i], tmpL, tmpH);
+          sel.pop();
+        }
+      sel.pop();
+    }
+
+    void emitUnalignedByteGather(Selection::Opaque &sel,
+                                 const ir::LoadInstruction &insn,
+                                 const uint32_t elemSize,
+                                 GenRegister address,
+                                 ir::BTI bti) const
     {
       using namespace ir;
       const uint32_t valueNum = insn.getValueNum();
@@ -2846,17 +3072,47 @@ namespace gbe
         for(uint32_t i = 0; i < valueNum; i++)
           dst[i] = sel.selReg(insn.getValue(i), getType(family));
 
-        uint32_t tmpRegNum = typeSize*valueNum / 4;
-        vector<GenRegister> tmp(tmpRegNum);
-        vector<GenRegister> tmp2(tmpRegNum);
-        for(uint32_t i = 0; i < tmpRegNum; i++) {
+        uint32_t effectDataNum = (typeSize*valueNum + 3) / 4;
+        vector<GenRegister> tmp(effectDataNum + 1);
+        vector<GenRegister> tmp2(effectDataNum + 1);
+        vector<GenRegister> effectData(effectDataNum);
+        for(uint32_t i = 0; i < effectDataNum + 1; i++)
           tmp2[i] = tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-        }
 
-        readDWord(sel, tmp, tmp2, address, tmpRegNum, insn.getAddressSpace(), bti);
+        GenRegister alignedAddr = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+        sel.push();
+          if (simdWidth == 1)
+            sel.curr.noMask = 1;
+          sel.AND(alignedAddr, GenRegister::retype(address, GEN_TYPE_UD), GenRegister::immud(~0x3));
+        sel.pop();
 
-        for(uint32_t i = 0; i < tmpRegNum; i++) {
-          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, tmp[i], 4/typeSize);
+        uint32_t remainedReg = effectDataNum + 1;
+        uint32_t pos = 0;
+        do {
+          uint32_t width = remainedReg > 4 ? 4 : remainedReg;
+          vector<GenRegister> t1(tmp.begin() + pos, tmp.begin() + pos + width);
+          vector<GenRegister> t2(tmp2.begin() + pos, tmp2.begin() + pos + width);
+          if (pos != 0) {
+            sel.push();
+              if (simdWidth == 1)
+                sel.curr.noMask = 1;
+              sel.ADD(alignedAddr, alignedAddr, GenRegister::immud(pos * 4));
+            sel.pop();
+          }
+          readDWord(sel, t1, t2, alignedAddr, width, insn.getAddressSpace(), bti);
+          remainedReg -= width;
+          pos += width;
+        } while(remainedReg);
+
+        for(uint32_t i = 0; i < effectDataNum; i++)
+          effectData[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
+
+        getEffectByteData(sel, effectData, tmp, effectDataNum, address, simdWidth);
+
+        for(uint32_t i = 0; i < effectDataNum; i++) {
+          unsigned int elemNum = (valueNum - i * (4 / typeSize)) > 4/typeSize ?
+                                 4/typeSize : (valueNum - i * (4 / typeSize));
+          sel.UNPACK_BYTE(dst.data() + i * 4/typeSize, effectData[i], typeSize, elemNum);
         }
       } else {
         GBE_ASSERT(insn.getValueNum() == 1);
@@ -2938,17 +3194,19 @@ namespace gbe
           this->emitRead64(sel, insn, address, bti);
         else if(insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
           this->emitDWordGather(sel, insn, address, bti);
-        else {
-          this->emitByteGather(sel, insn, elemSize, address, bti);
-        }
+        else if (insn.isAligned() == true)
+          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+        else
+          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
       } else {
         if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_QWORD)
           this->emitRead64(sel, insn, address, bti);
         else if (insn.isAligned() == true && elemSize == GEN_BYTE_SCATTER_DWORD)
           this->emitUntypedRead(sel, insn, address, bti);
-        else {
-          this->emitByteGather(sel, insn, elemSize, address, bti);
-        }
+        else if (insn.isAligned())
+          this->emitAlignedByteGather(sel, insn, elemSize, address, bti);
+        else
+          this->emitUnalignedByteGather(sel, insn, elemSize, address, bti);
       }
       return true;
     }
@@ -2983,11 +3241,11 @@ namespace gbe
       /* XXX support scalar only right now. */
       GBE_ASSERT(valueNum == 1);
       addr = GenRegister::retype(addr, GEN_TYPE_UD);
-      GenRegister src[valueNum];
+      vector<GenRegister> src(valueNum);
 
       for (uint32_t valueID = 0; valueID < valueNum; ++valueID)
         src[valueID] = sel.selReg(insn.getValue(valueID), ir::TYPE_U64);
-      sel.WRITE64(addr, src, valueNum, bti);
+      sel.WRITE64(addr, src.data(), valueNum, bti);
     }
 
     void emitByteScatter(Selection::Opaque &sel,
@@ -3016,7 +3274,7 @@ namespace gbe
         vector<GenRegister> tmp(tmpRegNum);
         for(uint32_t i = 0; i < tmpRegNum; i++) {
           tmp[i] = GenRegister::udxgrf(simdWidth, sel.reg(FAMILY_DWORD));
-          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, 4/typeSize);
+          sel.PACK_BYTE(tmp[i], value.data() + i * 4/typeSize, typeSize, 4/typeSize);
         }
 
         sel.UNTYPED_WRITE(addr, tmp.data(), tmpRegNum, bti);
@@ -3429,7 +3687,7 @@ namespace gbe
       } else if ((dst.isdf() && srcType == ir::TYPE_FLOAT) ||
                  (src.isdf() && dstType == ir::TYPE_FLOAT)) {
         ir::Register r = sel.reg(ir::RegisterFamily::FAMILY_QWORD);
-        sel.MOV_DF(dst, src, sel.selReg(r));
+        sel.MOV_DF(dst, src, sel.selReg(r, TYPE_U64));
       } else if (dst.isint64()) {
         switch(src.type) {
           case GEN_TYPE_F:
@@ -3537,7 +3795,16 @@ namespace gbe
         sel.curr.physicalFlag = 0;
         sel.curr.flagIndex = (uint16_t) pred;
         sel.curr.predicate = GEN_PREDICATE_NORMAL;
-        if (!dag0)
+        // FIXME: in general, if the flag is a uniform flag,
+        // we should treat that flag as an extern flag, as we
+        // never generate a uniform physical flag, because we can
+        // never predict which channel is active when this
+        // flag is used.
+        // We need to concentrate this logic in the modFlag bit:
+        // if an instruction has that bit, it will generate a physical
+        // flag, otherwise it will not. But currently modFlag is
+        // just a hint. We need to fix it in the future.
+        if (!dag0 || (sel.isScalarReg(dag0->insn.getDst(0))))
           sel.curr.externFlag = 1;
         if(type == ir::TYPE_S64 || type == ir::TYPE_U64)
           sel.SEL_INT64(dst, src0, src1);
@@ -3601,6 +3868,9 @@ namespace gbe
       GBE_ASSERTM(label < GEN_MAX_LABEL, "We reached the maximum label number which is reserved for barrier handling");
       sel.LABEL(label);
 
+      if(!insn.getParent()->needIf)
+        return true;
+
       // Do not emit any code for the "returning" block. There is no need for it
       if (insn.getParent() == &sel.ctx.getFunction().getBottomBlock())
         return true;
@@ -3669,10 +3939,17 @@ namespace gbe
             sel.JMPI(GenRegister::immd(0), jip, label);
           sel.pop();
         }
-        sel.push();
-          sel.curr.predicate = GEN_PREDICATE_NORMAL;
-          sel.IF(GenRegister::immd(0), sel.block->endifLabel, sel.block->endifLabel);
-        sel.pop();
+        if(!sel.block->removeSimpleIfEndif){
+          sel.push();
+            sel.curr.predicate = GEN_PREDICATE_NORMAL;
+            if(!insn.getParent()->needEndif && insn.getParent()->needIf) {
+              ir::LabelIndex label = insn.getParent()->endifLabel;
+              sel.IF(GenRegister::immd(0), label, label);
+            }
+            else
+              sel.IF(GenRegister::immd(0), sel.block->endifLabel, sel.block->endifLabel);
+          sel.pop();
+        }
       }
 
       return true;
@@ -3686,11 +3963,10 @@ namespace gbe
     {
       using namespace ir;
       GenRegister msgPayloads[4];
-      GenRegister dst[insn.getDstNum()];
+      vector<GenRegister> dst(insn.getDstNum());
       uint32_t srcNum = insn.getSrcNum();
       uint32_t valueID = 0;
       uint32_t msgLen = 0;
-
       for (valueID = 0; valueID < insn.getDstNum(); ++valueID)
         dst[valueID] = sel.selReg(insn.getDst(valueID), insn.getDstType());
 
@@ -3727,7 +4003,7 @@ namespace gbe
       }
       uint32_t sampler = insn.getSamplerIndex();
 
-      sel.SAMPLE(dst, insn.getDstNum(), msgPayloads, msgLen, bti, sampler, insn.getSamplerOffset() != 0, false);
+      sel.SAMPLE(dst.data(), insn.getDstNum(), msgPayloads, msgLen, bti, sampler, insn.getSamplerOffset() != 0, false);
       return true;
     }
     DECL_CTOR(SampleInstruction, 1, 1);
@@ -3837,6 +4113,70 @@ namespace gbe
     DECL_CTOR(GetImageInfoInstruction, 1, 1);
   };
 
+  class ReadARFInstructionPattern : public SelectionPattern // selection pattern registered for ir::OP_READ_ARF
+  {
+  public:
+    ReadARFInstructionPattern(void) : SelectionPattern(1,1) {
+      this->opcodes.push_back(ir::OP_READ_ARF); // the only opcode this pattern registers
+    }
+
+    INLINE uint32_t getRegNum(ir::ARFRegister arf) const { // register number handed to the GenRegister built in emit()
+      if (arf == ir::ARF_TM) {
+        return 0xc0; // value used for ARF_TM
+      } else {
+        GBE_ASSERT(0); // no other ARF register is handled here
+        return 0;
+      }
+    }
+
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const { // emits one READ_ARF into the instruction's first destination
+      using namespace ir;
+      const ir::ReadARFInstruction &insn = cast<ir::ReadARFInstruction>(dag.insn);
+      GenRegister dst;
+      dst = sel.selReg(insn.getDst(0), insn.getType());
+
+      sel.push(); // state changes below are scoped by the matching pop()
+        sel.curr.predicate = GEN_PREDICATE_NONE; // unpredicated
+        sel.curr.noMask = 1;
+        sel.curr.execWidth = 8; // fixed width of 8, matching the 8/8/1 region of the source below
+        sel.READ_ARF(dst, GenRegister(GEN_ARCHITECTURE_REGISTER_FILE,
+                      getRegNum(insn.getARFRegister()),
+                      0,
+                      getGenType(insn.getType()),
+                      GEN_VERTICAL_STRIDE_8,
+                      GEN_WIDTH_8,
+                      GEN_HORIZONTAL_STRIDE_1));
+      sel.pop();
+      return true; // always reports the pattern as emitted
+    }
+  };
+
+  /*! Pattern for ir::OP_REGION: MOVs an offset sub-region of the source register into the destination */
+  class RegionInstructionPattern : public SelectionPattern
+  {
+  public:
+    RegionInstructionPattern(void) : SelectionPattern(1,1) {
+      this->opcodes.push_back(ir::OP_REGION); // the only opcode this pattern registers
+    }
+    INLINE bool emit(Selection::Opaque &sel, SelectionDAG &dag) const {
+      using namespace ir;
+      const ir::RegionInstruction &insn = cast<ir::RegionInstruction>(dag.insn);
+      GenRegister dst, src;
+      dst = sel.selReg(insn.getDst(0), ir::TYPE_U32); // destination is always selected as U32
+      src = GenRegister::ud1grf(insn.getSrc(0));
+      src.subphysical = 1;
+      src = GenRegister::offset(src, 0, insn.getOffset()*4); // presumably a byte offset of getOffset() dwords - TODO confirm against GenRegister::offset
+
+      sel.push(); // state changes below are scoped by the matching pop()
+        sel.curr.noMask = 1;
+        sel.curr.predicate = GEN_PREDICATE_NONE; // unpredicated
+        sel.MOV(dst, src);
+      sel.pop();
+      markAllChildren(dag);
+      return true; // always reports the pattern as emitted
+    }
+  };
+
   /*! Branch instruction pattern */
   class BranchInstructionPattern : public SelectionPattern
   {
@@ -3870,16 +4210,22 @@ namespace gbe
           sel.curr.predicate = GEN_PREDICATE_NORMAL;
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          if (!sel.block->hasBarrier)
+          if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif)
             sel.ENDIF(GenRegister::immd(0), nextLabel);
           sel.block->endifOffset = -1;
         sel.pop();
       } else {
         // Update the PcIPs
         const LabelIndex jip = sel.ctx.getLabelIndex(&insn);
-        sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
-        if (!sel.block->hasBarrier)
-          sel.ENDIF(GenRegister::immd(0), nextLabel);
+        if(insn.getParent()->needEndif)
+          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+
+        if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif) {
+          if(insn.getParent()->needEndif && !insn.getParent()->needIf)
+            sel.ENDIF(GenRegister::immd(0), insn.getParent()->endifLabel, insn.getParent()->endifLabel);
+          else if(insn.getParent()->needEndif)
+            sel.ENDIF(GenRegister::immd(0), nextLabel);
+        }
         sel.block->endifOffset = -1;
         if (nextLabel == jip) return;
         // Branch to the jump target
@@ -3922,7 +4268,7 @@ namespace gbe
           sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
           sel.block->endifOffset = -1;
           sel.curr.predicate = GEN_PREDICATE_NONE;
-          if (!sel.block->hasBarrier)
+          if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif)
             sel.ENDIF(GenRegister::immd(0), next);
           sel.curr.execWidth = 1;
           if (simdWidth == 16)
@@ -3935,10 +4281,15 @@ namespace gbe
       } else {
         const LabelIndex next = bb.getNextBlock()->getLabelIndex();
         // Update the PcIPs
-        sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
+        if(insn.getParent()->needEndif)
+          sel.MOV(ip, GenRegister::immuw(uint16_t(dst)));
         sel.block->endifOffset = -1;
-        if (!sel.block->hasBarrier)
-          sel.ENDIF(GenRegister::immd(0), next);
+        if (!sel.block->hasBarrier && !sel.block->removeSimpleIfEndif) {
+          if(insn.getParent()->needEndif && !insn.getParent()->needIf)
+            sel.ENDIF(GenRegister::immd(0), insn.getParent()->endifLabel, insn.getParent()->endifLabel);
+          else if(insn.getParent()->needEndif)
+            sel.ENDIF(GenRegister::immd(0), next);
+        }
         // Branch to the jump target
         sel.push();
           sel.curr.execWidth = 1;
@@ -3971,6 +4322,46 @@ namespace gbe
         else
           this->emitForwardBranch(sel, insn, dst, src);
         sel.pop();
+      }
+      else if(opcode == OP_IF) {
+        const Register pred = insn.getPredicateIndex();
+        const LabelIndex jip = insn.getLabelIndex();
+        LabelIndex uip;
+        if(insn.getParent()->matchingEndifLabel != 0)
+          uip = insn.getParent()->matchingEndifLabel;
+        else
+          uip = jip;
+        sel.push();
+          sel.curr.physicalFlag = 0;
+          sel.curr.flagIndex = (uint64_t)pred;
+          sel.curr.externFlag = 1;
+          sel.curr.inversePredicate = insn.getInversePredicated();
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.IF(GenRegister::immd(0), jip, uip);
+          sel.curr.inversePredicate = 0;
+        sel.pop();
+      } else if(opcode == OP_ENDIF) {
+        const LabelIndex label = insn.getLabelIndex();
+        sel.push();
+          sel.curr.noMask = 1;
+          sel.curr.predicate = GEN_PREDICATE_NONE;
+          sel.ENDIF(GenRegister::immd(0), label, label);
+        sel.pop();
+      } else if(opcode == OP_ELSE) {
+        const LabelIndex label = insn.getLabelIndex();
+        sel.ELSE(GenRegister::immd(0), label, insn.getParent()->thisElseLabel);
+      } else if(opcode == OP_WHILE) {
+        const Register pred = insn.getPredicateIndex();
+        const LabelIndex jip = insn.getLabelIndex();
+        sel.push();
+          sel.curr.physicalFlag = 0;
+          sel.curr.flagIndex = (uint64_t)pred;
+          sel.curr.externFlag = 1;
+          sel.curr.inversePredicate = insn.getInversePredicated();
+          sel.curr.predicate = GEN_PREDICATE_NORMAL;
+          sel.WHILE(GenRegister::immd(0), jip);
+          sel.curr.inversePredicate = 0;
+        sel.pop();
       } else
         NOT_IMPLEMENTED;
 
@@ -4009,6 +4400,8 @@ namespace gbe
     this->insert<SelectModifierInstructionPattern>();
     this->insert<SampleInstructionPattern>();
     this->insert<GetImageInfoInstructionPattern>();
+    this->insert<ReadARFInstructionPattern>();
+    this->insert<RegionInstructionPattern>();
 
     // Sort all the patterns with the number of instructions they output
     for (uint32_t op = 0; op < ir::OP_INVALID; ++op)
diff --git a/backend/src/backend/gen_insn_selection.hpp b/backend/src/backend/gen_insn_selection.hpp
index 9bcce6f..7fef11f 100644
--- a/backend/src/backend/gen_insn_selection.hpp
+++ b/backend/src/backend/gen_insn_selection.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -233,6 +233,7 @@ namespace gbe
     int endifOffset;
     bool hasBarrier;
     bool hasBranch;
+    bool removeSimpleIfEndif;
   };
 
   /*! Owns the selection engine */
@@ -284,6 +285,13 @@ namespace gbe
       Selection75(GenContext &ctx);
   };
 
+  class Selection8: public Selection // declared next to Selection75; presumably the Gen8 variant - confirm against gen8_context
+  {
+    public:
+      /*! Initialize internal structures used for the selection */
+      Selection8(GenContext &ctx);
+  };
+
 } /* namespace gbe */
 
 #endif /*  __GEN_INSN_SELECTION_HPP__ */
diff --git a/backend/src/backend/gen_insn_selection.hxx b/backend/src/backend/gen_insn_selection.hxx
index ddc9d5e..da8086e 100644
--- a/backend/src/backend/gen_insn_selection.hxx
+++ b/backend/src/backend/gen_insn_selection.hxx
@@ -68,12 +68,11 @@ DECL_SELECTION_IR(MUL_HI, BinaryWithTempInstruction)
 DECL_SELECTION_IR(I64_MUL_HI, I64MULHIInstruction)
 DECL_SELECTION_IR(FBH, UnaryInstruction)
 DECL_SELECTION_IR(FBL, UnaryInstruction)
+DECL_SELECTION_IR(CBIT, UnaryInstruction)
 DECL_SELECTION_IR(HADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(RHADD, BinaryWithTempInstruction)
 DECL_SELECTION_IR(I64HADD, I64HADDInstruction)
 DECL_SELECTION_IR(I64RHADD, I64RHADDInstruction)
-DECL_SELECTION_IR(UPSAMPLE_SHORT, BinaryInstruction)
-DECL_SELECTION_IR(UPSAMPLE_INT, BinaryInstruction)
 DECL_SELECTION_IR(UPSAMPLE_LONG, BinaryInstruction)
 DECL_SELECTION_IR(CONVI_TO_I64, UnaryWithTempInstruction)
 DECL_SELECTION_IR(CONVI64_TO_I, UnaryInstruction)
@@ -84,3 +83,6 @@ DECL_SELECTION_IR(BRC, UnaryInstruction)
 DECL_SELECTION_IR(BRD, UnaryInstruction)
 DECL_SELECTION_IR(IF, UnaryInstruction)
 DECL_SELECTION_IR(ENDIF, UnaryInstruction)
+DECL_SELECTION_IR(ELSE, UnaryInstruction)
+DECL_SELECTION_IR(READ_ARF, UnaryInstruction)
+DECL_SELECTION_IR(WHILE, UnaryInstruction)
diff --git a/backend/src/backend/gen_program.cpp b/backend/src/backend/gen_program.cpp
index 5324587..05d830a 100644
--- a/backend/src/backend/gen_program.cpp
+++ b/backend/src/backend/gen_program.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -51,6 +51,7 @@
 #include "backend/gen_program.hpp"
 #include "backend/gen_context.hpp"
 #include "backend/gen75_context.hpp"
+#include "backend/gen8_context.hpp"
 #include "backend/gen_defs.hpp"
 #include "backend/gen/gen_mesa_disasm.h"
 #include "backend/gen_reg_allocation.hpp"
@@ -75,7 +76,7 @@ namespace gbe {
   {}
   GenKernel::~GenKernel(void) { GBE_SAFE_DELETE_ARRAY(insns); }
   const char *GenKernel::getCode(void) const { return (const char*) insns; }
-  const void GenKernel::setCode(const char * ins, size_t size) {
+  void GenKernel::setCode(const char * ins, size_t size) {
     insns = (GenInstruction *)ins;
     insnNum = size / sizeof(GenInstruction);
   }
@@ -89,13 +90,13 @@ namespace gbe {
     char *buf = new char[4096];
     setbuffer(f, buf, 4096);
     GenCompactInstruction * pCom = NULL;
-    GenNativeInstruction nativeInsn;
+    GenInstruction insn[2];
 
     for (uint32_t i = 0; i < insnNum;) {
       pCom = (GenCompactInstruction*)(insns+i);
       if(pCom->bits1.cmpt_control == 1) {
-        decompactInstruction(pCom, &nativeInsn);
-        gen_disasm(f, &nativeInsn, deviceID, 1);
+        decompactInstruction(pCom, &insn);
+        gen_disasm(f, &insn, deviceID, 1);
         i++;
       } else {
         gen_disasm(f, insns+i, deviceID, 0);
@@ -164,6 +165,8 @@ namespace gbe {
       ctx = GBE_NEW(GenContext, unit, name, deviceID, relaxMath);
     } else if (IS_HASWELL(deviceID)) {
       ctx = GBE_NEW(Gen75Context, unit, name, deviceID, relaxMath);
+    } else if (IS_BROADWELL(deviceID)) {
+      ctx = GBE_NEW(Gen8Context, unit, name, deviceID, relaxMath);
     }
     GBE_ASSERTM(ctx != NULL, "Fail to create the gen context\n");
 
@@ -204,7 +207,8 @@ namespace gbe {
 #define DEVICE_MATCH(typeA, src_hw_info) ((IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "IVB")) ||  \
                                       (IS_IVYBRIDGE(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
                                       (IS_BAYTRAIL_T(typeA) && !strcmp(src_hw_info, "BYT")) ||  \
-                                      (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) )
+                                      (IS_HASWELL(typeA) && !strcmp(src_hw_info, "HSW")) ||  \
+                                      (IS_BROADWELL(typeA) && !strcmp(src_hw_info, "BDW")) )
 
   static gbe_program genProgramNewFromBinary(uint32_t deviceID, const char *binary, size_t size) {
     using namespace gbe;
@@ -295,6 +299,10 @@ namespace gbe {
         src_hw_info[0]='H';
         src_hw_info[1]='S';
         src_hw_info[2]='W';
+      }else if(IS_BROADWELL(prog->deviceID)){
+        src_hw_info[0]='B';
+        src_hw_info[1]='D';
+        src_hw_info[2]='W';
       }
       FILL_DEVICE_ID(*binary, src_hw_info);
       memcpy(*binary+BINARY_HEADER_LENGTH, oss.str().c_str(), sz*sizeof(char));
@@ -365,17 +373,7 @@ namespace gbe {
       ((GenProgram*)dst_program)->module = llvm::CloneModule((llvm::Module*)((GenProgram*)src_program)->module);
       errSize = 0;
     }else{
-      //set the global variables and functions to link once to fix redefine.
       llvm::Module* src = (llvm::Module*)((GenProgram*)src_program)->module;
-      for (llvm::Module::global_iterator I = src->global_begin(), E = src->global_end(); I != E; ++I) {
-        I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
-      }
-
-      for (llvm::Module::iterator I = src->begin(), E = src->end(); I != E; ++I) {
-        llvm::Function *F = llvm::dyn_cast<llvm::Function>(I);
-        if (F && isKernelFunction(*F)) continue;
-        I->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
-      }
       llvm::Module* dst = (llvm::Module*)((GenProgram*)dst_program)->module;
       llvm::Linker::LinkModules( dst,
                                  src,
diff --git a/backend/src/backend/gen_program.h b/backend/src/backend/gen_program.h
index 8d37a70..31433f9 100644
--- a/backend/src/backend/gen_program.h
+++ b/backend/src/backend/gen_program.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/backend/gen_program.hpp b/backend/src/backend/gen_program.hpp
index 1b5136e..af1a9fa 100644
--- a/backend/src/backend/gen_program.hpp
+++ b/backend/src/backend/gen_program.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -44,7 +44,7 @@ namespace gbe
     /*! Implements base class */
     virtual const char *getCode(void) const;
     /*! Set the instruction stream (to be implemented) */
-    virtual const void setCode(const char *, size_t size);
+    virtual void setCode(const char *, size_t size);
     /*! Implements get the code size */
     virtual size_t getCodeSize(void) const;
     /*! Implements printStatus*/
@@ -79,7 +79,7 @@ namespace gbe
     GBE_CLASS(GenProgram);
   };
   /*! decompact GEN ASM if it is in compacted format */
-  extern void decompactInstruction(union GenCompactInstruction *p, union GenNativeInstruction *pOut);
+  extern void decompactInstruction(union GenCompactInstruction *p, void *insn);
 } /* namespace gbe */
 
 #endif /* __GBE_GEN_PROGRAM_HPP__ */
diff --git a/backend/src/backend/gen_reg_allocation.cpp b/backend/src/backend/gen_reg_allocation.cpp
index b7fbc93..26078e0 100644
--- a/backend/src/backend/gen_reg_allocation.cpp
+++ b/backend/src/backend/gen_reg_allocation.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -60,7 +60,7 @@ namespace gbe
     const ir::Register getReg() const {
       return (ir::Register)(key & 0xFFFF);
     }
-    const int32_t getMaxID() const {
+    int32_t getMaxID() const {
       return key >> 16;
     }
     uint64_t key;
@@ -194,13 +194,17 @@ namespace gbe
                                    uint32_t regID, bool isSrc,
                                    ir::Type type = ir::TYPE_FLOAT, bool needMov = true) {
       ir::Register reg;
-      if (isSrc)
+      if (isSrc) {
         reg = sel.replaceSrc(insn, regID, type, needMov);
-      else
+        intervals.push_back(reg);
+        intervals[reg].minID = insn->ID - 1;
+        intervals[reg].maxID = insn->ID;
+      } else {
         reg = sel.replaceDst(insn, regID, type, needMov);
-      intervals.push_back(reg);
-      intervals[reg].minID = insn->ID;
-      intervals[reg].maxID = insn->ID;
+        intervals.push_back(reg);
+        intervals[reg].minID = insn->ID;
+        intervals[reg].maxID = insn->ID + 1;
+      }
       return reg;
     }
     /*! Use custom allocator */
@@ -353,7 +357,15 @@ namespace gbe
 
   template <bool sortStartingPoint>
   inline bool cmp(const GenRegInterval *i0, const GenRegInterval *i1) {
-    return sortStartingPoint ? i0->minID < i1->minID : i0->maxID < i1->maxID;
+    if (sortStartingPoint) {
+      if (i0->minID == i1->minID)
+        return (i0->maxID < i1->maxID);
+      return i0->minID < i1->minID;
+    } else {
+      if (i0->maxID == i1->maxID)
+        return (i0->minID < i1->minID);
+      return i0->maxID < i1->maxID;
+    }
   }
 
   bool GenRegAllocator::Opaque::expireGRF(const GenRegInterval &limit) {
@@ -573,6 +585,8 @@ namespace gbe
               // If this is a modFlag on a scalar bool, we need to remove it
               // from the allocated flags map. Then latter, the user could
               // validate the flag from the scalar value correctly.
+              // The reason is that we cannot predict the active channel when we
+              // need to use this flag.
               if (IS_SCALAR_FLAG(insn)) {
                 allocatedFlags.erase(ir::Register(insn.state.flagIndex));
                 continue;
@@ -838,7 +852,7 @@ namespace gbe
       // from the RA map.
       bool success = expireReg(interval.reg);
       GBE_ASSERT(success);
-      success = success;
+      if(!success) return success;
       RA.erase(interval.reg);
     }
     spilledRegs.insert(std::make_pair(interval.reg, spillTag));
@@ -943,15 +957,14 @@ namespace gbe
                                                        uint32_t size,
                                                        uint32_t alignment) {
     uint32_t grfOffset;
-    static uint32_t tick = 0;
     // Doing expireGRF too freqently will cause the post register allocation
     // scheduling very hard. As it will cause a very high register conflict rate.
     // The tradeoff here is to reduce the freqency here. And if we are under spilling
     // then no need to reduce that freqency as the register pressure is the most
     // important factor.
-    if (tick % 12 == 0 || ctx.reservedSpillRegs != 0)
+    if (ctx.regSpillTick % 12 == 0 || ctx.reservedSpillRegs != 0)
       this->expireGRF(interval);
-    tick++;
+    ctx.regSpillTick++;
     // For some scalar byte register, it may be used as a destination register
     // and the source is a scalar Dword. If that is the case, the byte register
     // must get 4byte alignment register offset.
@@ -1065,7 +1078,8 @@ namespace gbe
                        insn.opcode == SEL_OP_JMPI ||
                        insn.state.predicate == GEN_PREDICATE_NONE ||
                        (block.hasBarrier && insn.opcode == SEL_OP_MOV) ||
-                       (insn.state.flag == 0 && insn.state.subFlag == 1)));
+                       (insn.state.flag == 0 && insn.state.subFlag == 1) ||
+                       (block.removeSimpleIfEndif && insn.state.flag == 0 && insn.state.subFlag == 0) ));
         }
         lastID = insnID;
         insnID++;
diff --git a/backend/src/backend/gen_reg_allocation.hpp b/backend/src/backend/gen_reg_allocation.hpp
index e41f503..89dba64 100644
--- a/backend/src/backend/gen_reg_allocation.hpp
+++ b/backend/src/backend/gen_reg_allocation.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/backend/program.cpp b/backend/src/backend/program.cpp
index 787d111..971071e 100644
--- a/backend/src/backend/program.cpp
+++ b/backend/src/backend/program.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -105,6 +105,7 @@ namespace gbe {
 
 #ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_GEN_IR, false);
+  BVAR(OCL_STRICT_CONFORMANCE, false);
 
   bool Program::buildFromLLVMFile(const char *fileName, const void* module, std::string &error, int optLevel) {
     ir::Unit *unit = new ir::Unit();
@@ -112,7 +113,7 @@ namespace gbe {
     if(module){
       cloned_module = llvm::CloneModule((llvm::Module*)module);
     }
-    if (llvmToGen(*unit, fileName, module, optLevel) == false) {
+    if (llvmToGen(*unit, fileName, module, optLevel, OCL_STRICT_CONFORMANCE) == false) {
       if (fileName)
         error = std::string(fileName) + " not found";
       delete unit;
@@ -124,9 +125,11 @@ namespace gbe {
       delete unit;   //clear unit
       unit = new ir::Unit();
       if(cloned_module){
-        llvmToGen(*unit, fileName, cloned_module, 0);  //suppose file exists and llvmToGen will not return false.
+        // Assume the file exists and llvmToGen will not return false.
+        llvmToGen(*unit, fileName, cloned_module, 0, OCL_STRICT_CONFORMANCE);
       }else{
-        llvmToGen(*unit, fileName, module, 0);  //suppose file exists and llvmToGen will not return false.
+        // Assume the file exists and llvmToGen will not return false.
+        llvmToGen(*unit, fileName, module, 0, OCL_STRICT_CONFORMANCE);
       }
     }
     assert(unit->getValid());
@@ -138,8 +141,6 @@ namespace gbe {
     return true;
   }
 
-  BVAR(OCL_STRICT_CONFORMANCE, false);
-
   bool Program::buildFromUnit(const ir::Unit &unit, std::string &error) {
     constantSet = new ir::ConstantSet(unit.getConstantSet());
     const auto &set = unit.getFunctionSet();
@@ -494,38 +495,6 @@ namespace gbe {
 
   /*********************** End of Program class member function *************************/
 
-#define REDEF_MATH_FUNC(x) "#ifdef "#x"\n#undef "#x"\n#endif\n#define "#x" __gen_ocl_internal_fastpath_"#x"\n"
-  std::string ocl_mathfunc_fastpath_str =
-    REDEF_MATH_FUNC(acosh)
-    REDEF_MATH_FUNC(asinh)
-    REDEF_MATH_FUNC(atanh)
-    REDEF_MATH_FUNC(cbrt)
-    REDEF_MATH_FUNC(cos)
-    REDEF_MATH_FUNC(cosh)
-    REDEF_MATH_FUNC(cospi)
-    REDEF_MATH_FUNC(exp)
-    REDEF_MATH_FUNC(exp10)
-    REDEF_MATH_FUNC(expm1)
-    REDEF_MATH_FUNC(fmod)
-    REDEF_MATH_FUNC(hypot)
-    REDEF_MATH_FUNC(ilogb)
-    REDEF_MATH_FUNC(ldexp)
-    REDEF_MATH_FUNC(log)
-    REDEF_MATH_FUNC(log2)
-    REDEF_MATH_FUNC(log10)
-    REDEF_MATH_FUNC(log1p)
-    REDEF_MATH_FUNC(logb)
-    REDEF_MATH_FUNC(remainder)
-    REDEF_MATH_FUNC(rootn)
-    REDEF_MATH_FUNC(sin)
-    REDEF_MATH_FUNC(sincos)
-    REDEF_MATH_FUNC(sinh)
-    REDEF_MATH_FUNC(sinpi)
-    REDEF_MATH_FUNC(tan)
-    REDEF_MATH_FUNC(tanh)
-    "\n"
-  ;
-
   static void programDelete(gbe_program gbeProgram) {
     gbe::Program *program = (gbe::Program*)(gbeProgram);
     GBE_SAFE_DELETE(program);
@@ -538,55 +507,19 @@ namespace gbe {
 
 #ifdef GBE_COMPILER_AVAILABLE
   BVAR(OCL_OUTPUT_BUILD_LOG, false);
-  SVAR(OCL_PCH_PATH, PCH_OBJECT_DIR);
-  SVAR(OCL_PCM_PATH, PCM_OBJECT_DIR);
 
-  static bool buildModuleFromSource(const char* input, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx, std::string options,
-                                    size_t stringSize, char *err, size_t *errSize) {
+  static bool buildModuleFromSource(const char* input, llvm::Module** out_module, llvm::LLVMContext* llvm_ctx,
+                                    std::vector<std::string>& options, size_t stringSize, char *err,
+                                    size_t *errSize) {
     // Arguments to pass to the clang frontend
     vector<const char *> args;
     bool bFastMath = false;
 
-    vector<std::string> useless; //hold substrings to avoid c_str free
-    size_t start = 0, end = 0;
-    /* FIXME
-       clang unsupport options:
-       -cl-denorms-are-zero, -cl-strict-aliasing
-       -cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt
-       all support options, refer to clang/include/clang/Driver/Options.inc
-    */
-    //Handle -cl-opt-disable in llvmToGen, skip here
-    const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
-                                         "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
-    bool useDefaultCLCVersion = true;
-    while (end != std::string::npos) {
-      end = options.find(' ', start);
-      std::string str = options.substr(start, end - start);
-      start = end + 1;
-      if(str.size() == 0)
-        continue;
-      if(str == "-cl-fast-relaxed-math") bFastMath = true;
-      if(unsupportedOptions.find(str) != std::string::npos)
-        continue;
-      if(str.find("-cl-std=") != std::string::npos) {
-        useDefaultCLCVersion = false;
-        if (str == "-cl-std=CL1.1")
-          args.push_back("-D__OPENCL_C_VERSION__=110");
-        else if (str == "-cl-std=CL1.2")
-          args.push_back("-D__OPENCL_C_VERSION__=120");
-        else {
-          if (err && stringSize > 0 && errSize)
-            *errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
-          return false;
-        }
-      }
-      useless.push_back(str);
-      args.push_back(str.c_str());
-    }
-    if (useDefaultCLCVersion) {
-      args.push_back("-D__OPENCL_C_VERSION__=120");
-      args.push_back("-cl-std=CL1.2");
+    for (auto &s : options) {
+      args.push_back(s.c_str());
     }
+
+    args.push_back("-cl-kernel-arg-info");
     args.push_back("-mllvm");
     args.push_back("-inline-threshold=200000");
 #ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
@@ -610,28 +543,21 @@ namespace gbe {
 #endif /* LLVM_VERSION_MINOR <= 2 */
     args.push_back(input);
 
+    args.push_back("-ffp-contract=off");
+
     // The compiler invocation needs a DiagnosticsEngine so it can report problems
     std::string ErrorString;
     llvm::raw_string_ostream ErrorInfo(ErrorString);
     llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> DiagOpts = new clang::DiagnosticOptions();
     DiagOpts->ShowCarets = false;
     DiagOpts->ShowPresumedLoc = true;
-#if LLVM_VERSION_MINOR <= 1
-    args.push_back("-triple");
-    args.push_back("ptx32");
-
-    clang::TextDiagnosticPrinter *DiagClient =
-                             new clang::TextDiagnosticPrinter(ErrorInfo, *DiagOpts)
-    llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
-    clang::DiagnosticsEngine Diags(DiagID, DiagClient);
-#else
-    args.push_back("-ffp-contract=off");
 
+  
     clang::TextDiagnosticPrinter *DiagClient =
                              new clang::TextDiagnosticPrinter(ErrorInfo, &*DiagOpts);
     llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> DiagID(new clang::DiagnosticIDs());
     clang::DiagnosticsEngine Diags(DiagID, &*DiagOpts, DiagClient);
-#endif /* LLVM_VERSION_MINOR <= 1 */
+
     // Create the compiler invocation
     std::unique_ptr<clang::CompilerInvocation> CI(new clang::CompilerInvocation);
     clang::CompilerInvocation::CreateFromArgs(*CI,
@@ -643,11 +569,7 @@ namespace gbe {
     clang::CompilerInstance Clang;
     Clang.setInvocation(CI.release());
     // Get ready to report problems
-#if LLVM_VERSION_MINOR <= 2
-    Clang.createDiagnostics(args.size(), &args[0]);
-#else
     Clang.createDiagnostics(DiagClient, false);
-#endif /* LLVM_VERSION_MINOR <= 2 */
 
     Clang.getDiagnosticOpts().ShowCarets = false;
     if (!Clang.hasDiagnostics())
@@ -656,10 +578,7 @@ namespace gbe {
     // Set Language
     clang::LangOptions & lang_opts = Clang.getLangOpts();
     lang_opts.OpenCL = 1;
-
-    clang::PreprocessorOptions& prep_opt = Clang.getPreprocessorOpts();
-    prep_opt.DisablePCHValidation = 1;
-
+    
     //llvm flags need command line parsing to take effect
     if (!Clang.getFrontendOpts().LLVMArgs.empty()) {
       unsigned NumArgs = Clang.getFrontendOpts().LLVMArgs.size();
@@ -672,32 +591,17 @@ namespace gbe {
       llvm::cl::ParseCommandLineOptions(NumArgs + 1, Args);
       delete [] Args;
     }
-
+  
     // Create an action and make the compiler instance carry it out
     std::unique_ptr<clang::CodeGenAction> Act(new clang::EmitLLVMOnlyAction(llvm_ctx));
-
-    std::string dirs = OCL_PCM_PATH;
-    std::string pcmFileName;
-    std::istringstream idirs(dirs);
-    bool findPcm = false;
-
-    while (getline(idirs, pcmFileName, ':')) {
-      if(access(pcmFileName.c_str(), R_OK) == 0) {
-        findPcm |= true;
-        break;
-      }
-    }
-
-    GBE_ASSERT(findPcm && "Could not find pre compiled module library.\n");
-
-    Clang.getCodeGenOpts().LinkBitcodeFile = pcmFileName;
+    
     auto retVal = Clang.ExecuteAction(*Act);
 
     if (err != NULL) {
       GBE_ASSERT(errSize != NULL);
       *errSize = ErrorString.copy(err, stringSize - 1, 0);
     }
-
+  
     if (err == NULL || OCL_OUTPUT_BUILD_LOG) {
       // flush the error messages to the errs() if there is no
       // error string buffer.
@@ -713,116 +617,99 @@ namespace gbe {
     return true;
   }
 
-  extern std::string ocl_stdlib_str;
 
-  BVAR(OCL_USE_PCH, true);
-  static void processSourceAndOption(const char *source,
+  SVAR(OCL_PCH_PATH, OCL_PCH_OBJECT);
+  SVAR(OCL_HEADER_FILE_DIR, OCL_HEADER_DIR);
+
+  static bool processSourceAndOption(const char *source,
                                      const char *options,
                                      const char *temp_header_path,
-                                     std::string& clOpt,
+                                     std::vector<std::string>& clOpt,
                                      std::string& clName,
-                                     int& optLevel)
+                                     int& optLevel,
+                                     size_t stringSize,
+                                     char *err,
+                                     size_t *errSize)
   {
-    char clStr[] = "/tmp/XXXXXX.cl";
-    int clFd = mkstemps(clStr, 3);
-    clName = std::string(clStr);
-
-    FILE *clFile = fdopen(clFd, "w");
-    FATAL_IF(clFile == NULL, "Failed to open temporary file");
-
-    bool usePCH = OCL_USE_PCH;
+    std::string dirs = OCL_PCH_PATH;
+    std::istringstream idirs(dirs);
+    std::string pchFileName;
     bool findPCH = false;
+    bool invalidPCH = false;
+    size_t start = 0, end = 0;
 
-    /* Because our header file is so big, we want to avoid recompile the header from
-       scratch. We use the PCH support of Clang to save the huge compiling time.
-       We just use the most general build opt to build the PCH header file, so if
-       user pass new build options here, the PCH can not pass the Clang's compitable
-       validating. Clang will do three kinds of compatible check: Language Option,
-       Target Option and Preprocessing Option. Other kinds of options such as the
-       CodeGen options will not affect the AST result, so no need to check.
-
-       According to OpenCL 1.1's spec, the CL build options:
-       -D name=definition
-       If the definition is not used in our header, it is compitable
-
-       -cl-single-precision-constant
-       -cl-denorms-are-zero
-       -cl-std=
-       Language options, really affect.
-
-       -cl-opt-disable
-       -cl-mad-enable
-       -cl-no-signed-zeros
-       -cl-unsafe-math-optimizations
-       -cl-finite-math-only
-       -cl-fast-relaxed-math
-       CodeGen options, not affect
-
-       -Werror
-       -w
-       Our header should not block the compiling because of warning.
-
-       So we just disable the PCH validation of Clang and do the judgement by ourself. */
-
-    /* We always add -cl-kernel-arg-info to the options. This option just generate the arg
-       information for the backend, no other side effect and does not have performance issue. */
-    if (!options || !strstr(const_cast<char *>(options), "-cl-kernel-arg-info"))
-      clOpt += "-cl-kernel-arg-info ";
+    std::string hdirs = OCL_HEADER_FILE_DIR;
+    std::istringstream hidirs(hdirs);
+    std::string headerFilePath;
+    bool findOcl = false;
+
+    while (getline(hidirs, headerFilePath, ':')) {
+      std::string oclDotHName = headerFilePath + "/ocl.h";
+      if(access(oclDotHName.c_str(), R_OK) == 0) {
+        findOcl = true;
+        break;
+      }
+    }
+    assert(findOcl);
+    std::string includePath  = "-I" + headerFilePath;
+    clOpt.push_back(includePath);
+    bool useDefaultCLCVersion = true;
 
     if (options) {
-      char *p;
-      /* FIXME: Though we can disable the pch valid check, and load pch successfully,
-         but these language opts and pre-defined macro will still generate the diag msg
-         to the diag engine of the Clang and cause the Clang to report error.
-         We filter them all here to avoid these. */
-      const char * incompatible_opts[] = {
-          "-cl-single-precision-constant",
-//        "-cl-denorms-are-zero",
-          "-cl-fast-relaxed-math",
-          "-cl-std=CL1.1"
-      };
-      const char * incompatible_defs[] = {
-          "GET_FLOAT_WORD",
-          "__NV_CL_C_VERSION",
-          "GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND"
-      };
-
-      for (unsigned int i = 0; i < sizeof(incompatible_opts)/sizeof(char *); i++ ) {
-        p = strstr(const_cast<char *>(options), incompatible_opts[i]);
-        if (p) {
-          usePCH = false;
-          break;
+      char *str = (char *)malloc(sizeof(char) * (strlen(options) + 1));
+      memcpy(str, options, strlen(options) + 1);
+      std::string optionStr(str);
+      const std::string unsupportedOptions("-cl-denorms-are-zero, -cl-strict-aliasing, -cl-opt-disable,"
+                       "-cl-no-signed-zeros, -cl-fp32-correctly-rounded-divide-sqrt");
+
+      const std::string uncompatiblePCHOptions = ("-cl-single-precision-constant, -cl-fast-relaxed-math, -cl-std=CL1.1");
+      const std::string fastMathOption = ("-cl-fast-relaxed-math");
+      while (end != std::string::npos) {
+        end = optionStr.find(' ', start);
+        std::string str = optionStr.substr(start, end - start);
+        start = end + 1;
+        if(str.size() == 0)
+          continue;
+
+        if(unsupportedOptions.find(str) != std::string::npos) {
+          continue;
         }
-      }
 
-      if (usePCH) {
-        for (unsigned int i = 0; i < sizeof(incompatible_defs)/sizeof(char *); i++ ) {
-          p = strstr(const_cast<char *>(options), incompatible_defs[i]);
-          if (p) {
-            usePCH = false;
-            break;
+        if(str.find("-cl-std=") != std::string::npos) {
+          useDefaultCLCVersion = false;
+          if (str == "-cl-std=CL1.1")
+            clOpt.push_back("-D__OPENCL_C_VERSION__=110");
+          else if (str == "-cl-std=CL1.2")
+            clOpt.push_back("-D__OPENCL_C_VERSION__=120");
+          else {
+            if (err && stringSize > 0 && errSize)
+              *errSize = snprintf(err, stringSize, "Invalid build option: %s\n", str.c_str());
+            return false;
           }
         }
-      }
 
-      p = strstr(const_cast<char *>(options), "-cl-opt-disable");
-      if (p)
-        optLevel = 0;
-      // XXX enable cl_khr_fp64 may cause some potential bugs.
-      // we may need to revisit here latter when we want to support fp64 completely.
-      // For now, as we don't support fp64 actually, just disable it by default.
-#if 0
-      #define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
-      if (!strstr(const_cast<char *>(options), "-cl-std=CL1.1"))
-        fwrite(ENABLE_CL_KHR_FP64_STR, strlen(ENABLE_CL_KHR_FP64_STR), 1, clFile);
-#endif
+        if (uncompatiblePCHOptions.find(str) != std::string::npos)
+          invalidPCH = true;
+
+        if (fastMathOption.find(str) != std::string::npos) {
+          clOpt.push_back("-D");
+          clOpt.push_back("__FAST_RELAXED_MATH__=1");
+        }
 
-      clOpt += options;
+        clOpt.push_back(str);
+      }
+      free(str);
     }
 
-    std::string dirs = OCL_PCH_PATH;
-    std::istringstream idirs(dirs);
-    std::string pchFileName;
+    if (useDefaultCLCVersion) {
+      clOpt.push_back("-D__OPENCL_C_VERSION__=120");
+      clOpt.push_back("-cl-std=CL1.2");
+    }
+    // For clCompileProgram usage.
+    if(temp_header_path){
+      clOpt.push_back("-I");
+      clOpt.push_back(temp_header_path);
+    }
 
     while (getline(idirs, pchFileName, ':')) {
       if(access(pchFileName.c_str(), R_OK) == 0) {
@@ -831,31 +718,34 @@ namespace gbe {
       }
     }
 
-    if (usePCH && findPCH) {
-      clOpt += " -include-pch ";
-      clOpt += pchFileName;
-      clOpt += " ";
-    } else
-      fwrite(ocl_stdlib_str.c_str(), strlen(ocl_stdlib_str.c_str()), 1, clFile);
+    char clStr[] = "/tmp/XXXXXX.cl";
+    int clFd = mkstemps(clStr, 3);
+    clName = std::string(clStr);
 
-    //for clCompilerProgram usage.
-    if(temp_header_path){
-      clOpt += " -I ";
-      clOpt += temp_header_path;
-      clOpt += " ";
-    }
+    FILE *clFile = fdopen(clFd, "w");
+    FATAL_IF(clFile == NULL, "Failed to open temporary file");
+    // XXX Enabling cl_khr_fp64 may cause some potential bugs.
+    // We may need to revisit this later when we want to support fp64 completely.
+    // For now, as we don't actually support fp64, just disable it by default.
+#if 0
+    #define ENABLE_CL_KHR_FP64_STR "#pragma OPENCL EXTENSION cl_khr_fp64 : enable\n"
+    if (options && !strstr(const_cast<char *>(options), "-cl-std=CL1.1"))
+      fwrite(ENABLE_CL_KHR_FP64_STR, strlen(ENABLE_CL_KHR_FP64_STR), 1, clFile);
+#endif
 
-    if (!OCL_STRICT_CONFORMANCE) {
-        fwrite(ocl_mathfunc_fastpath_str.c_str(), strlen(ocl_mathfunc_fastpath_str.c_str()), 1, clFile);
+    if (!findPCH || invalidPCH) {
+      clOpt.push_back("-include");
+      clOpt.push_back("ocl.h");
+    } else {
+      clOpt.push_back("-fno-validate-pch");
+      clOpt.push_back("-include-pch");
+      clOpt.push_back(pchFileName);
     }
 
-    // reset the file number in case we have inserted something into the kernel
-    std::string resetFileNum = "#line 1\n";
-    fwrite(resetFileNum.c_str(), strlen(resetFileNum.c_str()), 1, clFile);
-
     // Write the source to the cl file
     fwrite(source, strlen(source), 1, clFile);
     fclose(clFile);
+    return true;
   }
 
   static gbe_program programNewFromSource(uint32_t deviceID,
@@ -866,20 +756,21 @@ namespace gbe {
                                           size_t *errSize)
   {
     int optLevel = 1;
-    std::string clOpt;
+    std::vector<std::string> clOpt;
     std::string clName;
-    processSourceAndOption(source, options, NULL, clOpt, clName, optLevel);
+    if (!processSourceAndOption(source, options, NULL, clOpt, clName,
+                                optLevel, stringSize, err, errSize))
+      return NULL;
 
     gbe_program p;
     // will delete the module and act in GenProgram::CleanLlvmResource().
     llvm::Module * out_module;
     llvm::LLVMContext* llvm_ctx = new llvm::LLVMContext;
-
     static std::mutex llvm_mutex;
     if (!llvm::llvm_is_multithreaded())
       llvm_mutex.lock();
 
-    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt,
                               stringSize, err, errSize)) {
     // Now build the program from llvm
       size_t clangErrSize = 0;
@@ -918,9 +809,11 @@ namespace gbe {
                                           size_t *errSize)
   {
     int optLevel = 1;
-    std::string clOpt;
+    std::vector<std::string> clOpt;
     std::string clName;
-    processSourceAndOption(source, options, temp_header_path, clOpt, clName, optLevel);
+    if (!processSourceAndOption(source, options, temp_header_path, clOpt, clName,
+                                optLevel, stringSize, err, errSize))
+      return NULL;
 
     gbe_program p;
     acquireLLVMContextLock();
@@ -928,7 +821,7 @@ namespace gbe {
     //for some functions, so we use global context now, need switch to new context later.
     llvm::Module * out_module;
     llvm::LLVMContext* llvm_ctx = &llvm::getGlobalContext();
-    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt.c_str(),
+    if (buildModuleFromSource(clName.c_str(), &out_module, llvm_ctx, clOpt,
                               stringSize, err, errSize)) {
     // Now build the program from llvm
       if (err != NULL) {
diff --git a/backend/src/backend/program.h b/backend/src/backend/program.h
index 1421993..a457d52 100644
--- a/backend/src/backend/program.h
+++ b/backend/src/backend/program.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -212,12 +212,6 @@ typedef gbe_program (gbe_program_new_from_llvm_cb)(uint32_t deviceID,
                                                    int optLevel);
 extern gbe_program_new_from_llvm_cb *gbe_program_new_from_llvm;
 
-/*! create s new genprogram for link. */
-typedef gbe_program (gbe_program_new_gen_program_cb)(uint32_t deviceID,
-                                                   const void *module,
-                                                   const void *act);
-extern gbe_program_new_gen_program_cb *gbe_program_new_gen_program;
-
 /*! link the programs from llvm level. */
 typedef void (gbe_program_link_from_llvm_cb)(gbe_program dst_program,
                                              gbe_program src_program,
diff --git a/backend/src/backend/program.hpp b/backend/src/backend/program.hpp
index 56f60af..4e6b275 100644
--- a/backend/src/backend/program.hpp
+++ b/backend/src/backend/program.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -80,7 +80,7 @@ namespace gbe {
     /*! Return the instruction stream (to be implemented) */
     virtual const char *getCode(void) const = 0;
     /*! Set the instruction stream.*/
-    virtual const void setCode(const char *, size_t size) = 0;
+    virtual void setCode(const char *, size_t size) = 0;
     /*! Return the instruction stream size (to be implemented) */
     virtual size_t getCodeSize(void) const = 0;
     /*! Get the kernel name */
diff --git a/backend/src/builtin_vector_proto.def b/backend/src/builtin_vector_proto.def
deleted file mode 100644
index 18d23ca..0000000
--- a/backend/src/builtin_vector_proto.def
+++ /dev/null
@@ -1,295 +0,0 @@
-##math
-gentype acos (gentype)
-gentype acosh (gentype)
-gentype acospi (gentype x)
-gentype asin (gentype)
-gentype asinh (gentype)
-gentype asinpi (gentype x)
-gentype atan (gentype y_over_x)
-gentype atan2 (gentype y, gentype x)
-gentype atanh (gentype)
-gentype atanpi (gentype x)
-gentype atan2pi (gentype y, gentype x)
-gentype cbrt (gentype)
-gentype ceil (gentype)
-gentype copysign (gentype x, gentype y)
-gentype cos (gentype)
-gentype cosh (gentype)
-gentype cospi (gentype x)
-gentype erfc (gentype)
-gentype erf (gentype)
-gentype exp (gentype x)
-gentype exp2 (gentype)
-gentype exp10 (gentype)
-gentype expm1 (gentype x)
-gentype fabs (gentype)
-gentype fdim (gentype x, gentype y)
-gentype floor (gentype)
-# XXX we use madd for fma
-#gentype fma (gentype a, gentype b, gentype c)
-gentype fmax (gentype x, gentype y)
-gentypef fmax (gentypef x, float y)
-gentyped fmax (gentyped x, double y)
-gentype fmin (gentype x, gentype y)
-gentypef fmin (gentypef x, float y)
-gentyped fmin (gentyped x, double y)
-gentype fmod (gentype x, gentype y)
-gentype fract (gentype x, __global gentype *iptr)
-gentype fract (gentype x, __local gentype *iptr)
-gentype fract (gentype x, __private gentype *iptr)
-floatn frexp (floatn x, __global intn *exp)
-floatn frexp (floatn x, __local intn *exp)
-floatn frexp (floatn x, __private intn *exp)
-float frexp (float x, __global int *exp)
-float frexp (float x, __local int *exp)
-float frexp (float x, __private int *exp)
-doublen frexp (doublen x, __global intn *exp)
-doublen frexp (doublen x, __local intn *exp)
-doublen frexp (doublen x, __private intn *exp)
-double frexp (double x, __global int *exp)
-double frexp (double x, __local int *exp)
-double frexp (double x, __private int *exp)
-gentype hypot (gentype x, gentype y)
-intn ilogb (floatn x)
-int ilogb (float x)
-intn ilogb (doublen x)
-int ilogb (double x)
-floatn ldexp (floatn x, intn k)
-floatn ldexp (floatn x, int k)
-float ldexp (float x, int k)
-doublen ldexp (doublen x, intn k)
-doublen ldexp (doublen x, int k)
-double ldexp (double x, int k)
-gentype lgamma (gentype x)
-floatn lgamma_r (floatn x, __global intn *signp)
-floatn lgamma_r (floatn x, __local intn *signp)
-floatn lgamma_r (floatn x, __private intn *signp)
-float lgamma_r (float x, __global int *signp)
-float lgamma_r (float x, __local int *signp)
-float lgamma_r (float x,   __private int *signp)
-#doublen lgamma_r (doublen x, __global intn *signp)
-#doublen lgamma_r (doublen x, __local intn *signp)
-#doublen lgamma_r (doublen x, __private intn *signp)
-#double lgamma_r (double x, __global int *signp)
-#double lgamma_r (double x, __local int *signp)
-#double lgamma_r (double x, __private int *signp)
-gentype log (gentype)
-gentype log2 (gentype)
-gentype log10 (gentype)
-gentype log1p (gentype x)
-gentype logb (gentype x)
-gentype mad (gentype a, gentype b, gentype c)
-gentype maxmag (gentype x, gentype y)
-gentype minmag (gentype x, gentype y)
-gentype modf (gentype x, __global gentype *iptr)
-gentype modf (gentype x, __local gentype *iptr)
-gentype modf (gentype x, __private gentype *iptr)
-floatn nan (uintn nancode)
-float nan (uint nancode)
-doublen nan (ulongn nancode)
-double nan (ulong nancode)
-gentype nextafter (gentype x, gentype y)
-gentype pow (gentype x, gentype y)
-floatn pown (floatn x, intn y)
-float pown (float x, int y)
-doublen pown (doublen x, intn y)
-double pown (double x, int y)
-#XXX we define powr as pow
-#gentype powr (gentype x, gentype y)
-gentype remainder (gentype x, gentype y)
-floatn remquo (floatn x, floatn y, __global intn *quo)
-floatn remquo (floatn x, floatn y, __local intn *quo)
-floatn remquo (floatn x, floatn y, __private intn *quo)
-float remquo (float x, float y, __global int *quo)
-float remquo (float x, float y, __local int *quo)
-float remquo (float x, float y, __private int *quo)
-doublen remquo (doublen x, doublen y, __global intn *quo)
-doublen remquo (doublen x, doublen y, __local intn *quo)
-doublen remquo (doublen x, doublen y, __private intn *quo)
-double remquo (double x, double y, __global int *quo)
-double remquo (double x, double y, __local int *quo)
-double remquo (double x, double y, __private int *quo)
-gentype rint (gentype)
-floatn rootn (floatn x, intn y)
-
-doublen rootn (doublen x, intn y)
-doublen rootn (double x, int y)
-gentype round (gentype x)
-gentype rsqrt (gentype)
-gentype sin (gentype)
-gentype sincos (gentype x, __global gentype *cosval)
-gentype sincos (gentype x, __local gentype *cosval)
-gentype sincos (gentype x, __private gentype *cosval)
-gentype sinh (gentype)
-gentype sinpi (gentype x)
-gentype sqrt (gentype)
-gentype tan (gentype)
-gentype tanh (gentype)
-gentype tanpi (gentype x)
-gentype tgamma (gentype)
-gentype trunc (gentype)
-
-##math function fast path
-gentype __gen_ocl_internal_fastpath_acosh (gentype x)
-gentype __gen_ocl_internal_fastpath_asinh (gentype x)
-gentype __gen_ocl_internal_fastpath_atanh (gentype x)
-gentype __gen_ocl_internal_fastpath_cbrt (gentype x)
-gentype __gen_ocl_internal_fastpath_cos (gentype x)
-gentype __gen_ocl_internal_fastpath_cosh (gentype x)
-gentype __gen_ocl_internal_fastpath_cospi (gentype x)
-gentype __gen_ocl_internal_fastpath_exp (gentype x)
-gentype __gen_ocl_internal_fastpath_exp10 (gentype x)
-gentype __gen_ocl_internal_fastpath_expm1 (gentype x)
-gentype __gen_ocl_internal_fastpath_fmod (gentype x, gentype y)
-gentype __gen_ocl_internal_fastpath_hypot (gentype x, gentype y)
-intn __gen_ocl_internal_fastpath_ilogb (floatn x)
-int __gen_ocl_internal_fastpath_ilogb (float x)
-intn __gen_ocl_internal_fastpath_ilogb (doublen x)
-int __gen_ocl_internal_fastpath_ilogb (double x)
-floatn __gen_ocl_internal_fastpath_ldexp (floatn x, intn k)
-floatn __gen_ocl_internal_fastpath_ldexp (floatn x, int k)
-float __gen_ocl_internal_fastpath_ldexp (float x, int k)
-doublen __gen_ocl_internal_fastpath_ldexp (doublen x, intn k)
-doublen __gen_ocl_internal_fastpath_ldexp (doublen x, int k)
-double __gen_ocl_internal_fastpath_ldexp (double x, int k)
-gentype __gen_ocl_internal_fastpath_log (gentype x)
-gentype __gen_ocl_internal_fastpath_log2 (gentype x)
-gentype __gen_ocl_internal_fastpath_log10 (gentype x)
-gentype __gen_ocl_internal_fastpath_log1p (gentype x)
-gentype __gen_ocl_internal_fastpath_logb (gentype x)
-gentype __gen_ocl_internal_fastpath_remainder (gentype x, gentype y)
-floatn __gen_ocl_internal_fastpath_rootn (floatn x, intn k)
-gentype __gen_ocl_internal_fastpath_sin (gentype x)
-gentype __gen_ocl_internal_fastpath_sincos (gentype x, __global gentype *cosval)
-gentype __gen_ocl_internal_fastpath_sincos (gentype x, __local gentype *cosval)
-gentype __gen_ocl_internal_fastpath_sincos (gentype x, __private gentype *cosval)
-gentype __gen_ocl_internal_fastpath_sinh (gentype x)
-gentype __gen_ocl_internal_fastpath_sinpi (gentype x)
-gentype __gen_ocl_internal_fastpath_tan (gentype x)
-gentype __gen_ocl_internal_fastpath_tanh (gentype x)
-
-##half_native_math
-#gentype half_cos (gentype x)
-#gentype half_divide (gentype x, gentype y)
-#gentype half_exp (gentype x)
-#gentype half_exp2 (gentype x)
-#gentype half_exp10 (gentype x)
-#gentype half_log (gentype x)
-#gentype half_log2 (gentype x)
-#gentype half_log10 (gentype x)
-#gentype half_powr (gentype x, gentype y)
-#gentype half_recip (gentype x)
-#gentype half_rsqrt (gentype x)
-#gentype half_sin (gentype x)
-#gentype half_sqrt (gentype x)
-#gentype half_tan (gentype x)
-
-# XXX we already defined all native and non-native
-# functions to the same one.
-gentype native_cos (gentype x)
-gentype native_divide (gentype x, gentype y)
-gentype native_exp (gentype x)
-#gentype native_exp2 (gentype x)
-gentype native_exp10 (gentype x)
-gentype native_log (gentype x)
-gentype native_log2 (gentype x)
-gentype native_log10 (gentype x)
-gentype native_powr (gentype x, gentype y)
-gentype native_recip (gentype x)
-gentype native_rsqrt (gentype x)
-gentype native_sin (gentype x)
-#gentype native_sqrt (gentype x)
-gentype native_tan (gentype x)
-
-##integer
-ugentype abs (gentype x)
-ugentype abs_diff (gentype x, gentype y)
-gentype add_sat (gentype x,  gentype y)
-gentype hadd (gentype x,  gentype y)
-gentype rhadd (gentype x, gentype y)
-gentype clamp (gentype x, gentype minval, gentype maxval)
-gentype clamp (gentype x, sgentype minval, sgentype maxval)
-gentype clz (gentype x)
-gentype mad_hi (gentype a, gentype b, gentype c)
-gentype mad_sat (gentype a, gentype b, gentype c)
-gentype max (gentype x,  gentype y)
-gentype max (gentype x,  sgentype y)
-gentype min (gentype x,  gentype y)
-gentype min (gentype x,  sgentype y)
-gentype mul_hi (gentype x,  gentype y)
-gentype rotate (gentype v,  gentype i)
-gentype sub_sat (gentype x,  gentype y)
-shortn upsample (charn hi, ucharn lo)
-ushortn upsample (ucharn hi, ucharn lo)
-intn upsample (shortn hi, ushortn lo)
-uintn upsample (ushortn hi, ushortn lo)
-longn upsample (intn hi, uintn lo)
-ulongn upsample (uintn hi, uintn lo)
-# XXX not implemented
-#gentype popcount (gentype x)
-
-##fast_integer
-gentype mad24 (gentype x, gentype y, gentype z)
-gentype mul24 (gentype x, gentype y)
-
-##common
-gentype clamp (gentype x, gentype minval, gentype maxval)
-gentypef clamp (gentypef x, float minval, float maxval)
-gentyped clamp (gentyped x, double minval, double maxval)
-gentype degrees (gentype radians)
-gentype max (gentype x,  gentype y)
-gentypef max (gentypef x, float y)
-gentyped max (gentyped x, double y)
-gentype min (gentype x,  gentype y)
-gentypef min (gentypef x,  float y)
-gentyped min (gentyped x,  double y)
-gentype mix (gentype x, gentype y, gentype a)
-gentypef mix (gentypef x, gentypef y, float a)
-gentyped mix (gentyped x, gentyped y, double a)
-gentype radians (gentype degrees)
-gentype step (gentype edge, gentype x)
-gentypef step (float edge, gentypef x)
-gentyped step (double edge, gentyped x)
-gentype smoothstep (gentype edge0, gentype edge1, gentype x)
-gentypef smoothstep (float edge0, float edge1, gentypef x)
-gentyped smoothstep (double edge0, double edge1, gentyped x)
-gentype sign (gentype x)
-
-##relational
-intn isequal (floatn x, floatn y)
-longn isequal (doublen x, doublen y)
-intn isnotequal (floatn x, floatn y)
-longn isnotequal (doublen x, doublen y)
-intn isgreater (floatn x, floatn y)
-longn isgreater (doublen x, doublen y)
-intn isgreaterequal (floatn x, floatn y)
-longn isgreaterequal (doublen x, doublen y)
-intn isless (floatn x, floatn y)
-longn isless (doublen x, doublen y)
-intn islessequal (floatn x, floatn y)
-longn islessequal (doublen x, doublen y)
-intn islessgreater (floatn x, floatn y)
-longn islessgreater (doublen x, doublen y)
-intn isfinite (floatn
-longn isfinite (doublen)
-intn isinf (floatn)
-longn isinf (doublen)
-intn isnan (floatn)
-longn isnan (doublen)
-intn isnormal (floatn)
-longn isnormal (doublen)
-intn isordered (floatn x, floatn y)
-longn isordered (doublen x, doublen y)
-intn isunordered (floatn x, floatn y)
-longn isunordered (doublen x, doublen y)
-intn signbit (floatn)
-longn signbit (doublen)
-int any (igentype x)
-int all (igentype x)
-gentype bitselect (gentype a, gentype b, gentype c)
-gentype select (gentype a, gentype b, igentype c)
-gentype select (gentype a, gentype b, ugentype c)
-
-##misc
-#gentypen shuffle (gentypem x, ugentypen mask)
-#gentypen shuffle2 (gentypem x, gentypem y, ugentypen mask)
diff --git a/backend/src/gbe_bin_generater.cpp b/backend/src/gbe_bin_generater.cpp
index 79e3935..f8e45fb 100644
--- a/backend/src/gbe_bin_generater.cpp
+++ b/backend/src/gbe_bin_generater.cpp
@@ -3,7 +3,7 @@
  *
  * This library is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
+ * Free Software Foundation; either version 2.1 of the License, or (at your
  * option) any later version.
  *
  * This library is distributed in the hope that it will be useful, but WITHOUT
@@ -174,6 +174,10 @@ void program_build_instance::serialize_program(void) throw(int)
         src_hw_info[0]='H';
         src_hw_info[1]='S';
         src_hw_info[2]='W';
+    }else if(IS_BROADWELL(gen_pci_id)){
+        src_hw_info[0]='B';
+        src_hw_info[1]='D';
+        src_hw_info[2]='W';
     }
 
     if (str_fmt_out) {
diff --git a/backend/src/gbe_bin_interpreter.cpp b/backend/src/gbe_bin_interpreter.cpp
index 1c67a4b..4594a0a 100644
--- a/backend/src/gbe_bin_interpreter.cpp
+++ b/backend/src/gbe_bin_interpreter.cpp
@@ -3,7 +3,7 @@
  *
  * This library is free software; you can redistribute it and/or modify it
  * under the terms of the GNU Lesser General Public License as published by the
- * Free Software Foundation; either version 2 of the License, or (at your
+ * Free Software Foundation; either version 2.1 of the License, or (at your
  * option) any later version.
  *
  * This library is distributed in the hope that it will be useful, but WITHOUT
diff --git a/backend/src/gen_as.sh b/backend/src/gen_as.sh
deleted file mode 100755
index 7dea15d..0000000
--- a/backend/src/gen_as.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#! /bin/sh -e
-
-. ./genconfig.sh
-
-# Generate list of union sizes
-for type in $TYPES; do
-        size=`IFS=:; set -- dummy $type; echo $3`
-        for vector_length in $VECTOR_LENGTHS; do
-                if test $vector_length -eq 3; then
-                      continue;
-                fi
-                union_sizes="$union_sizes `expr $vector_length \* $size`"
-        done
-done
-union_sizes="`echo $union_sizes | tr ' ' '\n' | sort -n | uniq`"
-
-# For each union size
-for union_size in $union_sizes; do
-
-        # Define an union that contains all vector types that have the same size as the union
-        unionname="union _type_cast_${union_size}_b"
-        echo "$unionname {"
-        for type in $TYPES; do
-                basetype=`IFS=:; set -- dummy $type; echo $2`
-                basesize=`IFS=:; set -- dummy $type; echo $3`
-                for vector_length in $VECTOR_LENGTHS; do
-                        if test $vector_length -eq 3; then
-                                vector_size_length="4"
-                        else
-                                vector_size_length=$vector_length;
-                        fi
-                        vector_size_in_union="`expr $vector_size_length \* $basesize`"
-                        if test $union_size -ne $vector_size_in_union; then
-                                continue
-                        fi
-                        if test $vector_length -eq 1; then
-                                vectortype=$basetype
-                        else
-                                vectortype=$basetype$vector_length
-                        fi
-                        echo "  $vectortype _$vectortype;"
-                done
-
-        done
-        echo "};"
-        echo
-
-        # For each tuple of vector types that has the same size as the current union size,
-        # define an as_* function that converts types without changing binary representation.
-        for ftype in $TYPES; do
-                fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
-                fbasesize=`IFS=:; set -- dummy $ftype; echo $3`
-                for fvector_length in $VECTOR_LENGTHS; do
-                        if test $fvector_length -eq 3; then
-                                fvector_size_length="4"
-                        else
-                                fvector_size_length=$fvector_length;
-                        fi
-                        fvector_size_in_union="`expr $fvector_size_length \* $fbasesize`"
-                        if test $union_size -ne $fvector_size_in_union; then
-                                continue
-                        fi
-                        if test $fvector_length -eq 1; then
-                                fvectortype=$fbasetype
-                        else
-                                fvectortype=$fbasetype$fvector_length
-                        fi
-                        for ttype in $TYPES; do
-                                tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-                                tbasesize=`IFS=:; set -- dummy $ttype; echo $3`
-                                if test $fbasetype = $tbasetype; then
-                                        continue
-                                fi
-                                for tvector_length in $VECTOR_LENGTHS; do
-                                        if test $tvector_length -eq 3; then
-                                               tvector_size_length="4"
-                                        else
-                                               tvector_size_length=$tvector_length;
-                                        fi
-                                        tvector_size_in_union="`expr $tvector_size_length \* $tbasesize`"
-                                        if test $union_size -ne $tvector_size_in_union; then
-                                                continue
-                                        fi
-                                        if test $tvector_length -eq 1; then
-                                                tvectortype=$tbasetype
-                                        else
-                                                tvectortype=$tbasetype$tvector_length
-                                        fi
-                                        echo "INLINE OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v) {"
-                                        echo "  $unionname u;"
-                                        echo "  u._$fvectortype = v;"
-                                        echo "  return u._$tvectortype;"
-                                        echo "}"
-                                        echo
-                                done
-                        done
-                done
-
-        done
-
-done
diff --git a/backend/src/gen_convert.sh b/backend/src/gen_convert.sh
deleted file mode 100755
index b940222..0000000
--- a/backend/src/gen_convert.sh
+++ /dev/null
@@ -1,553 +0,0 @@
-#! /bin/sh -e
-
-. ./genconfig.sh
-
-# For all vector lengths and types, generate conversion functions
-for vector_length in $VECTOR_LENGTHS; do
-        if test $vector_length -eq 1; then
-          for ftype in $TYPES; do
-            fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
-            for ttype in $TYPES; do
-              tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-              echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
-              echo "  return ($tbasetype)v;"
-              echo "}"
-              echo
-            done
-          done
-        else
-          for ftype in $TYPES; do
-                fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
-                for ttype in $TYPES; do
-                        tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-                        if test $fbasetype = $tbasetype; then
-                          if test $vector_length -gt 1; then
-                            fvectortype=$fbasetype$vector_length
-                            tvectortype=$tbasetype$vector_length
-                            echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
-                          else
-                            echo "INLINE OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
-                          fi
-                          continue
-                        fi
-                        fvectortype=$fbasetype$vector_length
-                        tvectortype=$tbasetype$vector_length
-                        construct="($tbasetype)(v.s0)"
-                        if test $vector_length -gt 1; then
-                                construct="$construct, ($tbasetype)(v.s1)"
-                        fi
-                        if test $vector_length -gt 2; then
-                                construct="$construct, ($tbasetype)(v.s2)"
-                        fi
-                        if test $vector_length -gt 3; then
-                                construct="$construct, ($tbasetype)(v.s3)"
-                        fi
-                        if test $vector_length -gt 4; then
-                                construct="$construct, ($tbasetype)(v.s4)"
-                                construct="$construct, ($tbasetype)(v.s5)"
-                                construct="$construct, ($tbasetype)(v.s6)"
-                                construct="$construct, ($tbasetype)(v.s7)"
-                        fi
-                        if test $vector_length -gt 8; then
-                                construct="$construct, ($tbasetype)(v.s8)"
-                                construct="$construct, ($tbasetype)(v.s9)"
-                                construct="$construct, ($tbasetype)(v.sA)"
-                                construct="$construct, ($tbasetype)(v.sB)"
-                                construct="$construct, ($tbasetype)(v.sC)"
-                                construct="$construct, ($tbasetype)(v.sD)"
-                                construct="$construct, ($tbasetype)(v.sE)"
-                                construct="$construct, ($tbasetype)(v.sF)"
-                        fi
-
-                        echo "INLINE OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {"
-                        echo "  return ($tvectortype)($construct);"
-                        echo "}"
-                        echo
-                done
-          done
-        fi
-done
-
-echo '
-#define DEF(DSTTYPE, SRCTYPE) \
-  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
-DEF(char, uchar);
-DEF(char, short);
-DEF(char, ushort);
-DEF(char, int);
-DEF(char, uint);
-DEF(char, float);
-DEF(uchar, char);
-DEF(uchar, short);
-DEF(uchar, ushort);
-DEF(uchar, int);
-DEF(uchar, uint);
-DEF(uchar, float);
-DEF(short, ushort);
-DEF(short, int);
-DEF(short, uint);
-DEF(short, float);
-DEF(ushort, short);
-DEF(ushort, int);
-DEF(ushort, uint);
-DEF(ushort, float);
-DEF(int, uint);
-DEF(int, float);
-DEF(uint, int);
-DEF(uint, float);
-#undef DEF
-
-#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
-  }
-DEF(char, long, -128, 127);
-DEF(uchar, long, 0, 255);
-DEF(short, long, -32768, 32767);
-DEF(ushort, long, 0, 65535);
-DEF(int, long, -0x7fffffff-1, 0x7fffffff);
-DEF(uint, long, 0, 0xffffffffu);
-DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
-DEF(ulong, float, 0, 1.8446744073709552e+19f);
-#undef DEF
-
-#define DEF(DSTTYPE, SRCTYPE, MAX) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x >= MAX ? (DSTTYPE)MAX : x; \
-  }
-DEF(char, ulong, 127);
-DEF(uchar, ulong, 255);
-DEF(short, ulong, 32767);
-DEF(ushort, ulong, 65535);
-DEF(int, ulong, 0x7fffffff);
-DEF(uint, ulong, 0xffffffffu);
-#undef DEF
-
-INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
-  ulong MAX = 0x7ffffffffffffffful;
-  return x >= MAX ? MAX : x;
-}
-
-#define DEF(DSTTYPE, SRCTYPE) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x <= 0 ? 0 : x; \
-  }
-DEF(ushort, char);
-DEF(uint, char);
-DEF(uint, short);
-DEF(ulong, char);
-DEF(ulong, short);
-DEF(ulong, int);
-DEF(ulong, long);
-#undef DEF
-
-#define DEF(DSTTYPE, SRCTYPE) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x; \
-  }
-DEF(char, char);
-DEF(uchar, uchar);
-DEF(short, char);
-DEF(short, uchar);
-DEF(short, short);
-DEF(ushort, uchar);
-DEF(ushort, ushort);
-DEF(int, char);
-DEF(int, uchar);
-DEF(int, short);
-DEF(int, ushort);
-DEF(int, int);
-DEF(uint, uchar);
-DEF(uint, ushort);
-DEF(uint, uint);
-DEF(long, char);
-DEF(long, uchar);
-DEF(long, short);
-DEF(long, ushort);
-DEF(long, int);
-DEF(long, uint);
-DEF(long, long);
-DEF(ulong, uchar);
-DEF(ulong, ushort);
-DEF(ulong, uint);
-DEF(ulong, ulong);
-#undef DEF
-'
-
-# vector convert_DSTTYPE_sat function
-for vector_length in $VECTOR_LENGTHS; do
-  if test $vector_length -eq 1; then continue; fi
-
-  for ftype in $TYPES; do
-    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
-    if test $fbasetype = "double"; then continue; fi
-
-    for ttype in $TYPES; do
-      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-      if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
-
-      fvectortype=$fbasetype$vector_length
-      tvectortype=$tbasetype$vector_length
-      conv="convert_${tbasetype}_sat"
-
-      construct="$conv(v.s0)"
-      if test $vector_length -gt 1; then
-        construct="$construct, $conv(v.s1)"
-      fi
-      if test $vector_length -gt 2; then
-        construct="$construct, $conv(v.s2)"
-      fi
-      if test $vector_length -gt 3; then
-        construct="$construct, $conv(v.s3)"
-      fi
-      if test $vector_length -gt 4; then
-        construct="$construct, $conv(v.s4)"
-        construct="$construct, $conv(v.s5)"
-        construct="$construct, $conv(v.s6)"
-        construct="$construct, $conv(v.s7)"
-      fi
-      if test $vector_length -gt 8; then
-        construct="$construct, $conv(v.s8)"
-        construct="$construct, $conv(v.s9)"
-        construct="$construct, $conv(v.sA)"
-        construct="$construct, $conv(v.sB)"
-        construct="$construct, $conv(v.sC)"
-        construct="$construct, $conv(v.sD)"
-        construct="$construct, $conv(v.sE)"
-        construct="$construct, $conv(v.sF)"
-      fi
-
-      echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v) {"
-      echo "  return ($tvectortype)($construct);"
-      echo "}"
-      echo
-    done
-  done
-done
-
-echo '
-float __gen_ocl_rndz(float x);
-float __gen_ocl_rnde(float x);
-float __gen_ocl_rndu(float x);
-float __gen_ocl_rndd(float x);
-INLINE_OVERLOADABLE float __convert_float_rtz(long x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  long l = u.f;
-  if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
-     (l < x && x < 0)) {
-      u.u -= 1;
-  }
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtp(long x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  long l = u.f;  //can not use u.f < x
-  if(l < x && x < 0x7fffffc000000000) {
-    if(x > 0)
-      u.u = u.u + 1;
-    else
-      u.u = u.u - 1;
-  }
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtn(long x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  long l = u.f;  //avoid overflow
-  if(l > x || x >= 0x7fffffc000000000) {
-    if(x > 0)
-      u.u = u.u - 1;
-    else
-      u.u = u.u + 1;
-  }
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  ulong l = u.f;
-  if(l > x  || x >= 0xffffff8000000000)
-      u.u -= 1;
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  ulong l = u.f;  //can not use u.f < x
-  if(l < x && x < 0xffffff8000000000)
-    u.u = u.u + 1;
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
-{
-  return __convert_float_rtz(x);
-}
-INLINE_OVERLOADABLE float __convert_float_rtz(int x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  long i = u.f;
-  if((i > x && x > 0) ||
-     (i < x && x < 0)) {
-      u.u -= 1;
-  }
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtp(int x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  int i = u.f;
-  if(i < x) {
-    if(x > 0)
-      u.u += 1;
-    else
-      u.u -= 1;
-  }
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtn(int x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  long i = u.f;  //avoid overflow
-  if(i > x) {
-    if(x > 0)
-      u.u = u.u - 1;
-    else
-      u.u = u.u + 1;
-  }
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  ulong i = u.f;
-  if(i > x)
-    u.u -= 1;
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  uint i = u.f;
-  if(i < x)
-    u.u += 1;
-  return u.f;
-}
-INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
-{
-  return __convert_float_rtz(x);
-}
-'
-
-# convert_DSTTYPE_ROUNDING function
-for vector_length in $VECTOR_LENGTHS; do
-  for ftype in $TYPES; do
-    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
-    if test $fbasetype = "double"; then continue; fi
-
-    for ttype in $TYPES; do
-      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-      if test $tbasetype = "double"; then continue; fi
-
-      if test $vector_length -eq 1; then
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x)"
-        if test $fbasetype = "float" -a $tbasetype != "float"; then
-          echo "{ return __gen_ocl_rnde(x); }"
-        else
-          echo "{ return x; }"
-        fi
-
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x)"
-        if test $fbasetype = "float" -a $tbasetype != "float"; then
-          echo "{ return __gen_ocl_rndz(x); }"
-        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
-          echo "{ return __convert_${tbasetype}_rtz(x); }"
-        else
-          echo "{ return x; }"
-        fi
-
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x)"
-        if test $fbasetype = "float" -a $tbasetype != "float"; then
-          echo "{ return __gen_ocl_rndu(x); }"
-        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
-          echo "{ return __convert_${tbasetype}_rtp(x); }"
-        else
-          echo "{ return x; }"
-        fi
-
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x)"
-        if test $fbasetype = "float" -a $tbasetype != "float"; then
-          echo "{ return __gen_ocl_rndd(x); }"
-        elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
-          echo "{ return __convert_${tbasetype}_rtn(x); }"
-        else
-          echo "{ return x; }"
-        fi
-
-        continue
-      fi
-
-      for rounding in $ROUNDING_MODES; do
-        fvectortype=$fbasetype$vector_length
-        tvectortype=$tbasetype$vector_length
-        conv="convert_${tbasetype}_${rounding}"
-
-        construct="$conv(v.s0)"
-        if test $vector_length -gt 1; then
-          construct="$construct, $conv(v.s1)"
-        fi
-        if test $vector_length -gt 2; then
-          construct="$construct, $conv(v.s2)"
-        fi
-        if test $vector_length -gt 3; then
-          construct="$construct, $conv(v.s3)"
-        fi
-        if test $vector_length -gt 4; then
-          construct="$construct, $conv(v.s4)"
-          construct="$construct, $conv(v.s5)"
-          construct="$construct, $conv(v.s6)"
-          construct="$construct, $conv(v.s7)"
-        fi
-        if test $vector_length -gt 8; then
-          construct="$construct, $conv(v.s8)"
-          construct="$construct, $conv(v.s9)"
-          construct="$construct, $conv(v.sA)"
-          construct="$construct, $conv(v.sB)"
-          construct="$construct, $conv(v.sC)"
-          construct="$construct, $conv(v.sD)"
-          construct="$construct, $conv(v.sE)"
-          construct="$construct, $conv(v.sF)"
-        fi
-
-        echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v) {"
-        echo "  return ($tvectortype)($construct);"
-        echo "}"
-        echo
-      done
-    done
-  done
-done
-
-# convert_DSTTYPE_sat_ROUNDING function
-for vector_length in $VECTOR_LENGTHS; do
-  for ftype in $TYPES; do
-    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
-    if test $fbasetype = "double"; then continue; fi
-
-    for ttype in $TYPES; do
-      tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
-      if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
-
-      if test $vector_length -eq 1; then
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x)"
-        if test $fbasetype = "float"; then
-          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rnde(x)); }"
-        else
-          echo "{ return convert_${tbasetype}_sat(x); }"
-        fi
-
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x)"
-        if test $fbasetype = "float"; then
-          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndz(x)); }"
-        else
-          echo "{ return convert_${tbasetype}_sat(x); }"
-        fi
-
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x)"
-        if test $fbasetype = "float"; then
-          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndu(x)); }"
-        else
-          echo "{ return convert_${tbasetype}_sat(x); }"
-        fi
-
-        echo "INLINE_OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x)"
-        if test $fbasetype = "float"; then
-          echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndd(x)); }"
-        else
-          echo "{ return convert_${tbasetype}_sat(x); }"
-        fi
-
-        continue
-      fi
-
-      for rounding in $ROUNDING_MODES; do
-        fvectortype=$fbasetype$vector_length
-        tvectortype=$tbasetype$vector_length
-        conv="convert_${tbasetype}_sat_${rounding}"
-
-        construct="$conv(v.s0)"
-        if test $vector_length -gt 1; then
-          construct="$construct, $conv(v.s1)"
-        fi
-        if test $vector_length -gt 2; then
-          construct="$construct, $conv(v.s2)"
-        fi
-        if test $vector_length -gt 3; then
-          construct="$construct, $conv(v.s3)"
-        fi
-        if test $vector_length -gt 4; then
-          construct="$construct, $conv(v.s4)"
-          construct="$construct, $conv(v.s5)"
-          construct="$construct, $conv(v.s6)"
-          construct="$construct, $conv(v.s7)"
-        fi
-        if test $vector_length -gt 8; then
-          construct="$construct, $conv(v.s8)"
-          construct="$construct, $conv(v.s9)"
-          construct="$construct, $conv(v.sA)"
-          construct="$construct, $conv(v.sB)"
-          construct="$construct, $conv(v.sC)"
-          construct="$construct, $conv(v.sD)"
-          construct="$construct, $conv(v.sE)"
-          construct="$construct, $conv(v.sF)"
-        fi
-
-        echo "INLINE OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v) {"
-        echo "  return ($tvectortype)($construct);"
-        echo "}"
-        echo
-      done
-    done
-  done
-done
diff --git a/backend/src/genconfig.sh b/backend/src/genconfig.sh
deleted file mode 100644
index 689499e..0000000
--- a/backend/src/genconfig.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#! /bin/false
-# This is to be sourced by the generation scripts
-
-# Supported base types and their lengths
-TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
-
-# Supported vector lengths
-VECTOR_LENGTHS="1 2 3 4 8 16"
-
-ROUNDING_MODES="rte rtz rtp rtn"
-## No user serviceable parts below here
diff --git a/backend/src/ir/constant.cpp b/backend/src/ir/constant.cpp
index a38d392..6ef8ea6 100644
--- a/backend/src/ir/constant.cpp
+++ b/backend/src/ir/constant.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/constant.hpp b/backend/src/ir/constant.hpp
index 70d09aa..f5f172d 100644
--- a/backend/src/ir/constant.hpp
+++ b/backend/src/ir/constant.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/context.cpp b/backend/src/ir/context.cpp
index 1528a8d..875a529 100644
--- a/backend/src/ir/context.cpp
+++ b/backend/src/ir/context.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/context.hpp b/backend/src/ir/context.hpp
index cd09413..485d558 100644
--- a/backend/src/ir/context.hpp
+++ b/backend/src/ir/context.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -117,11 +117,6 @@ namespace ir {
       return fn->newImmediate(dstImm);
     }
 
-    /*! Set an immediate value */
-    template <typename T> INLINE void setImmediate(ImmediateIndex index, T value) {
-      const Immediate imm(value);
-      fn->immediates[index] = imm;
-    }
     /*! Create a new register holding the given value. A LOADI is pushed */
     template <typename T> INLINE Register immReg(T value) {
       GBE_ASSERTM(fn != NULL, "No function currently defined");
diff --git a/backend/src/ir/function.cpp b/backend/src/ir/function.cpp
index 85e7934..7983778 100644
--- a/backend/src/ir/function.cpp
+++ b/backend/src/ir/function.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -126,7 +126,7 @@ namespace ir {
     }
 
     // Reset the label to block mapping
-    this->labels.resize(last);
+    //this->labels.resize(last);
     foreachBlock([&](BasicBlock &bb) {
       const Instruction *first = bb.getFirstInstruction();
       const LabelInstruction *label = cast<LabelInstruction>(first);
@@ -187,7 +187,7 @@ namespace ir {
       return &bb == this->blocks[0];
   }
 
-  const BasicBlock &Function::getTopBlock(void) const {
+  BasicBlock &Function::getTopBlock(void) const {
     GBE_ASSERT(blockNum() > 0 && blocks[0] != NULL);
     return *blocks[0];
   }
@@ -204,7 +204,7 @@ namespace ir {
     return *blocks[n-1];
   }
 
-  const BasicBlock &Function::getBlock(LabelIndex label) const {
+  BasicBlock &Function::getBlock(LabelIndex label) const {
     GBE_ASSERT(label < labelNum() && labels[label] != NULL);
     return *labels[label];
   }
@@ -256,18 +256,27 @@ namespace ir {
       }
       if (bb.size() == 0) return;
       Instruction *last = bb.getLastInstruction();
-      if (last->isMemberOf<BranchInstruction>() == false) {
+      if (last->isMemberOf<BranchInstruction>() == false || last->getOpcode() == OP_ENDIF || last->getOpcode() == OP_ELSE) {
         jumpToNext = &bb;
         return;
       }
-      const BranchInstruction &insn = cast<BranchInstruction>(*last);
-      if (insn.getOpcode() == OP_BRA) {
+      ir::BasicBlock::iterator it = --bb.end();
+      uint32_t handledInsns = 0;
+      while ((handledInsns < 2 && it != bb.end()) &&
+             static_cast<ir::BranchInstruction *>(&*it)->getOpcode() == OP_BRA) {
+        const BranchInstruction &insn = cast<BranchInstruction>(*it);
+        if (insn.getOpcode() != OP_BRA)
+          break;
         const LabelIndex label = insn.getLabelIndex();
         BasicBlock *target = this->blocks[label];
         GBE_ASSERT(target != NULL);
         target->predecessors.insert(&bb);
         bb.successors.insert(target);
-        if ( insn.isPredicated() == true) jumpToNext = &bb;
+        if (insn.isPredicated() == true) jumpToNext = &bb;
+        // If we are going to handle the second bra, this bra must be a predicated bra
+        GBE_ASSERT(handledInsns == 0 || insn.isPredicated() == true);
+        --it;
+        ++handledInsns;
       }
     });
   }
@@ -321,7 +330,13 @@ namespace ir {
   // Basic Block
   ///////////////////////////////////////////////////////////////////////////
 
-  BasicBlock::BasicBlock(Function &fn) : fn(fn) {
+  BasicBlock::BasicBlock(Function &fn) : needEndif(true), needIf(true), endifLabel(0),
+                                         matchingEndifLabel(0), matchingElseLabel(0),
+                                         thisElseLabel(0), belongToStructure(false),
+                                         isStructureExit(false), isLoopExit(false),
+                                         hasExtraBra(false),
+                                         matchingStructureEntry(NULL),
+                                         fn(fn) {
     this->nextBlock = this->prevBlock = NULL;
   }
 
@@ -336,6 +351,11 @@ namespace ir {
     this->push_back(&insn);
   }
 
+  void BasicBlock::insertAt(iterator pos, Instruction &insn) {
+    insn.setParent(this);
+    this->insert(pos, &insn);
+  }
+
   Instruction *BasicBlock::getFirstInstruction(void) const {
     GBE_ASSERT(this->begin() != this->end());
     const Instruction &insn = *this->begin();
diff --git a/backend/src/ir/function.hpp b/backend/src/ir/function.hpp
index 9aa1e8d..0381095 100644
--- a/backend/src/ir/function.hpp
+++ b/backend/src/ir/function.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -40,7 +40,6 @@
 
 namespace gbe {
 namespace ir {
-
   /*! Commonly used in the CFG */
   typedef set<BasicBlock*> BlockSet;
   class Unit; // Function belongs to a unit
@@ -59,6 +58,7 @@ namespace ir {
     ~BasicBlock(void);
     /*! Append a new instruction at the end of the stream */
     void append(Instruction &insn);
+    void insertAt(iterator pos, Instruction &insn);
     /*! Get the parent function */
     Function &getParent(void) { return fn; }
     const Function &getParent(void) const { return fn; }
@@ -84,6 +84,71 @@ namespace ir {
     }
     set <Register> undefPhiRegs;
     set <Register> definedPhiRegs;
+  /* the public members below are used by structure transforming */
+  public:
+    /* if needEndif is true, it means that this bb is the exit of an
+     * outermost structure, so this block needs another endif to match
+     * the if inserted at the entry of this structure, otherwise this
+     * block is in the middle of a structure, there's no need to insert
+     * extra endif. */
+    bool needEndif;
+    /* if needIf is true, it means that this bb is the entry of an
+     * outermost structure, so this block needs an if instruction just
+     * like other unstructured bbs. otherwise this block is in the
+     * middle of a structure, there's no need to insert an if. */
+    bool needIf;
+    /* since we need to insert an if and endif at the entry and exit
+     * bb of an outermost structure respectively, so the endif is not
+     * in the same bb with if, in order to get the endif's position,
+     * we need to store the endif label in the entry bb. */
+    LabelIndex endifLabel;
+    /* the identified if-then and if-else structure contains more than
+     * one bbs, in order to insert if, else and endif properly, we give
+     * all the IF ELSE and ENDIF a label for convenience. matchingEndifLabel
+     * is used when inserts instruction if and else, and matchingElseLabel
+     * is used when inserts instruction if. */
+    LabelIndex matchingEndifLabel;
+    LabelIndex matchingElseLabel;
+    /* IR ELSE's target is the matching ENDIF's LabelIndex, thisElseLabel
+     * is used to store the virtual label of the instruction just below
+     * ELSE. */
+    LabelIndex thisElseLabel;
+    /* belongToStructure marks whether this bb belongs to an
+     * identified structure. */
+    bool belongToStructure;
+    /* isStructureExit and matchingStructureEntry is used for buildJIPs at
+     * backend, isStructureExit is true means the bb is an identified structure's
+     * exit bb, while matchingStructureEntry means the entry bb of the same
+     * identified structure. so if isStructureExit is false then matchingStructureEntry
+     * is meaningless. */
+    bool isStructureExit;
+    /* This block is an exit point of a loop block. It may not be exit point of
+       the large structure block. */
+    bool isLoopExit;
+    /* This block has an extra branch in the end of the block. */
+    bool hasExtraBra;
+    BasicBlock *matchingStructureEntry;
+    /* variable liveout is for if-else structure liveness analysis. e.g. we have a sequence of
+     * bbs of 0, 1, 2, 3, 4 and the CFG is as below:
+     *  0
+     *  |\
+     *  1 \
+     *  |  2
+     *  4  |
+     *   \ /
+     *    3
+     * we would identify 1 and 4 as a sequence structure and 0 1 4 2 as an if-else structure.
+     * since we will insert an else instruction at the top of bb 2, we have to add an
+     * unconditional jump at the bottom of bb 4 to bb 2 for executing the inserted else. this
+     * would cause a change of CFG. originally, bb 2 always executes before bb 4, but after
+     * this insertion, bb 2 may execute after bb 4, so bb 2's livein (i.e. part of
+     * bb 0's liveout) may be destroyed by bb 4. so we insert the livein of the entry of the
+     * else node into all the basic blocks belonging to the 'then' part, while the liveout is
+     * calculated in structural_analysis.cpp:calculateNecessaryLiveout(); */
+    std::set<Register> liveout;
+    /* selfLoop's label.
+     * */
+    LabelIndex whileLabel;
   private:
     friend class Function; //!< Owns the basic blocks
     BlockSet predecessors; //!< Incoming blocks
@@ -278,13 +343,13 @@ namespace ir {
     /*! Says if this is the top basic block (entry point) */
     bool isEntryBlock(const BasicBlock &bb) const;
     /*! Get function the entry point block */
-    const BasicBlock &getTopBlock(void) const;
+    BasicBlock &getTopBlock(void) const;
     /*! Get the last block */
     const BasicBlock &getBottomBlock(void) const;
     /*! Get the last block */
     BasicBlock &getBottomBlock(void);
     /*! Get block from its label */
-    const BasicBlock &getBlock(LabelIndex label) const;
+    BasicBlock &getBlock(LabelIndex label) const;
     /*! Get the label instruction from its label index */
     const LabelInstruction *getLabelInstruction(LabelIndex index) const;
     /*! Return the number of instructions of the largest basic block */
@@ -353,12 +418,13 @@ namespace ir {
     /*! Get function attributes string. */
     const std::string& getFunctionAttributes(void) const {return this->functionAttributes;}
     /*! Get stack size. */
-    INLINE const uint32_t getStackSize(void) const { return this->stackSize; }
+    INLINE uint32_t getStackSize(void) const { return this->stackSize; }
     /*! Push stack size. */
     INLINE void pushStackSize(uint32_t step) { this->stackSize += step; }
     /*! add the loop info for later liveness analysis */
     void addLoop(const vector<LabelIndex> &bbs, const vector<std::pair<LabelIndex, LabelIndex>> &exits);
     INLINE const vector<Loop * > &getLoops() { return loops; }
+    vector<BasicBlock *> &getBlocks() { return blocks; } //!< Non-const reference to the blocks member
     /*! Get surface starting address register from bti */
     Register getSurfaceBaseReg(uint8_t bti) const;
     void appendSurface(uint8_t bti, Register reg);
diff --git a/backend/src/ir/image.cpp b/backend/src/ir/image.cpp
index a9b1563..d28a72a 100644
--- a/backend/src/ir/image.cpp
+++ b/backend/src/ir/image.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -76,8 +76,7 @@ namespace ir {
       imageInfo->channelOrderSlot = -1;
     }
   }
-
-  const int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
+  int32_t ImageSet::getInfoOffset(ImageInfoKey key) const
   {
     auto it = indexMap.find(key.index);
     if (it == indexMap.end())
@@ -86,7 +85,7 @@ namespace ir {
     return getInfoOffset4Type(imageInfo, key.type);
   }
 
-  const uint32_t ImageSet::getIdx(const Register imageReg) const
+  uint32_t ImageSet::getIdx(const Register imageReg) const
   {
     auto it = regMap.find(imageReg);
     GBE_ASSERT(it != regMap.end());
diff --git a/backend/src/ir/image.hpp b/backend/src/ir/image.hpp
index b31c7da..a93a4b6 100644
--- a/backend/src/ir/image.hpp
+++ b/backend/src/ir/image.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -52,11 +52,11 @@ namespace ir {
     /*! clear image info. */
     void clearInfo();
     /*! Get the image's index(actual location). */
-    const uint32_t getIdx(const Register imageReg) const;
+    uint32_t getIdx(const Register imageReg) const;
     size_t getDataSize(void) { return regMap.size(); }
     size_t getDataSize(void) const { return regMap.size(); }
 
-    const int32_t getInfoOffset(ImageInfoKey key) const;
+    int32_t getInfoOffset(ImageInfoKey key) const;
     void getData(struct ImageInfo *imageInfos) const;
     void operator = (const ImageSet& other) {
       regMap.insert(other.regMap.begin(), other.regMap.end());
diff --git a/backend/src/ir/immediate.cpp b/backend/src/ir/immediate.cpp
index 3a6b9a2..7d26925 100644
--- a/backend/src/ir/immediate.cpp
+++ b/backend/src/ir/immediate.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -23,15 +23,15 @@ using namespace ir;
 #define SCALAR_SAME_TYPE_ASSERT()                           \
       GBE_ASSERT(this->getType() == right.getType()       && \
                  this->getElemNum() == right.getElemNum() && \
-                 this->getElemNum() == 1                  && \
-                 this->getType() != TYPE_BOOL);
+                 this->getElemNum() == 1)
 
 #define DECLAR_BINARY_ALL_TYPE_OP(OP) \
     Immediate Immediate::operator OP (const Immediate &right) const { \
-      SCALAR_SAME_TYPE_ASSERT(); \
+      /*SCALAR_SAME_TYPE_ASSERT();*/ \
       switch (this->getType()) { \
         default: \
           GBE_ASSERT(0); \
+        case TYPE_BOOL:     return Immediate(*this->data.b OP *right.data.b);   \
         case TYPE_S8:     return Immediate(*this->data.s8 OP *right.data.s8);   \
         case TYPE_U8:     return Immediate(*this->data.u8 OP *right.data.u8);   \
         case TYPE_S16:    return Immediate(*this->data.s16 OP *right.data.s16); \
@@ -50,15 +50,24 @@ using namespace ir;
     DECLAR_BINARY_ALL_TYPE_OP(-)
     DECLAR_BINARY_ALL_TYPE_OP(*)
     DECLAR_BINARY_ALL_TYPE_OP(/)
+    DECLAR_BINARY_ALL_TYPE_OP(>)
+    //DECLAR_BINARY_ALL_TYPE_OP(<)
+    DECLAR_BINARY_ALL_TYPE_OP(==)
+    DECLAR_BINARY_ALL_TYPE_OP(!=)
+    DECLAR_BINARY_ALL_TYPE_OP(>=)
+    DECLAR_BINARY_ALL_TYPE_OP(<=)
+    DECLAR_BINARY_ALL_TYPE_OP(&&)
 
 #undef DECLAR_BINARY_ALL_TYPE_OP
 
+
 #define DECLAR_BINARY_INT_TYPE_OP(OP) \
     Immediate Immediate::operator OP (const Immediate &right) const { \
-      SCALAR_SAME_TYPE_ASSERT(); \
+      /*SCALAR_SAME_TYPE_ASSERT();*/ \
       switch (this->getType()) { \
         default: \
           GBE_ASSERT(0); \
+        case TYPE_BOOL:   return Immediate(*this->data.b OP *right.data.b);   \
         case TYPE_S8:     return Immediate(*this->data.s8 OP *right.data.s8);   \
         case TYPE_U8:     return Immediate(*this->data.u8 OP *right.data.u8);   \
         case TYPE_S16:    return Immediate(*this->data.s16 OP *right.data.s16); \
@@ -122,6 +131,25 @@ using namespace ir;
         }
     }
 
+    // Scalar '<' of two immediates, dispatched on the type of `left`; `right` is
+    // read through the same union member. Floating-point operands are accepted.
+    Immediate Immediate::less (const Immediate &left, const Immediate &right) {
+      switch (left.getType()) {
+        default: GBE_ASSERT(0); // any type without a case below is rejected here
+        case TYPE_S8:     return Immediate(*left.data.s8 < *right.data.s8);
+        case TYPE_U8:     return Immediate(*left.data.u8 < *right.data.u8);
+        case TYPE_S16:    return Immediate(*left.data.s16 < *right.data.s16);
+        case TYPE_U16:    return Immediate(*left.data.u16 < *right.data.u16);
+        case TYPE_S32:    return Immediate(*left.data.s32 < *right.data.s32);
+        case TYPE_U32:    return Immediate(*left.data.u32 < *right.data.u32);
+        case TYPE_S64:    return Immediate(*left.data.s64 < *right.data.s64);
+        case TYPE_U64:    return Immediate(*left.data.u64 < *right.data.u64);
+        case TYPE_FLOAT:  return Immediate(*left.data.f32 < *right.data.f32);
+        case TYPE_DOUBLE: return Immediate(*left.data.f64 < *right.data.f64);
+      }
+    }
+
+
     Immediate::Immediate(ImmOpCode op, const Immediate &left, const Immediate &right, Type dstType) {
       switch (op) {
         default:
@@ -181,9 +209,16 @@ using namespace ir;
           }
           break;
         }
+        case IMM_OEQ: *this = left == right; break;
+        case IMM_ONE: *this = left != right; break;
+        case IMM_OLE: *this = left <= right; break;
+        case IMM_OGE: *this = left >= right; break;
+        case IMM_OLT: *this = less(left, right); break;
+        case IMM_OGT: *this = left > right; break;
+        case IMM_ORD: *this = (left == left) && (right == right); break;
       }
       // If the dst type is large int, we will not change the imm type to large int.
-      GBE_ASSERT(type == (ImmType)dstType || dstType == TYPE_LARGE_INT);
+      GBE_ASSERT(type == (ImmType)dstType || dstType == TYPE_LARGE_INT || dstType == TYPE_BOOL);
     }
 
     Immediate::Immediate(const vector<const Immediate*> immVec) {
diff --git a/backend/src/ir/immediate.hpp b/backend/src/ir/immediate.hpp
index 6a5c819..1f18a4c 100644
--- a/backend/src/ir/immediate.hpp
+++ b/backend/src/ir/immediate.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -45,7 +45,18 @@ namespace ir {
     IMM_LSHR,
     IMM_AND,
     IMM_OR,
-    IMM_XOR
+    IMM_XOR,
+    IMM_OEQ,
+    IMM_ONE,
+    IMM_OLE,
+    IMM_OGE,
+    IMM_OLT,
+    IMM_OGT,
+    IMM_ORD,
+    IMM_FPTOUI,
+    IMM_FPTOSI,
+    IMM_SITOFP,
+    IMM_UITOFP
   } ImmOpCode;
 
   typedef enum {
@@ -190,11 +201,20 @@ namespace ir {
     }
 
     Immediate(ImmOpCode op, const Immediate &other, Type dstType) {
-      if (op == IMM_TRUNC) {
-        copy(other, 0, 1);
-      } else if (op == IMM_BITCAST) {
-        *this = other;
-        type = (ImmType)dstType;
+      switch (op) { // one case per supported unary operation
+        default: // unsupported opcode; if GBE_ASSERT returns, control falls through into IMM_TRUNC
+          GBE_ASSERT(0);
+        case IMM_TRUNC: // copy(other, offset = 0, num = 1)
+          copy(other, 0, 1);
+          break;
+        case IMM_BITCAST: // assign other, then override the type tag with dstType
+          *this = other;
+          type = (ImmType)dstType;
+          break;
+        case IMM_FPTOUI: *this = Immediate((uint32_t)*other.data.f32); break; // f32 -> u32, dstType not consulted; NOTE(review): cast is undefined for negative/out-of-range floats
+        case IMM_FPTOSI: *this = Immediate((int32_t)*other.data.f32); break; // f32 -> s32, dstType not consulted
+        case IMM_UITOFP: *this = Immediate((float)*other.data.u32); break; // u32 -> f32, dstType not consulted
+        case IMM_SITOFP: *this = Immediate((float)*other.data.s32); break; // s32 -> f32, dstType not consulted
       }
     }
 
@@ -208,6 +228,9 @@ namespace ir {
     }
 
   private:
+    ImmType type;  //!< Type of the value
+    uint32_t elemNum; //!< vector imm data type
+    uint64_t defaultData;
     union {
       bool *b;
       int8_t *s8;
@@ -223,22 +246,25 @@ namespace ir {
       const Immediate *immVec[];
       void *p;
     } data;     //!< Value to store
-    ImmType type;  //!< Type of the value
-    uint32_t elemNum; //!< vector imm data type
-    uint64_t defaultData;
     Immediate & operator= (const Immediate &);
-    Immediate operator+ (const Immediate &) const; 
-    Immediate operator- (const Immediate &) const; 
-    Immediate operator* (const Immediate &) const; 
-    Immediate operator/ (const Immediate &) const; 
-    Immediate operator% (const Immediate &) const; 
-    Immediate operator& (const Immediate &) const; 
-    Immediate operator| (const Immediate &) const; 
-    Immediate operator^ (const Immediate &) const; 
-    Immediate operator<< (const Immediate &) const; 
-    Immediate operator>> (const Immediate &) const; 
+    Immediate operator+ (const Immediate &) const;
+    Immediate operator- (const Immediate &) const;
+    Immediate operator* (const Immediate &) const;
+    Immediate operator/ (const Immediate &) const;
+    Immediate operator> (const Immediate &) const;
+    Immediate operator== (const Immediate &) const;
+    Immediate operator!= (const Immediate &) const;
+    Immediate operator>= (const Immediate &) const;
+    Immediate operator<= (const Immediate &) const;
+    Immediate operator&& (const Immediate &) const;
+    Immediate operator% (const Immediate &) const;
+    Immediate operator& (const Immediate &) const;
+    Immediate operator| (const Immediate &) const;
+    Immediate operator^ (const Immediate &) const;
+    Immediate operator<< (const Immediate &) const;
+    Immediate operator>> (const Immediate &) const;
     static Immediate lshr (const Immediate &left, const Immediate &right);
-
+    static Immediate less (const Immediate &left, const Immediate &right);
 
     void copy(const Immediate &other, int32_t offset, uint32_t num);
     GBE_CLASS(Immediate);
diff --git a/backend/src/ir/instruction.cpp b/backend/src/ir/instruction.cpp
index 5fc1535..2bd0061 100644
--- a/backend/src/ir/instruction.cpp
+++ b/backend/src/ir/instruction.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -261,7 +261,7 @@ namespace ir {
         this->src = src;
         this->dstFamily = getFamily(dstType);
         this->srcFamily = getFamily(srcType);
-        GBE_ASSERT(srcNum <= 16 && dstNum <= 16);
+        GBE_ASSERT(srcNum <= Instruction::MAX_SRC_NUM && dstNum <= Instruction::MAX_DST_NUM);
         this->dstNum = dstNum;
         this->srcNum = srcNum;
       }
@@ -348,24 +348,30 @@ namespace ir {
       public NDstPolicy<BranchInstruction, 0>
     {
     public:
-      INLINE BranchInstruction(Opcode op, LabelIndex labelIndex, Register predicate) {
-        GBE_ASSERT(op == OP_BRA);
+      /*! Predicated branch (BRA, IF or WHILE); inv_pred is stored as the inverse-predicate flag */
+      INLINE BranchInstruction(Opcode op, LabelIndex labelIndex, Register predicate, bool inv_pred=false) {
+        GBE_ASSERT(op == OP_BRA || op == OP_IF || op == OP_WHILE);
         this->opcode = op;
         this->predicate = predicate;
         this->labelIndex = labelIndex;
         this->hasPredicate = true;
         this->hasLabel = true;
+        this->inversePredicate = inv_pred;
       }
+      /*! Unpredicated branch to a label (BRA, ELSE or ENDIF) */
       INLINE BranchInstruction(Opcode op, LabelIndex labelIndex) {
-        GBE_ASSERT(op == OP_BRA);
-        this->opcode = OP_BRA;
+        GBE_ASSERT(op == OP_BRA || op == OP_ELSE || op == OP_ENDIF);
+        this->opcode = op;
         this->labelIndex = labelIndex;
         this->hasPredicate = false;
         this->hasLabel = true;
+        this->inversePredicate = false; // getInversePredicated() must not read an unset flag
       }
+      /*! Branch with neither label nor predicate (only RET is accepted) */
       INLINE BranchInstruction(Opcode op) {
         GBE_ASSERT(op == OP_RET);
-        this->opcode = OP_RET;
+        this->opcode = op;
         this->hasPredicate = false;
         this->hasLabel = false;
+        this->inversePredicate = false; // getInversePredicated() must not read an unset flag
       }
@@ -385,11 +386,13 @@ namespace ir {
         predicate = reg;
       }
       INLINE bool isPredicated(void) const { return hasPredicate; }
+      INLINE bool getInversePredicated(void) const { return inversePredicate; }
       INLINE bool wellFormed(const Function &fn, std::string &why) const;
       INLINE void out(std::ostream &out, const Function &fn) const;
       Register predicate;    //!< Predication means conditional branch
       LabelIndex labelIndex; //!< Index of the label the branch targets
       bool hasPredicate:1;   //!< Is it predicated?
+      bool inversePredicate:1;   //!< Is it inverse predicated?
       bool hasLabel:1;       //!< Is there any target label?
       Register dst[0];       //!< No destination
     };
@@ -531,11 +534,11 @@ namespace ir {
       Tuple src;
       Tuple dst;
 
-      INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE Type getSrcType(void) const { return this->srcIsFloat ? TYPE_FLOAT : TYPE_S32; }
       INLINE Type getDstType(void) const { return this->dstIsFloat ? TYPE_FLOAT : TYPE_U32; }
-      INLINE const uint8_t getSamplerIndex(void) const { return this->samplerIdx; }
-      INLINE const uint8_t getSamplerOffset(void) const { return this->samplerOffset; }
+      INLINE uint8_t getSamplerIndex(void) const { return this->samplerIdx; }
+      INLINE uint8_t getSamplerOffset(void) const { return this->samplerOffset; }
       uint8_t srcIsFloat:1;
       uint8_t dstIsFloat:1;
       uint8_t samplerIdx:4;
@@ -578,7 +581,7 @@ namespace ir {
       uint8_t coordType;
       uint8_t imageIdx;
 
-      INLINE const uint8_t getImageIndex(void) const { return this->imageIdx; }
+      INLINE uint8_t getImageIndex(void) const { return this->imageIdx; }
       INLINE Type getSrcType(void) const { return (Type)this->srcType; }
       INLINE Type getCoordType(void) const { return (Type)this->coordType; }
       // bti, u, v, w, 4 data elements
@@ -614,7 +617,7 @@ namespace ir {
             << " info reg %" << this->getSrc(fn, 0);
       }
 
-      INLINE const uint8_t getImageIndex(void) const { return imageIdx; }
+      INLINE uint8_t getImageIndex(void) const { return imageIdx; }
 
       uint8_t infoType;                 //!< Type of the requested information.
       uint8_t imageIdx;                //!< surface index.
@@ -665,6 +668,48 @@ namespace ir {
       Register dst[0], src[0];
     };
 
+    class ALIGNED_INSTRUCTION ReadARFInstruction : // OP_READ_ARF: no source, one destination
+      public BasePolicy,
+      public NSrcPolicy<ReadARFInstruction, 0>,
+      public NDstPolicy<ReadARFInstruction, 1>
+    {
+    public:
+      INLINE ReadARFInstruction(Type type, Register dst, ARFRegister arf) {
+        this->type = type;
+        this->dst[0] = dst;
+        this->opcode = OP_READ_ARF;
+        this->arf = arf;
+      }
+      INLINE ir::ARFRegister getARFRegister(void) const { return this->arf; }
+      INLINE Type getType(void) const { return this->type; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      Type type;       //!< Type given at construction (wellFormed accepts only U32/S32)
+      ARFRegister arf; //!< ARF register given at construction
+      Register dst[1]; //!< The single destination register
+      Register src[0]; //!< No source
+    };
+
+    class ALIGNED_INSTRUCTION RegionInstruction : // OP_REGION: one source, one destination
+      public BasePolicy,
+      public NSrcPolicy<RegionInstruction, 1>,
+      public NDstPolicy<RegionInstruction, 1>
+    {
+    public:
+      INLINE RegionInstruction(Register dst, Register src, uint32_t offset) {
+        this->offset = offset;
+        this->dst[0] = dst;
+        this->src[0] = src;
+        this->opcode = OP_REGION;
+      }
+      INLINE uint32_t getOffset(void) const { return this->offset; }
+      INLINE bool wellFormed(const Function &fn, std::string &why) const;
+      INLINE void out(std::ostream &out, const Function &fn) const;
+      uint32_t offset; //!< Offset given at construction; units are not stated here
+      Register dst[1]; //!< The single destination (checked as FAMILY_DWORD by wellFormed)
+      Register src[1]; //!< The single source (checked as FAMILY_DWORD by wellFormed)
+    };
+
     class ALIGNED_INSTRUCTION LabelInstruction :
       public BasePolicy,
       public NSrcPolicy<LabelInstruction, 0>,
@@ -989,7 +1034,7 @@ namespace ir {
       }
       const ir::Type immType = fn.getImmediate(immediateIndex).getType();
       if (UNLIKELY(type != immType)) {
-        whyNot = "Inconsistant type for the immediate value to load";
+        whyNot = "Inconsistent type for the immediate value to load";
         return false;
       }
       const RegisterFamily family = getFamily(type);
@@ -1019,6 +1064,30 @@ namespace ir {
       return true;
     }
 
+    // Accepts only U32/S32 reads whose dst[0] passes checkRegisterData for that type's family
+    INLINE bool ReadARFInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      const bool isInt32 = (this->type == TYPE_U32 || this->type == TYPE_S32);
+      if (UNLIKELY(!isInt32)) {
+        whyNot = "Only support S32/U32 type";
+        return false;
+      }
+
+      // The destination is validated against the family derived from the type
+      const RegisterFamily dstFamily = getFamily(this->type);
+      return checkRegisterData(dstFamily, dst[0], fn, whyNot);
+    }
+
+    // src[0] and dst[0] must both pass checkRegisterData for FAMILY_DWORD
+    INLINE bool RegionInstruction::wellFormed(const Function &fn, std::string &whyNot) const
+    {
+      // The source is checked first; the destination is only checked when it passes
+      const bool srcOk = checkRegisterData(FAMILY_DWORD, src[0], fn, whyNot);
+      if (UNLIKELY(!srcOk))
+        return false;
+      return checkRegisterData(FAMILY_DWORD, dst[0], fn, whyNot);
+    }
+
     // Only a label index is required
     INLINE bool LabelInstruction::wellFormed(const Function &fn, std::string &whyNot) const
     {
@@ -1135,6 +1204,16 @@ namespace ir {
         out << ": " << (int)bti.bti[i];
     }
 
+    INLINE void ReadARFInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out); // outOpcode() output comes first
+      out << " %" << this->getDst(fn, 0) << " arf:" << arf; // then " %<dst> arf:<arf>"
+    }
+
+    INLINE void RegionInstruction::out(std::ostream &out, const Function &fn) const {
+      this->outOpcode(out); // outOpcode() output comes first
+      out << " %" << this->getDst(fn, 0) << " %" << this->getSrc(fn, 0) << " offset: " << this->offset; // then " %<dst> %<src> offset: <offset>"
+    }
+
     INLINE void LabelInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
       out << " $" << labelIndex;
@@ -1142,6 +1221,8 @@ namespace ir {
 
     INLINE void BranchInstruction::out(std::ostream &out, const Function &fn) const {
       this->outOpcode(out);
+      if(opcode == OP_IF && inversePredicate)
+        out << " !";
       if (hasPredicate)
         out << "<%" << this->getSrc(fn, 0) << ">";
       if (hasLabel) out << " -> label$" << labelIndex;
@@ -1282,6 +1363,14 @@ START_INTROSPECTION(SyncInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(SyncInstruction)
 
+START_INTROSPECTION(ReadARFInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(ReadARFInstruction)
+
+START_INTROSPECTION(RegionInstruction)
+#include "ir/instruction.hxx"
+END_INTROSPECTION(RegionInstruction)
+
 START_INTROSPECTION(LabelInstruction)
 #include "ir/instruction.hxx"
 END_INTROSPECTION(LabelInstruction)
@@ -1463,18 +1552,22 @@ DECL_MEM_FN(LoadInstruction, bool, isAligned(void), isAligned())
 DECL_MEM_FN(LoadImmInstruction, Type, getType(void), getType())
 DECL_MEM_FN(LabelInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(BranchInstruction, bool, isPredicated(void), isPredicated())
+DECL_MEM_FN(BranchInstruction, bool, getInversePredicated(void), getInversePredicated())
 DECL_MEM_FN(BranchInstruction, LabelIndex, getLabelIndex(void), getLabelIndex())
 DECL_MEM_FN(SyncInstruction, uint32_t, getParameters(void), getParameters())
+DECL_MEM_FN(ReadARFInstruction, Type, getType(void), getType())
+DECL_MEM_FN(ReadARFInstruction, ARFRegister, getARFRegister(void), getARFRegister())
+DECL_MEM_FN(RegionInstruction, uint32_t, getOffset(void), getOffset())
 DECL_MEM_FN(SampleInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(SampleInstruction, Type, getDstType(void), getDstType())
-DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerIndex(void), getSamplerIndex())
-DECL_MEM_FN(SampleInstruction, const uint8_t, getSamplerOffset(void), getSamplerOffset())
-DECL_MEM_FN(SampleInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerIndex(void), getSamplerIndex())
+DECL_MEM_FN(SampleInstruction, uint8_t, getSamplerOffset(void), getSamplerOffset())
+DECL_MEM_FN(SampleInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(TypedWriteInstruction, Type, getSrcType(void), getSrcType())
 DECL_MEM_FN(TypedWriteInstruction, Type, getCoordType(void), getCoordType())
-DECL_MEM_FN(TypedWriteInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(TypedWriteInstruction, uint8_t, getImageIndex(void), getImageIndex())
 DECL_MEM_FN(GetImageInfoInstruction, uint32_t, getInfoType(void), getInfoType())
-DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImageIndex())
+DECL_MEM_FN(GetImageInfoInstruction, uint8_t, getImageIndex(void), getImageIndex())
 
 #undef DECL_MEM_FN
 
@@ -1501,6 +1594,7 @@ DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImag
   DECL_EMIT_FUNCTION(MOV)
   DECL_EMIT_FUNCTION(FBH)
   DECL_EMIT_FUNCTION(FBL)
+  DECL_EMIT_FUNCTION(CBIT)
   DECL_EMIT_FUNCTION(COS)
   DECL_EMIT_FUNCTION(SIN)
   DECL_EMIT_FUNCTION(LOG)
@@ -1614,6 +1708,25 @@ DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImag
     return internal::BranchInstruction(OP_BRA, labelIndex, pred).convert();
   }
 
+  // IF: branch predicated by pred, targeting labelIndex; inv_pred sets the inverse-predicate flag
+  Instruction IF(LabelIndex labelIndex, Register pred, bool inv_pred) {
+    return internal::BranchInstruction(OP_IF, labelIndex, pred, inv_pred).convert();
+  }
+
+  // ELSE: unpredicated branch instruction carrying labelIndex as its target label
+  Instruction ELSE(LabelIndex labelIndex) {
+    return internal::BranchInstruction(OP_ELSE, labelIndex).convert();
+  }
+  // ENDIF: unpredicated branch instruction carrying labelIndex as its target label
+  Instruction ENDIF(LabelIndex labelIndex) {
+    return internal::BranchInstruction(OP_ENDIF, labelIndex).convert();
+  }
+
+  // WHILE: branch predicated by pred, targeting labelIndex; the inverse-predicate flag stays false
+  Instruction WHILE(LabelIndex labelIndex, Register pred) {
+    return internal::BranchInstruction(OP_WHILE, labelIndex, pred).convert();
+  }
+
   // RET
   Instruction RET(void) {
     return internal::BranchInstruction(OP_RET).convert();
@@ -1647,6 +1760,13 @@ DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImag
     return internal::SyncInstruction(parameters).convert();
   }
 
+  Instruction READ_ARF(Type type, Register dst, ARFRegister arf) { // builds an OP_READ_ARF instruction
+    return internal::ReadARFInstruction(type, dst, arf).convert();
+  }
+  Instruction REGION(Register dst, Register src, uint32_t offset) { // builds an OP_REGION instruction
+    return internal::RegionInstruction(dst, src, offset).convert();
+  }
+
   // LABEL
   Instruction LABEL(LabelIndex labelIndex) {
     return internal::LabelInstruction(labelIndex).convert();
@@ -1667,10 +1787,17 @@ DECL_MEM_FN(GetImageInfoInstruction, const uint8_t, getImageIndex(void), getImag
 
   std::ostream &operator<< (std::ostream &out, const Instruction &insn) {
     const Function &fn = insn.getFunction();
+    const BasicBlock *bb = insn.getParent();
     switch (insn.getOpcode()) {
 #define DECL_INSN(OPCODE, CLASS) \
       case OP_##OPCODE: \
-        reinterpret_cast<const internal::CLASS&>(insn).out(out, fn); \
+          if(OP_##OPCODE == OP_ELSE) \
+          { \
+            reinterpret_cast<const internal::CLASS&>(insn).out(out, fn); \
+            out << "  <**>label: " << bb->thisElseLabel; \
+            break; \
+          } \
+          reinterpret_cast<const internal::CLASS&>(insn).out(out, fn); \
         break;
 #include "instruction.hxx"
 #undef DECL_INSN
diff --git a/backend/src/ir/instruction.hpp b/backend/src/ir/instruction.hpp
index a75a441..11e9509 100644
--- a/backend/src/ir/instruction.hpp
+++ b/backend/src/ir/instruction.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -137,6 +137,7 @@ namespace ir {
       InstructionBase(reinterpret_cast<const char*>(&other.opcode)) {
       parent = other.parent;
     }
+
   private:
     /*! To be consistant with copy constructor */
     INLINE Instruction &operator= (const Instruction &other) { return *this; }
@@ -188,8 +189,8 @@ namespace ir {
       return T::isClassOf(*this);
     }
     /*! max_src for store instruction (vec16 + addr) */
-    static const uint32_t MAX_SRC_NUM = 17;
-    static const uint32_t MAX_DST_NUM = 16;
+    static const uint32_t MAX_SRC_NUM = 32;
+    static const uint32_t MAX_DST_NUM = 32;
   protected:
     BasicBlock *parent;      //!< The basic block containing the instruction
     GBE_CLASS(Instruction);  //!< Use internal allocators
@@ -364,7 +365,7 @@ namespace ir {
   public:
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
-    const uint8_t getImageIndex() const;
+    uint8_t getImageIndex() const;
     Type getSrcType(void) const;
     Type getCoordType(void) const;
   };
@@ -372,9 +373,9 @@ namespace ir {
   /*! Load texels from a texture */
   class SampleInstruction : public Instruction {
   public:
-    const uint8_t getImageIndex() const;
-    const uint8_t getSamplerIndex(void) const;
-    const uint8_t getSamplerOffset(void) const;
+    uint8_t getImageIndex() const;
+    uint8_t getSamplerIndex(void) const;
+    uint8_t getSamplerOffset(void) const;
     Type getSrcType(void) const;
     Type getDstType(void) const;
     /*! Return true if the given instruction is an instance of this class */
@@ -416,7 +417,7 @@ namespace ir {
      return 0;
    }
 
-    const uint8_t getImageIndex() const;
+    uint8_t getImageIndex() const;
     uint32_t getInfoType() const;
     /*! Return true if the given instruction is an instance of this class */
     static bool isClassOf(const Instruction &insn);
@@ -429,6 +430,8 @@ namespace ir {
   public:
     /*! Indicate if the branch is predicated */
     bool isPredicated(void) const;
+    /*! Indicate if the branch is inverse predicated */
+    bool getInversePredicated(void) const;
     /*! Return the predicate register (if predicated) */
     RegisterData getPredicate(void) const {
       GBE_ASSERTM(this->isPredicated() == true, "Branch is not predicated");
@@ -493,6 +496,23 @@ namespace ir {
     static bool isClassOf(const Instruction &insn);
   };
 
+  /*! Read one register (8 DWORD) in arf */
+  class ReadARFInstruction : public Instruction {
+  public:
+    Type getType() const;
+    ir::ARFRegister getARFRegister() const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
+  /*! return a region of a register, make sure the offset does not exceed the register size */
+  class RegionInstruction : public Instruction {
+  public:
+    uint32_t getOffset(void) const;
+    /*! Return true if the given instruction is an instance of this class */
+    static bool isClassOf(const Instruction &insn);
+  };
+
   /*! Specialize the instruction. Also performs typechecking first based on the
    *  opcode. Crashes if it fails
    */
@@ -565,6 +585,8 @@ namespace ir {
   Instruction FBH(Type type, Register dst, Register src);
   /*! fbl.type dst src */
   Instruction FBL(Type type, Register dst, Register src);
+  /*! cbit.type dst src */
+  Instruction CBIT(Type type, Register dst, Register src);
   /*! hadd.type dst src */
   Instruction HADD(Type type, Register dst, Register src0, Register src1);
   /*! rhadd.type dst src */
@@ -661,6 +683,14 @@ namespace ir {
   Instruction BRA(LabelIndex labelIndex);
   /*! (pred) bra labelIndex */
   Instruction BRA(LabelIndex labelIndex, Register pred);
+  /*! (pred) if labelIndex */
+  Instruction IF(LabelIndex labelIndex, Register pred, bool inv_pred=true);
+  /*! else labelIndex */
+  Instruction ELSE(LabelIndex labelIndex);
+  /*! endif */
+  Instruction ENDIF(LabelIndex labelIndex);
+  /*! (pred) while labelIndex */
+  Instruction WHILE(LabelIndex labelIndex, Register pred);
   /*! ret */
   Instruction RET(void);
   /*! load.type.space {dst1,...,dst_valueNum} offset value */
@@ -671,6 +701,9 @@ namespace ir {
   Instruction LOADI(Type type, Register dst, ImmediateIndex value);
   /*! sync.params... (see Sync instruction) */
   Instruction SYNC(uint32_t parameters);
+
+  Instruction READ_ARF(Type type, Register dst, ARFRegister arf);
+  Instruction REGION(Register dst, Register src, uint32_t offset);
   /*! typed write */
   Instruction TYPED_WRITE(uint8_t imageIndex, Tuple src, Type srcType, Type coordType);
   /*! sample textures */
diff --git a/backend/src/ir/instruction.hxx b/backend/src/ir/instruction.hxx
index 587517b..9a89069 100644
--- a/backend/src/ir/instruction.hxx
+++ b/backend/src/ir/instruction.hxx
@@ -79,11 +79,14 @@ DECL_INSN(TYPED_WRITE, TypedWriteInstruction)
 DECL_INSN(SAMPLE, SampleInstruction)
 DECL_INSN(SYNC, SyncInstruction)
 DECL_INSN(LABEL, LabelInstruction)
+DECL_INSN(READ_ARF, ReadARFInstruction)
+DECL_INSN(REGION, RegionInstruction)
 DECL_INSN(GET_IMAGE_INFO, GetImageInfoInstruction)
 DECL_INSN(MUL_HI, BinaryInstruction)
 DECL_INSN(I64_MUL_HI, BinaryInstruction)
 DECL_INSN(FBH, UnaryInstruction)
 DECL_INSN(FBL, UnaryInstruction)
+DECL_INSN(CBIT, UnaryInstruction)
 DECL_INSN(HADD, BinaryInstruction)
 DECL_INSN(RHADD, BinaryInstruction)
 DECL_INSN(I64HADD, BinaryInstruction)
@@ -93,3 +96,7 @@ DECL_INSN(UPSAMPLE_INT, BinaryInstruction)
 DECL_INSN(UPSAMPLE_LONG, BinaryInstruction)
 DECL_INSN(I64MADSAT, TernaryInstruction)
 DECL_INSN(MAD, TernaryInstruction)
+DECL_INSN(IF, BranchInstruction)
+DECL_INSN(ENDIF, BranchInstruction)
+DECL_INSN(ELSE, BranchInstruction)
+DECL_INSN(WHILE, BranchInstruction)
diff --git a/backend/src/ir/liveness.cpp b/backend/src/ir/liveness.cpp
index afed476..2b1ffdb 100644
--- a/backend/src/ir/liveness.cpp
+++ b/backend/src/ir/liveness.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -41,7 +41,11 @@ namespace ir {
       }
     });
     // Now with iterative analysis, we compute liveout and livein sets
-    this->computeLiveInOut();
+    while (unvisitBlocks.size()) {
+      if (workSet.size() == 0)
+        workSet.insert(--unvisitBlocks.end(), unvisitBlocks.end());
+      this->computeLiveInOut();
+    }
     // extend register (def in loop, use out-of-loop) liveness to the whole loop
     set<Register> extentRegs;
     this->computeExtraLiveInOut(extentRegs);
@@ -79,6 +83,7 @@ namespace ir {
               opCode != ir::OP_MUL_HI &&
               opCode != ir::OP_HADD &&
               opCode != ir::OP_RHADD &&
+              opCode != ir::OP_READ_ARF &&
               opCode != ir::OP_ADDSAT &&
               (dstNum == 1 || insn.getOpcode() != ir::OP_LOAD) &&
               !extentRegs->contains(reg)
@@ -97,6 +102,9 @@ namespace ir {
       this->initInstruction(*info, insn);
     });
     liveness[&bb] = info;
+    unvisitBlocks.insert(info);
+    if(!bb.liveout.empty())
+      info->liveOut.insert(bb.liveout.begin(), bb.liveout.end());
   }
 
   void Liveness::initInstruction(BlockInfo &info, const Instruction &insn) {
@@ -121,12 +129,16 @@ namespace ir {
     while(!workSet.empty()) {
       auto currInfo = *workSet.begin();
       workSet.erase(currInfo);
+      if (unvisitBlocks.find(currInfo) != unvisitBlocks.end())
+        unvisitBlocks.erase(currInfo);
       for (auto currOutVar : currInfo->liveOut)
         if (!currInfo->varKill.contains(currOutVar))
           currInfo->upwardUsed.insert(currOutVar);
       bool isChanged = false;
       for (auto prev : currInfo->bb.getPredecessorSet()) {
         BlockInfo *prevInfo = liveness[prev];
+        if (unvisitBlocks.find(currInfo) != unvisitBlocks.end())
+          unvisitBlocks.erase(currInfo);
         for (auto currInVar : currInfo->upwardUsed) {
           if (!prevInfo->bb.undefPhiRegs.contains(currInVar)) {
             auto changed = prevInfo->liveOut.insert(currInVar);
diff --git a/backend/src/ir/liveness.hpp b/backend/src/ir/liveness.hpp
index d55e00d..1bc66fe 100644
--- a/backend/src/ir/liveness.hpp
+++ b/backend/src/ir/liveness.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -132,6 +132,7 @@ namespace ir {
     /*! Set of work list block which has exit(return) instruction */
     typedef set <struct BlockInfo*> WorkSet;
     WorkSet workSet;
+    WorkSet unvisitBlocks;
 
     /*! Use custom allocators */
     GBE_CLASS(Liveness);
diff --git a/backend/src/ir/lowering.cpp b/backend/src/ir/lowering.cpp
index f71fd72..73b1dd2 100644
--- a/backend/src/ir/lowering.cpp
+++ b/backend/src/ir/lowering.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/lowering.hpp b/backend/src/ir/lowering.hpp
index ba0c87b..ecc6b0f 100644
--- a/backend/src/ir/lowering.hpp
+++ b/backend/src/ir/lowering.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/printf.cpp b/backend/src/ir/printf.cpp
index 9d60402..7b670f7 100644
--- a/backend/src/ir/printf.cpp
+++ b/backend/src/ir/printf.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -106,8 +106,9 @@ namespace gbe
       if (!vec_i)                                                       \
         pf_str = pf_str + std::string(#conv);                           \
       printf(pf_str.c_str(),                                            \
-             ((target_ty *)((char *)buf_addr + slot.state->out_buf_sizeof_offset * \
-                            global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
+             ((target_ty *)((char *)buf_addr + sizeOfSize * global_wk_sz0 * global_wk_sz1 * global_wk_sz2 * n \
+                                              + slot.state->out_buf_sizeof_offset * \
+                                                         global_wk_sz0 * global_wk_sz1 * global_wk_sz2)) \
              [(k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i) * vec_num + vec_i]);\
     } while (0)
 
@@ -124,10 +125,9 @@ namespace gbe
         for (i = 0; i < global_wk_sz0; i++) {
           for (j = 0; j < global_wk_sz1; j++) {
             for (k = 0; k < global_wk_sz2; k++) {
-
-              int flag = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
-                                             + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
-              if (flag) {
+              int loop_num = ((int *)index_addr)[stmt*global_wk_sz0*global_wk_sz1*global_wk_sz2
+                                                 + k*global_wk_sz0*global_wk_sz1 + j*global_wk_sz0 + i];
+              for (int n = 0; n < loop_num; n++) {
                 for (auto &slot : pf) {
                   pf_str = "";
                   int vec_num;
@@ -149,20 +149,35 @@ namespace gbe
                     switch (slot.state->conversion_specifier) {
                       case PRINTF_CONVERSION_D:
                       case PRINTF_CONVERSION_I:
-                        PRINT_SOMETHING(int, d);
+                        if (slot.state->length_modifier == PRINTF_LM_L)
+                          PRINT_SOMETHING(uint64_t, d);
+                        else
+                          PRINT_SOMETHING(int, d);
                         break;
 
                       case PRINTF_CONVERSION_O:
-                        PRINT_SOMETHING(int, o);
+                        if (slot.state->length_modifier == PRINTF_LM_L)
+                          PRINT_SOMETHING(uint64_t, o);
+                        else
+                          PRINT_SOMETHING(int, o);
                         break;
                       case PRINTF_CONVERSION_U:
-                        PRINT_SOMETHING(int, u);
+                        if (slot.state->length_modifier == PRINTF_LM_L)
+                          PRINT_SOMETHING(uint64_t, u);
+                        else
+                          PRINT_SOMETHING(int, u);
                         break;
                       case PRINTF_CONVERSION_X:
-                        PRINT_SOMETHING(int, X);
+                        if (slot.state->length_modifier == PRINTF_LM_L)
+                          PRINT_SOMETHING(uint64_t, X);
+                        else
+                          PRINT_SOMETHING(int, X);
                         break;
                       case PRINTF_CONVERSION_x:
-                        PRINT_SOMETHING(int, x);
+                        if (slot.state->length_modifier == PRINTF_LM_L)
+                          PRINT_SOMETHING(uint64_t, x);
+                        else
+                          PRINT_SOMETHING(int, x);
                         break;
 
                       case PRINTF_CONVERSION_C:
@@ -210,6 +225,7 @@ namespace gbe
 
                   pf_str = "";
                 }
+
               }
             }
           }
diff --git a/backend/src/ir/printf.hpp b/backend/src/ir/printf.hpp
index 4db7245..b9f7619 100644
--- a/backend/src/ir/printf.hpp
+++ b/backend/src/ir/printf.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/profile.cpp b/backend/src/ir/profile.cpp
index fc69367..4c272bd 100644
--- a/backend/src/ir/profile.cpp
+++ b/backend/src/ir/profile.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -48,11 +48,11 @@ namespace ir {
 
 #if GBE_DEBUG
 #define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
-   r = fn.newRegister(FAMILY_DWORD, UNIFORM); \
+   r = fn.newRegister(FAMILY, UNIFORM); \
    GBE_ASSERT(r == REG);
 #else
 #define DECL_NEW_REG(FAMILY, REG, UNIFORM) \
-   fn.newRegister(FAMILY_DWORD, UNIFORM);
+   fn.newRegister(FAMILY, UNIFORM);
 #endif /* GBE_DEBUG */
     static void init(Function &fn) {
       IF_DEBUG(Register r);
@@ -75,7 +75,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, goffset1, 1);
       DECL_NEW_REG(FAMILY_DWORD, goffset2, 1);
       DECL_NEW_REG(FAMILY_DWORD, stackptr, 0);
-      DECL_NEW_REG(FAMILY_DWORD, stackbuffer, 1);
+      DECL_NEW_REG(FAMILY_QWORD, stackbuffer, 1);
       DECL_NEW_REG(FAMILY_WORD,  blockip, 0);
       DECL_NEW_REG(FAMILY_DWORD, barrierid, 1);
       DECL_NEW_REG(FAMILY_DWORD, threadn, 1);
@@ -83,7 +83,7 @@ namespace ir {
       DECL_NEW_REG(FAMILY_DWORD, zero, 1);
       DECL_NEW_REG(FAMILY_DWORD, one, 1);
       DECL_NEW_REG(FAMILY_WORD, retVal, 1);
-      DECL_NEW_REG(FAMILY_WORD, slmoffset, 1);
+      DECL_NEW_REG(FAMILY_DWORD, slmoffset, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfbptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, printfiptr, 1);
       DECL_NEW_REG(FAMILY_DWORD, invalid, 1);
diff --git a/backend/src/ir/profile.hpp b/backend/src/ir/profile.hpp
index 4e89bdd..7259d9f 100644
--- a/backend/src/ir/profile.hpp
+++ b/backend/src/ir/profile.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/register.cpp b/backend/src/ir/register.cpp
index 471bfbd..48d6875 100644
--- a/backend/src/ir/register.cpp
+++ b/backend/src/ir/register.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/register.hpp b/backend/src/ir/register.hpp
index 7bd4f6e..ce8bd60 100644
--- a/backend/src/ir/register.hpp
+++ b/backend/src/ir/register.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -63,6 +63,21 @@ namespace ir {
     return 0;
   }
 
+  enum ARFRegister {
+    ARF_NULL = 0,
+    ARF_ADDRESS,
+    ARF_ACCUMULATOR,
+    ARF_FLAG,
+    ARF_MASK,
+    ARF_MASK_STACK,
+    ARF_MASK_STACK_DEPTH,
+    ARF_STATE,
+    ARF_CONTROL,
+    ARF_NOTIFICATION_COUNT,
+    ARF_IP,
+    ARF_TM
+  };
+
   /*! A register can be either a byte, a word, a dword or a qword. We store this
    *  value into a register data (which makes the register file) 
    */
@@ -83,7 +98,7 @@ namespace ir {
     /*! Nothing really happens here */
     INLINE ~RegisterData(void) {}
     RegisterFamily family;            //!< Register size or if it is a flag
-    INLINE const bool isUniform() const { return uniform; }
+    INLINE bool isUniform() const { return uniform; }
     INLINE void setUniform(bool uni) { uniform = uni; }
   private:
     bool uniform;
diff --git a/backend/src/ir/sampler.cpp b/backend/src/ir/sampler.cpp
index 7e8355f..ba42acb 100644
--- a/backend/src/ir/sampler.cpp
+++ b/backend/src/ir/sampler.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/sampler.hpp b/backend/src/ir/sampler.hpp
index 2b51ce3..a23f871 100644
--- a/backend/src/ir/sampler.hpp
+++ b/backend/src/ir/sampler.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/structural_analysis.cpp b/backend/src/ir/structural_analysis.cpp
new file mode 100644
index 0000000..1e98629
--- /dev/null
+++ b/backend/src/ir/structural_analysis.cpp
@@ -0,0 +1,1083 @@
+/*
+ * structural_analysis.cpp
+ * This code is derived from the ControlTree.h and ControlTree.cpp of
+ * project gpuocelot by Yongjia Zhang.
+ * The original copyright of gpuocelot appears below in its entirety.
+ */
+
+/*
+ * Copyright 2011
+ * GEORGIA TECH RESEARCH CORPORATION
+ * ALL RIGHTS RESERVED
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *     * Redistributions of source code must retain the above copyright
+ * notice,   this list of conditions and the following disclaimers.
+ *     * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ *       documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
+ * names of  its contributors may be used to endorse or promote
+ * products derived  from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
+ * CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You agree that the Software will not be shipped, transferred, exported,
+ * or re-exported directly into any country prohibited by the United States
+ * Export Administration Act and the regulations thereunder nor will be
+ * used for any purpose prohibited by the Act.
+ */
+
+
+#include "structural_analysis.hpp"
+
+namespace analysis
+{
+  ControlTree::~ControlTree()
+  {
+    NodeVector::iterator iter = nodes.begin();
+    NodeVector::iterator iter_end = nodes.end();
+    while(iter != iter_end)
+    {
+      delete *iter;
+      iter++;
+    }
+  }
+  void ControlTree::handleSelfLoopNode(Node *loopnode, ir::LabelIndex& whileLabel)
+  {
+              //NodeList::iterator child_iter = (*it)->children.begin();
+    ir::BasicBlock *pbb = loopnode->getExit();
+    GBE_ASSERT(pbb->isLoopExit);
+    ir::BasicBlock::iterator it = pbb->end();
+    it--;
+    if (pbb->hasExtraBra)
+      it--;
+    ir::BranchInstruction* pinsn = static_cast<ir::BranchInstruction *>(&*it);
+    ir::Register reg = pinsn->getPredicateIndex();
+    /* Since this node is a while node, we replace the BRA instruction at the bottom of the exit BB of 'node'
+     * with a WHILE instruction.
+     */
+    whileLabel = pinsn->getLabelIndex();
+    ir::Instruction insn = ir::WHILE(whileLabel, reg);
+    ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    pbb->insertAt(it, *p_new_insn);
+    pbb->whileLabel = whileLabel;
+    pbb->erase(it);
+  }
+
+  /* Recursively set the needIf flag of every basic block that belongs to node. */
+  void ControlTree::markNeedIf(Node *node, bool status)
+  {
+    if(node->type() == BasicBlock)
+    {
+      ir::BasicBlock* bb = ((BasicBlockNode*)node)->getBasicBlock();
+      bb->needIf = status;
+      return;
+    }
+    NodeList::iterator it = node->children.begin();
+    while(it != node->children.end())
+    {
+      markNeedIf(*it,status);
+      it++;
+    }
+  }
+
+  /* Recursively set the needEndif flag of every basic block that belongs to node. */
+  void ControlTree::markNeedEndif(Node *node, bool status)
+  {
+    if(node->type() == BasicBlock)
+    {
+      ir::BasicBlock* bb = ((BasicBlockNode*)node)->getBasicBlock();
+      bb->needEndif = status;
+      return;
+    }
+
+    NodeList::iterator it = node->children.begin();
+    while(it != node->children.end())
+    {
+      markNeedEndif(*it, status);
+      it++;
+    }
+  }
+
+  /* Recursively set 'mark' to status on node and all of its descendants; basic blocks are also flagged belongToStructure. */
+  void ControlTree::markStructuredNodes(Node *node, bool status)
+  {
+    if(node->type() == BasicBlock)
+    {
+      BasicBlockNode* pbb = static_cast<BasicBlockNode *>(node);
+      pbb->getBasicBlock()->belongToStructure = true;
+    }
+    node->mark = status;
+    NodeList::iterator it = node->children.begin();
+    while(it != node->children.end())
+    {
+      markStructuredNodes(*it, status);
+      it++;
+    }
+  }
+
+  void ControlTree::handleIfNode(Node *node, ir::LabelIndex& matchingEndifLabel, ir::LabelIndex& matchingElseLabel)
+  {
+    ir::BasicBlock *pbb = node->getExit();
+    ir::BranchInstruction* pinsn = static_cast<ir::BranchInstruction *>(pbb->getLastInstruction());
+    ir::Register reg = pinsn->getPredicateIndex();
+    ir::BasicBlock::iterator it = pbb->end();
+    it--;
+    /* Since this node is an if node, we remove the BRA instruction at the bottom of the exit BB of 'node'
+     * and append an IF instruction in its place.
+     */
+    pbb->erase(it);
+    ir::Instruction insn = ir::IF(matchingElseLabel, reg, node->inversePredicate);
+    ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    pbb->append(*p_new_insn);
+    pbb->matchingEndifLabel = matchingEndifLabel;
+    pbb->matchingElseLabel = matchingElseLabel;
+  }
+
+  void ControlTree::handleThenNode(Node *node, ir::LabelIndex& endiflabel)
+  {
+    ir::BasicBlock *pbb = node->getExit();
+    ir::BasicBlock::iterator it = pbb->end();
+    it--;
+    ir::Instruction *p_last_insn = pbb->getLastInstruction();
+
+    endiflabel = fn->newLabel();
+    //pbb->thisEndifLabel = endiflabel;
+
+    ir::Instruction insn = ir::ENDIF(endiflabel);
+    ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    // we need to insert ENDIF before the BRA(if exists).
+    bool append_bra = false;
+    if((*it).getOpcode() == ir::OP_BRA)
+    {
+      pbb->erase(it);
+      append_bra = true;
+    }
+    pbb->append(*p_new_insn);
+    if(append_bra)
+      pbb->append(*p_last_insn);
+  }
+
+
+  void ControlTree::handleThenNode2(Node *node, Node *elsenode, ir::LabelIndex elseBBLabel)
+  {
+    ir::BasicBlock *pbb = node->getExit();
+    ir::BasicBlock::iterator it = pbb->end();
+    it--;
+    if((*it).getOpcode() == ir::OP_BRA)
+      pbb->erase(it);
+
+    if(node->getExit()->getNextBlock() == elsenode->getEntry())
+      return;
+
+    // Add an unconditional jump to 'else' block
+    ir::Instruction insn = ir::BRA(elseBBLabel);
+    ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+    pbb->append(*p_new_insn);
+  }
+
+
+  void ControlTree::handleElseNode(Node* node, ir::LabelIndex& elselabel, ir::LabelIndex& endiflabel)
+  {
+    // to insert ENDIF properly
+    handleThenNode(node, endiflabel);
+
+    ir::BasicBlock *pbb = node->getEntry();
+    ir::BasicBlock::iterator it = pbb->begin();
+    it++;
+
+    elselabel = fn->newLabel();
+    pbb->thisElseLabel = elselabel;
+
+    // insert ELSE properly
+    ir::Instruction insn = ir::ELSE(endiflabel);
+    ir::Instruction* p_new_insn = pbb->getParent().newInstruction(insn);
+
+    pbb->insertAt(it, *p_new_insn);
+  }
+
+
+  void ControlTree::handleStructuredNodes()
+  {
+    NodeVector::iterator it;
+    NodeVector::iterator end = nodes.end();
+    NodeVector::iterator begin = nodes.begin();
+    it = end;
+    it--;
+    NodeVector::reverse_iterator rit = nodes.rbegin();
+    /* structured bbs only need if and endif insn to handle the execution
+     * in structure entry and exit BasicBlock, so we process the nodes backward, since
+     * the node at the back of nodes is always a 'not smaller' structure than
+     * the ones before it. we mark the nodes which are sub-nodes of the node
+     * we are dealing with, in order to ensure we are always handling the 'biggest'
+     * structures */
+    while(rit != nodes.rend())
+    {
+      if((*rit)->type() == IfThen || (*rit)->type() == IfElse|| (*rit)->type() == SelfLoop)
+      {
+        if(false == (*rit)->mark && (*rit)->canBeHandled)
+        {
+          markStructuredNodes(*rit, true);
+          /* only the entry bb of this structure needs 'if' at backend and
+           * only the exit bb of this structure needs 'endif' at backend
+           * see comment about needEndif and needIf at function.hpp for detail. */
+          markNeedEndif(*rit, false);
+          markNeedIf(*rit, false);
+          ir::BasicBlock* entry = (*rit)->getEntry();
+          ir::BasicBlock* eexit = (*rit)->getExit();
+          entry->needIf = true;
+          eexit->needEndif = true;
+          entry->endifLabel = fn->newLabel();
+          eexit->endifLabel = entry->endifLabel;
+          eexit->isStructureExit = true;
+          eexit->matchingStructureEntry = entry;
+        }
+      }
+      rit++;
+    }
+
+    rit = nodes.rbegin();
+    gbe::vector<ir::BasicBlock *> &blocks = fn->getBlocks();
+    std::vector<ir::BasicBlock *> bbs;
+    bbs.resize(blocks.size());
+
+    /* here insert the bras to the BBs, which would
+     * simplify the reorder of basic blocks */
+    for(size_t i = 0; i < blocks.size(); ++i)
+    {
+      bbs[i] = blocks[i];
+      if(i != blocks.size() -1 &&
+         (bbs[i]->getLastInstruction()->getOpcode() != ir::OP_BRA ||
+         (bbs[i]->isStructureExit && bbs[i]->isLoopExit)))
+      {
+        ir::Instruction insn = ir::BRA(bbs[i]->getNextBlock()->getLabelIndex());
+        ir::Instruction* pNewInsn = bbs[i]->getParent().newInstruction(insn);
+        bbs[i]->append(*pNewInsn);
+        if (bbs[i]->isStructureExit && bbs[i]->isLoopExit)
+          bbs[i]->hasExtraBra = true;
+      }
+    }
+
+    /* Now reorder the basic blocks to reduce the unconditional jumps we inserted whose
+     * targets are the 'else' nodes. The algorithm is simple: within the index range spanned
+     * by a structure, the BBs that do not belong to it (they may belong to another structure)
+     * and sit before its entry BB are placed first, then the structure's own BBs, and finally
+     * the remaining non-member BBs. The order of the structure's BBs comes from getStructureSequence.
+     */
+    while(rit != nodes.rend())
+    {
+      if(((*rit)->type() == IfThen || (*rit)->type() == IfElse || (*rit)->type() == Block ||(*rit)->type() == SelfLoop) &&
+          (*rit)->canBeHandled && (*rit)->mark == true)
+      {
+        markStructuredNodes(*rit, false);
+        std::set<int> ns = getStructureBasicBlocksIndex(*rit, bbs);
+        ir::BasicBlock *entry = (*rit)->getEntry();
+
+        int entryIndex = *(ns.begin());
+        for(size_t i=0; i<bbs.size(); ++i)
+        {
+          if(bbs[i] == entry)
+            entryIndex = i;
+        }
+
+        std::set<int>::iterator iter = ns.begin();
+        int index = *iter;
+
+        std::vector<ir::BasicBlock *> unstruSeqHead;
+        std::vector<ir::BasicBlock *> unstruSeqTail;
+
+        iter = ns.begin();
+        while(iter != ns.end())
+        {
+          if(index != *iter)
+          {
+            if(index < entryIndex)
+              unstruSeqHead.push_back(bbs[index]);
+            else
+              unstruSeqTail.push_back(bbs[index]);
+            index++;
+          }
+          else
+          {
+            index++;
+            iter++;
+          }
+        }
+
+        std::vector<ir::BasicBlock *> struSeq;
+        getStructureSequence(*rit, struSeq);
+
+        int firstindex = *(ns.begin());
+        for(size_t i = 0; i < unstruSeqHead.size(); ++i)
+          bbs[firstindex++] = unstruSeqHead[i];
+        for(size_t i = 0; i < struSeq.size(); ++i)
+          bbs[firstindex++] = struSeq[i];
+        for(size_t i = 0; i < unstruSeqTail.size(); ++i)
+          bbs[firstindex++] = unstruSeqTail[i];
+      }
+      rit++;
+    }
+
+   /* now, erase the BRAs inserted before whose targets are their fallthrough blocks */
+    for(size_t i=0; i<bbs.size(); ++i)
+    {
+      if(bbs[i]->getLastInstruction()->getOpcode() == ir::OP_BRA &&
+         !((ir::BranchInstruction*)(bbs[i]->getLastInstruction()))->isPredicated())
+      {
+        if(((ir::BranchInstruction *)bbs[i]->getLastInstruction())->getLabelIndex() == bbs[i+1]->getLabelIndex())
+        {
+          ir::BasicBlock::iterator it= bbs[i]->end();
+          it--;
+
+          bbs[i]->erase(it);
+
+          if (bbs[i]->hasExtraBra)
+            bbs[i]->hasExtraBra = false;
+        }
+      }
+    }
+    for(size_t i=0; i<bbs.size(); ++i)
+      blocks[i] = bbs[i];
+
+    fn->sortLabels();
+    fn->computeCFG();
+
+    it = begin;
+    while(it != end)
+    {
+      if((*it)->canBeHandled)
+      {
+        switch((*it)->type())
+        {
+          case IfThen:
+            {
+              NodeList::iterator child_iter = (*it)->children.end();
+              ir::LabelIndex endiflabel;
+              child_iter--;
+              handleThenNode(*child_iter, endiflabel); // this call would pass out the proper endiflabel for handleIfNode's use.
+              child_iter--;
+              handleIfNode(*child_iter, endiflabel, endiflabel);
+            }
+            break;
+
+          case IfElse:
+            {
+              NodeList::iterator child_iter = (*it)->children.end();
+              ir::LabelIndex endiflabel;
+              ir::LabelIndex elselabel;
+              NodeList::iterator else_node;
+              child_iter--;
+              else_node = child_iter;
+              handleElseNode(*child_iter, elselabel, endiflabel);
+              ir::LabelIndex elseBBLabel = (*child_iter)->getEntry()->getLabelIndex();
+              child_iter--;
+              handleThenNode2(*child_iter, *else_node, elseBBLabel);
+              child_iter--;
+              handleIfNode(*child_iter, endiflabel, elselabel);
+            }
+            break;
+
+          case SelfLoop:
+            {
+              ir::LabelIndex whilelabel;
+              handleSelfLoopNode(*it, whilelabel);
+            }
+            break;
+
+          default:
+            break;
+        }
+      }
+
+      it++;
+    }
+
+  }
+
+  void ControlTree::getStructureSequence(Node *node, std::vector<ir::BasicBlock*> &seq)
+  {
+    /* Append to seq, in order, every basic block found under node.
+     *
+     * Children are visited in the order they are stored in node->children
+     * (the original author notes: if-then is stored as if, then; if-else as
+     * if, then, else; a block structure in execution order), so a plain
+     * depth-first walk of the children yields the structure's sequence.
+     */
+    if(node->type() != BasicBlock)
+    {
+      /* region node: recurse into each child, front to back */
+      const NodeList::iterator last = node->children.end();
+      for(NodeList::iterator child = node->children.begin(); child != last; ++child)
+        getStructureSequence(*child, seq);
+      return;
+    }
+
+    /* leaf: contributes its single basic block */
+    seq.push_back(((BasicBlockNode *)node)->getBasicBlock());
+  }
+
+
+  std::set<int> ControlTree::getStructureBasicBlocksIndex(Node* node, std::vector<ir::BasicBlock *> &bbs)
+  {
+    /* Return the positions in bbs of all basic blocks contained in node.
+     * A basic block that does not appear in bbs contributes nothing. */
+    std::set<int> indices;
+    if(node->type() != BasicBlock)
+    {
+      /* region node: union of the index sets of every child */
+      for(NodeList::iterator child = node->children.begin(); child != node->children.end(); ++child)
+      {
+        const std::set<int> sub = getStructureBasicBlocksIndex(*child, bbs);
+        indices.insert(sub.begin(), sub.end());
+      }
+      return indices;
+    }
+
+    /* leaf: record the first slot of bbs holding this node's block */
+    ir::BasicBlock *wanted = ((BasicBlockNode *)node)->getBasicBlock();
+    size_t pos = 0;
+    while(pos < bbs.size() && bbs[pos] != wanted)
+      ++pos;
+    if(pos < bbs.size())
+      indices.insert(pos);
+    return indices;
+  }
+
+
+  std::set<ir::BasicBlock *> ControlTree::getStructureBasicBlocks(Node *node)
+  {
+    /* Collect every basic block reachable through node's children lists;
+     * a BasicBlock node yields just the block it wraps. */
+    std::set<ir::BasicBlock *> blocks;
+    if(node->type() != BasicBlock)
+    {
+      for(NodeList::iterator child = node->children.begin(); child != node->children.end(); ++child)
+      {
+        const std::set<ir::BasicBlock *> sub = getStructureBasicBlocks(*child);
+        blocks.insert(sub.begin(), sub.end());
+      }
+      return blocks;
+    }
+
+    blocks.insert(((BasicBlockNode *)node)->getBasicBlock());
+    return blocks;
+  }
+
+
+  Node* ControlTree::insertNode(Node *p_node)
+  {
+    nodes.push_back(p_node); /* record the node in the tree's node vector */
+    return p_node;           /* hand the same pointer back so callers can chain the call */
+  }
+
+
+  bool ControlTree::checkForBarrier(const ir::BasicBlock* bb)
+  {
+    /* A block "has a barrier" when any of its instructions is an OP_SYNC;
+     * scan front to back and stop at the first one found. */
+    ir::BasicBlock::const_iterator last = bb->end();
+    for(ir::BasicBlock::const_iterator insn = bb->begin(); insn != last; insn++)
+    {
+      if((*insn).getOpcode() == ir::OP_SYNC)
+        return true;
+    }
+
+    return false;
+  }
+
+
+  void ControlTree::getLiveIn(ir::BasicBlock& bb, std::set<ir::Register>& livein)
+  {
+    /* Add to livein each register that bb reads before any instruction of
+     * bb has written it. livein is only added to, never cleared here. */
+    std::set<ir::Register> defined; /* registers written so far in bb */
+
+    for(ir::BasicBlock::iterator cur = bb.begin(); cur != bb.end(); cur++)
+    {
+      ir::Instruction& insn = *cur;
+      const uint32_t srcNum = insn.getSrcNum();
+      const uint32_t dstNum = insn.getDstNum();
+
+      /* sources first: a read counts unless an earlier write defined the register */
+      for(uint32_t srcID = 0; srcID < srcNum; ++srcID)
+      {
+        const ir::Register src = insn.getSrc(srcID);
+        if(defined.count(src) == 0)
+          livein.insert(src);
+      }
+      /* destinations afterwards, so an insn reading its own dst still reports it */
+      for(uint32_t dstID = 0; dstID < dstNum; ++dstID)
+        defined.insert(insn.getDst(dstID));
+    }
+  }
+
+  void ControlTree::calculateNecessaryLiveout()
+  {
+    /* For every IfElse node: compute the live-in registers of the entry
+     * block of its else branch and add them to the liveout set of every
+     * basic block of its then branch. Other node types are left untouched. */
+    for(NodeVector::iterator iter = nodes.begin(); iter != nodes.end(); ++iter)
+    {
+      switch((*iter)->type())
+      {
+        case IfElse:
+        {
+          /* children are stored as (cond, then, else): then is the second
+           * entry, else is the last one */
+          NodeList::iterator thenIter = (*iter)->children.begin();
+          thenIter++;
+          std::set<ir::BasicBlock *> bbs = getStructureBasicBlocks(*thenIter);
+
+          Node *elseNode = *((*iter)->children.rbegin());
+          std::set<ir::Register> livein;
+          getLiveIn(*(elseNode->getEntry()), livein);
+
+          std::set<ir::BasicBlock *>::iterator bbiter;
+          for(bbiter = bbs.begin(); bbiter != bbs.end(); ++bbiter)
+            (*bbiter)->liveout.insert(livein.begin(), livein.end());
+        }
+        /* explicit break: do not rely on falling through into default */
+        break;
+
+        default:
+          break;
+      }
+    }
+  }
+
+
+  void ControlTree::initializeNodes()
+  {
+    /* Build the initial flat control tree: one BasicBlockNode per basic
+     * block of fn, visited from the top block through getNextBlock().
+     *
+     * Pass 1 creates the nodes, records them in nodes and bbmap, flags the
+     * ones whose block contains a barrier, and chains them through
+     * fallthrough() in block order (the last node's fallthrough is NULL).
+     * Pass 2 mirrors each block's predecessor and successor sets onto the
+     * corresponding nodes. nodes_entry becomes the node of the top block.
+     *
+     * The nodes are allocated with new and are not released here.
+     */
+    ir::BasicBlock& topBlock = fn->getTopBlock();
+    Node *prev = NULL;
+
+    /* pass 1: node creation and fallthrough chaining */
+    for(ir::BasicBlock *bb = &topBlock; bb != NULL; bb = bb->getNextBlock())
+    {
+      Node *cur = new BasicBlockNode(bb);
+      cur->label = bb->getLabelIndex();
+
+      if(checkForBarrier(bb))
+        cur->hasBarrier() = true;
+
+      /* every node but the first is the fallthrough of the node created
+       * just before it */
+      if(prev != NULL)
+        prev->fallthrough() = cur;
+
+      nodes.push_back(cur);
+      bbmap[bb] = cur;
+      prev = cur;
+    }
+
+    /* the node created last has nothing to fall through to */
+    if(prev != NULL)
+      prev->fallthrough() = NULL;
+
+    /* the top block's node is the entry handed to the analysis */
+    this->nodes_entry = bbmap[&topBlock];
+
+    /* pass 2: copy the CFG edges of each block onto its node */
+    for(ir::BasicBlock *bb = &topBlock; bb != NULL; bb = bb->getNextBlock())
+    {
+      Node *cur = bbmap[bb];
+      ir::BlockSet::const_iterator edge;
+
+      /* incoming edges */
+      const ir::BlockSet &predBlocks = bb->getPredecessorSet();
+      for(edge = predBlocks.begin(); edge != predBlocks.end(); edge++)
+      {
+        cur->preds().insert(bbmap[*edge]);
+      }
+
+      /* outgoing edges */
+      const ir::BlockSet &succBlocks = bb->getSuccessorSet();
+      for(edge = succBlocks.begin(); edge != succBlocks.end(); edge++)
+      {
+        cur->succs().insert(bbmap[*edge]);
+      }
+    }
+  }
+
+
+  void ControlTree::DFSPostOrder(Node *start)
+  {
+    /* Recursive depth-first walk over succs(); a node is appended to
+     * post_order only after all of its not-yet-visited successors have
+     * been, which is what makes the resulting list a postorder. */
+    visited.insert(start);
+    NodeSet &next = start->succs();
+    for(NodeSet::iterator succ = next.begin(); succ != next.end(); ++succ)
+    {
+      if(visited.find(*succ) == visited.end())
+        DFSPostOrder(*succ);
+    }
+    post_order.push_back(start);
+  }
+
+
+  bool ControlTree::isCyclic(Node* node)
+  {
+    /* the three loop-shaped region types count as cyclic; all others do not */
+    switch(node->type())
+    {
+      case NaturalLoop: case WhileLoop: case SelfLoop: return true;
+      default: return false;
+    }
+  }
+
+
+  bool ControlTree::isBackedge(const Node* head, const Node* tail)
+  {
+    /* The edge head->tail is reported as a back edge when head is met
+     * before tail while scanning post_order from its beginning. */
+    const Node* match[] = {head, tail};
+    NodeList::iterator n = find_first_of(post_order.begin(), post_order.end(), match, match + 2);
+
+    /* neither node is listed: answer "no" rather than dereference end() */
+    if(n == post_order.end())
+      return false;
+    return *n == head;
+  }
+
+
+  bool ControlTree::pathBack(Node* m, Node* n)
+  {
+    /* True when some predecessor k of n has a back edge k->n and path(m, k, n)
+     * holds; visited is cleared first because path() uses it as scratch. */
+    for(NodeSet::const_iterator k = n->preds().begin(); k != n->preds().end(); k++)
+    {
+      if(!isBackedge(*k, n))
+        continue;
+      visited.clear();
+      if(path(m, *k, n))
+        return true;
+    }
+    return false;
+  }
+
+  /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+  Node* ControlTree::acyclicRegionType(Node* node, NodeSet& nset)
+  {
+    nset.clear(); /* nset is an output: it receives the members of the region found, if any */
+    Node *n;
+    bool p, s, barrier; /* p / s: walk may continue on the "one predecessor" / "one successor" side */
+    NodeList nodes; /* candidate Block members in order; this local hides the member 'nodes' */
+    /* forward walk from node along single-successor edges, stopping at a barrier */
+    n = node;
+    p = true;
+    s = (n->succs().size()==1);
+    barrier = n->hasBarrier();
+    while(p && s && !barrier)
+    {
+      if(nset.insert(n).second)
+        nodes.push_back(n);
+      n = *(n->succs().begin());
+      barrier = n->hasBarrier();
+      p = (n->preds().size() == 1);
+      s = (n->succs().size() == 1);
+    }
+
+    if(p && !barrier) /* the node the walk stopped on still ends the chain */
+    {
+      if(nset.insert(n).second)
+        nodes.push_back(n);
+    }
+    /* backward walk from node along single-predecessor edges, prepending to the chain */
+    n = node;
+    p = (n->preds().size() == 1);
+    s = true;
+    barrier = n->hasBarrier();
+
+    while(p && s && !barrier)
+    {
+      if(nset.insert(n).second)
+        nodes.push_front(n);
+      n = *(n->preds().begin());
+      barrier = n->hasBarrier();
+      p = (n->preds().size() == 1);
+      s = (n->succs().size() == 1);
+    }
+
+    if(s && !barrier) /* the node the walk stopped on still starts the chain */
+    {
+      if(nset.insert(n).second)
+        nodes.push_front(n);
+    }
+
+    node = n; /* from here on, node is the node the backward walk ended on */
+
+    if(nodes.size() >=2 ) /* a chain of two or more nodes forms a Block region */
+    {
+      Node* p = new BlockNode(nodes); /* this Node* p hides the bool p declared above */
+      NodeList::iterator iter = nodes.begin();
+      while(iter != nodes.end()) /* one unhandled member makes the whole Block unhandled */
+      {
+        if((*iter)->canBeHandled == false)
+        {
+          p->canBeHandled = false;
+          break;
+        }
+        iter++;
+      }
+
+      return insertNode(p);
+    }
+
+    else if(node->succs().size() == 2)
+    {
+      Node *m;
+      m = *(node->succs().begin()); /* m and n: the two successors, in the set's iteration order */
+      n = *(++(node->succs().begin()));
+
+      /* check for if node then n */
+      if( n->succs().size() == 1 &&
+         n->preds().size() == 1 &&
+         *(n->succs().begin()) == m &&
+         !n->hasBarrier() && !node->hasBarrier())
+      {
+        nset.clear();
+        nset.insert(node);
+        nset.insert(n);
+
+        Node* p = new IfThenNode(node, n);
+        if(node->fallthrough() == m) /* fallthrough is the join node: see inversePredicate in the header */
+          node->inversePredicate = false;
+
+        if(node->canBeHandled == false || n->canBeHandled == false)
+          p->canBeHandled = false;
+
+        return insertNode(p);
+      }
+
+      /* check for if node then m */
+      if(m->succs().size() == 1 &&
+         m->preds().size() == 1 &&
+         *(m->succs().begin()) == n &&
+         !m->hasBarrier() && !node->hasBarrier())
+      {
+        nset.clear();
+        nset.insert(node);
+        nset.insert(m);
+
+        Node* p = new IfThenNode(node, m);
+        if(node->fallthrough() == n) /* same rule as above with m and n swapped */
+          node->inversePredicate = false;
+
+        if(node->canBeHandled == false || m->canBeHandled == false)
+          p->canBeHandled = false;
+
+        return insertNode(p);
+      }
+
+      /* check for if node then n else m */
+      if(m->succs().size() == 1 && n->succs().size() == 1 &&
+         m->preds().size() == 1 && n->preds().size() == 1 &&
+         *(m->succs().begin()) == *(n->succs().begin()) &&
+         node->fallthrough() == n && !m->hasBarrier() && !n->hasBarrier() && !node->hasBarrier())
+      {
+        nset.clear();
+        nset.insert(node);
+        nset.insert(n);
+        nset.insert(m);
+
+        Node* p = new IfElseNode(node, n, m); /* the fallthrough successor becomes the 'then' child */
+
+        if(node->canBeHandled == false ||
+           m->canBeHandled == false ||
+           n->canBeHandled == false)
+          p->canBeHandled = false;
+
+        return insertNode(p);
+      }
+
+      /* check for if node then m else n */
+      if(m->succs().size() == 1 && n->succs().size() == 1 &&
+         m->preds().size() == 1 && n->preds().size() == 1 &&
+         *(m->succs().begin()) == *(n->succs().begin()) &&
+         node->fallthrough() == m && !m->hasBarrier() && !n->hasBarrier() &&!node->hasBarrier())
+      {
+        nset.clear();
+        nset.insert(node);
+        nset.insert(m);
+        nset.insert(n);
+
+        Node* p = new IfElseNode(node, m, n);
+
+        if(node->canBeHandled == false ||
+           m->canBeHandled == false ||
+           n->canBeHandled == false)
+          p->canBeHandled = false;
+        return insertNode(p);
+      }
+    }
+
+    return NULL; /* no Block, IfThen or IfElse shape matched */
+  }
+
+
+  bool ControlTree::path(Node *from, Node *to, Node *notthrough)
+  {
+    /* Depth-first search along succs() for a path from -> to that avoids
+     * notthrough. Nodes already in visited are not re-entered; the callers
+     * in this file clear visited before starting a fresh query. */
+    if(visited.find(from) != visited.end() || from == notthrough)
+      return false;
+
+    if(from == to)
+      return true;
+
+    visited.insert(from);
+    NodeSet &next = from->succs();
+    for(NodeSet::const_iterator s = next.begin(); s != next.end(); s++)
+      if(path(*s, to, notthrough))
+        return true;
+
+    return false;
+  }
+
+
+  /* Classify the cyclic region headed by node, where nset lists the nodes
+   * making up the candidate region. Per the original author this works but
+   * is quite inefficient, and the call to it in structuralAnalysis is
+   * currently compiled out, so cyclic regions are not identified for now. */
+  Node * ControlTree::cyclicRegionType(Node *node, NodeList &nset)
+  {
+    /* single node: a self-loop iff node is its own successor */
+    if(nset.size() == 1)
+    {
+      if(node->succs().find(node) == node->succs().end())
+        return NULL;
+
+      Node* loop = new SelfLoopNode(node);
+      loop->canBeHandled = true;
+      node->getExit()->isLoopExit = true;
+      return insertNode(loop);
+    }
+
+    /* improper region: some member cannot be reached from node */
+    for(NodeList::const_iterator member = nset.begin(); member != nset.end(); member++)
+    {
+      visited.clear();
+      if(!path(node, *member))
+        return NULL;
+    }
+
+    /* while loop: node has two successors and two predecessors, and some
+     * other member has exactly one of each (the first such member in nset
+     * is used). Such loops are built but flagged as not handled. */
+    const bool headerFits = (node->succs().size() == 2 && node->preds().size() == 2);
+
+    for(NodeList::iterator body = nset.begin(); headerFits && body != nset.end(); ++body)
+    {
+      if(*body == node)
+        continue;
+      if((*body)->succs().size() != 1 || (*body)->preds().size() != 1)
+        continue;
+
+      Node* loop = new WhileLoopNode(node, *body);
+      loop->canBeHandled = false;
+      return insertNode(loop);
+    }
+
+    /* no cyclic shape recognised */
+    return NULL;
+  }
+
+
+  /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+  void ControlTree::reduce(Node* node,  NodeSet nodeSet)
+  { /* splice the region node 'node' into the flowgraph in place of the members of nodeSet (a by-value copy) */
+    NodeSet::iterator n;
+    for(n = nodeSet.begin(); n != nodeSet.end(); n++)
+    {
+      NodeSet::iterator p;
+      for(p = (*n)->preds().begin(); p != (*n)->preds().end(); p++)
+      {
+        if(nodeSet.find(*p) != nodeSet.end())
+          continue;
+        /* *p lies outside the region: redirect its edge from *n to node */
+        (*p)->succs().erase(*n);
+
+        (*p)->succs().insert(node);
+        node->preds().insert(*p);
+
+        if((*p)->fallthrough() == *n)
+          (*p)->fallthrough() = node;
+      }
+
+     /* same treatment for the edges leaving the region */
+     NodeSet::iterator s;
+     for(s = (*n)->succs().begin(); s != (*n)->succs().end(); s++)
+     {
+        if(nodeSet.find(*s) != nodeSet.end())
+          continue;
+       /* *s lies outside the region: its predecessor becomes node */
+       (*s)->preds().erase(*n);
+
+       (*s)->preds().insert(node);
+       node->succs().insert(*s);
+
+       if((*n)->fallthrough() == *s) /* the region inherits a member's fallthrough when it leaves the region */
+         node->fallthrough() = *s;
+     }
+    }
+    /* a non-loop region whose members are linked by a back edge gets a self edge on node */
+    if(!isCyclic(node))
+    {
+      for(n = nodeSet.begin(); n != nodeSet.end(); n++)
+      {
+        bool shouldbreak = false;
+        NodeSet::iterator p;
+        for(p = (*n)->preds().begin(); p != (*n)->preds().end(); p++)
+        {
+          if(nodeSet.find(*p) == nodeSet.end()) /* only edges internal to the region matter here */
+            continue;
+
+          if(isBackedge(*p, *n))
+          {
+            node->preds().insert(node);
+            node->succs().insert(node);
+
+            shouldbreak = true; /* one back edge is enough: leave both loops */
+            break;
+          }
+        }
+
+        if(shouldbreak)
+          break;
+      }
+    }
+    /* finally replace the members by node in post_order (this also updates post_ctr) */
+    compact(node, nodeSet);
+  }
+
+
+  /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+  void ControlTree::compact(Node* node,  NodeSet nodeSet)
+  {
+    /* Remove every member of nodeSet (a by-value copy) from post_order and
+     * insert node where the last removed member stood; post_ctr is left on
+     * node. pos starts at end() so that node is appended, instead of being
+     * inserted through an uninitialized iterator, when no member is listed. */
+    NodeList::iterator pos = post_order.end();
+    NodeList::iterator n = post_order.begin();
+    while(n != post_order.end() && !nodeSet.empty())
+    {
+      if(nodeSet.erase(*n))
+        pos = n = post_order.erase(n);
+      else
+        ++n;
+    }
+    post_ctr = post_order.insert(pos, node);
+  }
+
+
+  /* this algorithm is from Muchnick's textbook(sec 7.7) (Advanced Compiler Design and Implementation) */
+  void ControlTree::structuralAnalysis(Node *entry)
+  { /* repeatedly fold acyclic regions of the graph reachable from entry into region nodes */
+    Node* n;
+    NodeSet nset;
+    NodeList reachUnder; /* only used by the disabled cyclic-region code below */
+    bool changed;
+    do
+    {
+      changed = false;
+      post_order.clear();
+      visited.clear();
+      /* recompute the postorder of the current (possibly partly reduced) graph */
+      DFSPostOrder(entry);
+      post_ctr = post_order.begin();
+
+      while(post_order.size() > 1 && post_ctr != post_order.end())
+      {
+        n = *post_ctr;
+        Node* region = acyclicRegionType(n, nset);
+
+        if( NULL != region)
+        {
+          changed = true;
+          /* reduce() ends in compact(), which repositions post_ctr on region */
+          reduce(region, nset);
+
+          if(nset.find(entry) != nset.end()) /* the entry was absorbed: the region is the new entry */
+            entry = region;
+        }
+        // FIXME loop optimization is still buggy and under development, now disable it by default.
+        else
+        {
+#if 0
+          reachUnder.clear();
+          nset.clear();
+          for(NodeList::const_iterator m = post_order.begin(); m != post_order.end(); m++)
+          {
+            if(*m != n && pathBack(*m, n))
+            {
+              reachUnder.push_front(*m);
+              nset.insert(*m);
+            }
+          }
+
+          reachUnder.push_front(n);
+          nset.insert(n);
+          region = cyclicRegionType(n, reachUnder);
+
+          if(NULL != region)
+          {
+            reduce(region, nset);
+            changed = true;
+
+            if(nset.find(entry) != nset.end())
+              entry = region;
+          }
+          else
+          {
+            post_ctr++;
+          }
+#else
+          post_ctr++;
+#endif
+        }
+      }
+      /* a full pass without any reduction: nothing more can be recognised */
+      if(!changed)
+        break;
+
+    } while(post_order.size()>1);
+
+  }
+
+  void ControlTree::analyze()
+  {
+    initializeNodes();                /* one BasicBlockNode per basic block of fn, plus edges */
+    structuralAnalysis(nodes_entry);  /* fold the node graph into region nodes */
+    handleStructuredNodes();          /* process the recognised structures (defined earlier in this file) */
+    calculateNecessaryLiveout();      /* extend liveout of the then-blocks of IfElse regions */
+  }
+}
diff --git a/backend/src/ir/structural_analysis.hpp b/backend/src/ir/structural_analysis.hpp
new file mode 100644
index 0000000..dc2f3c2
--- /dev/null
+++ b/backend/src/ir/structural_analysis.hpp
@@ -0,0 +1,346 @@
+/*
+ * structural_analysis.hpp
+ * This code is derived from the ControlTree.h and ControlTree.cpp of
+ * project gpuocelot by Yongjia Zhang.
+ * The original copyright of gpuocelot appears below in its entirety.
+ */
+
+/*
+ * Copyright 2011
+ * GEORGIA TECH RESEARCH CORPORATION
+ * ALL RIGHTS RESERVED
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *     * Redistributions of source code must retain the above copyright
+ * notice,   this list of conditions and the following disclaimers.
+ *     * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimers in the
+ *       documentation and/or other materials provided with the
+ * distribution.
+ *     * Neither the name of GEORGIA TECH RESEARCH CORPORATION nor the
+ * names of  its contributors may be used to endorse or promote
+ * products derived  from this software without specific prior
+ * written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY GEORGIA TECH RESEARCH CORPORATION ''AS IS''
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GEORGIA TECH RESEARCH
+ * CORPORATION BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You agree that the Software will not be shipped, transferred, exported,
+ * or re-exported directly into any country prohibited by the United States
+ * Export Administration Act and the regulations thereunder nor will be
+ * used for any purpose prohibited by the Act.
+ */
+
+
+#ifndef __STRUCTURAL_ANALYSIS_HPP__
+#define __STRUCTURAL_ANALYSIS_HPP__
+
+#include "ir/unit.hpp"
+#include "ir/function.hpp"
+#include "ir/instruction.hpp"
+
+#include <iostream>
+#include <unordered_set>
+#include <unordered_map>
+#include <vector>
+#include <map>
+#include <list>
+#include <algorithm>
+#include <set>
+#define TRANSFORM_UNSTRUCTURE
+
+namespace analysis
+{
+  using namespace std;
+  using namespace gbe;
+
+  enum RegionType
+  {
+    BasicBlock = 0, /* leaf wrapping one ir::BasicBlock (BasicBlockNode) */
+    Block,          /* a sequence of nodes (BlockNode) */
+    IfThen,         /* If-Then structure (IfThenNode) */
+    IfElse,         /* If-Else structure (IfElseNode) */
+    SelfLoop,       /* self loop structure (SelfLoopNode) */
+    WhileLoop,      /* while loop structure (WhileLoopNode) */
+    NaturalLoop     /* natural loop structure (NaturalLoopNode) */
+  } ;
+
+  /* control tree virtual node */
+  class Node;
+
+  typedef unordered_set<Node *> NodeSet;
+  typedef list<Node *> NodeList;
+  typedef std::vector<Node *> NodeVector;
+
+  /* control tree virtual node: a region of the control tree plus its edges in the abstract flowgraph */
+  class Node
+  {
+  public:
+    Node(RegionType rtype, const NodeList& children)
+      : rtype(rtype), children(children), fall_through(NULL), has_barrier(false),
+        mark(false), canBeHandled(true), label(-1), inversePredicate(true)
+    { /* fall_through and label get defined values: fallthrough() is read on region nodes */
+    }
+    virtual ~Node() {}
+    NodeSet& preds() { return pred; }              /* mutable reference to the predecessor set */
+    NodeSet& succs() { return succ; }              /* mutable reference to the successor set */
+    Node*& fallthrough() { return fall_through; }  /* mutable reference; NULL until assigned */
+    bool& hasBarrier() { return has_barrier; }
+    RegionType type() { return rtype; }
+    virtual ir::BasicBlock* getEntry() /* entry of the first child; children must not be empty */
+    {
+      return (*(children.begin()))->getEntry();
+    }
+    virtual ir::BasicBlock* getExit() /* exit of the last child; children must not be empty */
+    {
+      return (*(children.rbegin()))->getExit();
+    }
+
+  public:
+    RegionType rtype;
+    NodeSet pred;
+    NodeSet succ;
+    NodeList children;
+    Node* fall_through;
+    bool has_barrier;
+    bool mark;
+    bool canBeHandled;
+    //label is for debug; -1 until a caller assigns one
+    int label;
+    /* inversePredicate should be false under two circumstance,
+     * fallthrough is the same with succs:
+     * (1) n->succs == m && node->fallthrough == m
+     * node
+     * | \
+     * |  \
+     * m<--n
+     * (2) m->succs == n && node->fallthrough == n
+     * node
+     * | \
+     * |  \
+     * m-->n
+     * */
+    bool inversePredicate;
+  };
+
+  /* leaf of the control tree: wraps exactly one ir::BasicBlock and has no children */
+  class BasicBlockNode : public Node
+  {
+  public:
+    BasicBlockNode(ir::BasicBlock *p_bb) : Node(BasicBlock, NodeList()), p_bb(p_bb) {}
+    virtual ~BasicBlockNode() {}
+    ir::BasicBlock* getBasicBlock() { return p_bb; }
+    virtual ir::BasicBlock* getEntry() { return p_bb; }   /* a leaf is its own entry ... */
+    virtual ir::BasicBlock* getExit() { return p_bb; }    /* ... its own exit ... */
+    virtual ir::BasicBlock* getFirstBB() { return p_bb; } /* ... and its own first block */
+  private:
+    ir::BasicBlock *p_bb; /* the wrapped block; assigned once by the constructor */
+  };
+
+  /* a sequence of nodes: region type Block, children kept in the order given */
+  class BlockNode : public Node
+  {
+  public:
+    BlockNode(NodeList& children) : Node(Block, children) {} /* the list is copied by Node's constructor */
+    virtual ~BlockNode(){}
+  };
+
+  /* If-Then structure node: children are stored as (cond, ifTrue) */
+  class IfThenNode : public Node
+  {
+  public:
+    IfThenNode(Node* cond, Node* ifTrue) : Node(IfThen, BuildChildren(cond, ifTrue)) {}
+    virtual ~IfThenNode() {}
+  private:
+    /* static because it runs before the Node base of this object is initialized */
+    static NodeList BuildChildren(Node* cond, Node* ifTrue)
+    {
+      NodeList children;
+      children.push_back(cond);
+      children.push_back(ifTrue);
+      return children;
+    }
+  };
+
+  /* If-Else structure node: children are stored as (cond, ifTrue, ifFalse) */
+  class IfElseNode : public Node
+  {
+  public:
+    IfElseNode(Node* cond, Node* ifTrue, Node* ifFalse) : Node(IfElse, BuildChildren(cond, ifTrue, ifFalse)) {}
+    virtual ~IfElseNode() {}
+  private:
+    /* static because it runs before the Node base of this object is initialized */
+    static NodeList BuildChildren(Node* cond, Node* ifTrue, Node* ifFalse)
+    {
+      NodeList children;
+      children.push_back(cond);
+      children.push_back(ifTrue);
+      children.push_back(ifFalse);
+      return children;
+    }
+  };
+
+  /* Self loop structure node: a single child that is its own successor */
+  class SelfLoopNode : public Node
+  {
+  public:
+    SelfLoopNode(Node* node) : Node(SelfLoop, BuildChildren(node)) {}
+    virtual ~SelfLoopNode() {}
+    virtual ir::BasicBlock* getEntry() /* entry of the only child */
+    {
+      return (*(children.begin()))->getEntry();
+    }
+    virtual ir::BasicBlock* getExit() /* exit of the only child */
+    {
+      return (*(children.begin()))->getExit();
+    }
+  private:
+    /* static because it runs before the Node base of this object is initialized */
+    static NodeList BuildChildren(Node *node)
+    {
+      NodeList children;
+      children.push_back(node);
+      return children;
+    }
+  };
+
+  /* While loop structure node: children are stored as (cond, execute) */
+  class WhileLoopNode : public Node
+  {
+  public:
+    WhileLoopNode(Node* cond, Node* execute) : Node(WhileLoop, BuildChildren(cond, execute)) {}
+    virtual ~WhileLoopNode() {}
+    virtual ir::BasicBlock* getEntry() /* entry of the first child (cond) */
+    {
+      return (*(children.begin()))->getEntry();
+    }
+    virtual ir::BasicBlock* getExit() /* exit of the first child (cond), not of the body */
+    {
+      return (*(children.begin()))->getExit();
+    }
+
+  private:
+    /* static because it runs before the Node base of this object is initialized */
+    static NodeList BuildChildren(Node* cond, Node* execute)
+    {
+      NodeList children;
+      children.push_back(cond);
+      children.push_back(execute);
+      return children;
+    }
+  };
+
+  /* Natural loop structure node; entry/exit lookup is not implemented yet */
+  class NaturalLoopNode : public Node
+  {
+  public:
+    NaturalLoopNode(const NodeList& children): Node(NaturalLoop, children){}
+    virtual ~NaturalLoopNode() {}
+    virtual ir::BasicBlock* getEntry() /* placeholder: always returns NULL, callers must not dereference */
+    {
+      //TODO implement it
+      return NULL;
+    }
+    virtual ir::BasicBlock* getExit() /* placeholder: always returns NULL, callers must not dereference */
+    {
+      //TODO implement it
+      return NULL;
+    }
+  };
+
+  /* computes the control tree, and does the structure identification during the computation */
+  class ControlTree
+  {
+  public:
+    void analyze();
+
+    ControlTree(ir::Function* fn) : fn(fn), nodes_entry(NULL) {} /* nodes_entry is set by initializeNodes() */
+    ~ControlTree();
+
+  private:
+    /* create a sequence of BasicBlockNodes, each referring to one basic block of the function */
+    void initializeNodes();
+    /* insert a new Node, and returns the pointer of the node */
+    Node* insertNode(Node *);
+    /* do the structural analysis */
+    void structuralAnalysis(Node * entry);
+    /* do the dfs post order traverse of the current CFG */
+    void DFSPostOrder(Node *start);
+    /* returns true if there is a (possibly empty) path from m to k that does not pass through n */
+    bool path(Node *m, Node *k, Node *n = NULL);
+    /* link region node into abstract flowgraph, adjust the predecessor and successor functions, and augment the control tree */
+    void reduce(Node* node,  NodeSet nodeSet);
+    /* adds node to the control tree, inserts node into _post
+     * at the highest-numbered position of a node in nodeSet, removes
+     * the nodes in nodeSet from _post, compacts the remaining nodes at
+     * the beginning of _post, and sets _postCtr to the index of node
+     * in the resulting postorder */
+    void compact(Node* node,  NodeSet nodeSet);
+    Node* getNodesEntry() const  { return nodes_entry;}
+    /* determines whether node is the entry node of an acyclic
+     * control structure and returns its region. Stores in nset the set
+     * of nodes in the identified control structure */
+    Node* acyclicRegionType(Node* node, NodeSet& nset);
+    /* determines whether node is the entry node of a cyclic
+     * control structure and returns its region. Stores in nset the set
+     * of nodes in the identified control structure */
+    Node* cyclicRegionType(Node*, NodeList&);
+    /* is this a cyclic region? */
+    bool isCyclic(Node*);
+    /* is this a back edge? */
+    bool isBackedge(const Node*, const Node*);
+    /* returns true if there is a node k such that there is a
+     * (possibly empty) path from m to k that does not pass through n
+     * and an edge k->n that is a back edge, and false otherwise. */
+    bool pathBack(Node*, Node*);
+    /* check if there is a barrier in a basic block */
+    bool checkForBarrier(const ir::BasicBlock*);
+    /* insert while instruction at the proper position of Node */
+    void handleSelfLoopNode(Node *, ir::LabelIndex&);
+    /* mark all the BasicBlockNodes of the control tree node n as status */
+    void markStructuredNodes(Node *n, bool status);
+    /* mark all the ir::BasicBlocks' needEndIf of n as status */
+    void markNeedEndif(Node * n, bool status);
+    /* mark all the ir::BasicBlocks' needIf of n as status */
+    void markNeedIf(Node *, bool);
+    /* insert IF instruction at the proper position of Node */
+    void handleIfNode(Node *, ir::LabelIndex&, ir::LabelIndex&);
+    /* insert ENDIF instruction at the proper position of Node, this Node is
+       the 'then' node of identified if-then structure */
+    void handleThenNode(Node *, ir::LabelIndex&);
+    /* handle the then node of identified if-else structure */
+    void handleThenNode2(Node *, Node *, ir::LabelIndex);
+    /* insert ELSE instruction at the proper position of Node */
+    void handleElseNode(Node *, ir::LabelIndex&, ir::LabelIndex&);
+    /* this calls other functions to finish the handling of identified structure blocks */
+    void handleStructuredNodes();
+    /* positions, within the given vector, of the basic blocks contained in the node */
+    std::set<int> getStructureBasicBlocksIndex(Node *, std::vector<ir::BasicBlock *> &);
+    /* the basic blocks contained in the node */
+    std::set<ir::BasicBlock *> getStructureBasicBlocks(Node*);
+    /* get livein of bb */
+    void getLiveIn(ir::BasicBlock& bb, std::set<ir::Register>& livein);
+    /* see comment of BasicBlock::liveout in function.hpp for detail. */
+    void calculateNecessaryLiveout();
+    /* get the execution sequence of structure n */
+    void getStructureSequence(Node* n, std::vector<ir::BasicBlock*> &v);
+  private:
+    ir::Function *fn;
+    NodeVector nodes;
+    Node* nodes_entry;
+    unordered_map<ir::BasicBlock *, Node *> bbmap;
+    NodeList post_order;
+    NodeSet visited;
+    NodeList::iterator post_ctr;
+  };
+}
+#endif
diff --git a/backend/src/ir/type.cpp b/backend/src/ir/type.cpp
index 56f5c12..450ba61 100644
--- a/backend/src/ir/type.cpp
+++ b/backend/src/ir/type.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/type.hpp b/backend/src/ir/type.hpp
index 8bfbdc8..d528859 100644
--- a/backend/src/ir/type.hpp
+++ b/backend/src/ir/type.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/unit.cpp b/backend/src/ir/unit.cpp
index 4f9d740..84208e5 100644
--- a/backend/src/ir/unit.cpp
+++ b/backend/src/ir/unit.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/unit.hpp b/backend/src/ir/unit.hpp
index adebd3f..b5b0fa9 100644
--- a/backend/src/ir/unit.hpp
+++ b/backend/src/ir/unit.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/value.cpp b/backend/src/ir/value.cpp
index a055bdf..840fb5c 100644
--- a/backend/src/ir/value.cpp
+++ b/backend/src/ir/value.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/ir/value.hpp b/backend/src/ir/value.hpp
index 47b9048..a9e5108 100644
--- a/backend/src/ir/value.hpp
+++ b/backend/src/ir/value.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/libocl/CMakeLists.txt b/backend/src/libocl/CMakeLists.txt
new file mode 100644
index 0000000..314d373
--- /dev/null
+++ b/backend/src/libocl/CMakeLists.txt
@@ -0,0 +1,218 @@
+PROJECT(LIBOCL)
+SET (OCL_OBJECT_DIR ${LIBOCL_BINARY_DIR}/${BEIGNET_INSTALL_DIR})
+SET (OCL_HEADER_FILES ${OCL_OBJECT_DIR}/include/ocl_defines.h)
+SET (OCL_SOURCE_FILES "")
+
+ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/include/ocl_defines.h
+    COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
+    # Start the generated header from the template (this truncates any previous output) ...
+    COMMAND cat ${LIBOCL_SOURCE_DIR}/tmpl/ocl_defines.tmpl.h > ${OCL_OBJECT_DIR}/include/ocl_defines.h
+    # ... then append ocl_common_defines.h from the parent source directory.
+    COMMAND cat ${LIBOCL_SOURCE_DIR}/../ocl_common_defines.h >> ${OCL_OBJECT_DIR}/include/ocl_defines.h
+    DEPENDS ${LIBOCL_SOURCE_DIR}/tmpl/ocl_defines.tmpl.h ${LIBOCL_SOURCE_DIR}/../ocl_common_defines.h
+    COMMENT "Generate the header: ${OCL_OBJECT_DIR}/include/ocl_defines.h"
+    )
+
+# The other modules are just copied.
+MACRO(COPY_THE_HEADER _mod)
+    # Copy the header unchanged from the source include/ directory to the object include/ directory.
+    STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/include/\\1.h" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/include/\\1.h" orgin_name ${_mod})
+    SET(OCL_HEADER_FILES ${OCL_HEADER_FILES} ${output_name})
+    IF(orgin_name STREQUAL output_name)
+    ELSE(orgin_name STREQUAL output_name)
+	ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	    COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
+	    #COMMAND echo "cp ${orgin_name} ${output_name}"
+	    COMMAND cp ${orgin_name} ${output_name}
+	    DEPENDS ${orgin_name}
+	    COMMENT "Copy the header: ${output_name}"
+	    )
+    ENDIF(orgin_name STREQUAL output_name)
+ENDMACRO(COPY_THE_HEADER)
+MACRO(COPY_THE_SOURCE _mod)
+    # Copy the .cl source unchanged from the source src/ directory to the binary src/ directory.
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/src/\\1.cl" orgin_name ${_mod})
+    SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name})
+    IF(orgin_name STREQUAL output_name)
+    ELSE(orgin_name STREQUAL output_name)
+	ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	    COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
+	    #COMMAND echo "cp ${orgin_name} ${output_name}"
+	    COMMAND cp ${orgin_name} ${output_name}
+	    DEPENDS ${orgin_name}
+	    COMMENT "Copy the source: ${output_name}"
+	    )
+    ENDIF(orgin_name STREQUAL output_name)
+ENDMACRO(COPY_THE_SOURCE)
+
+SET (OCL_COPY_HEADERS ocl ocl_types ocl_float ocl_printf)
+FOREACH(M ${OCL_COPY_HEADERS})
+    COPY_THE_HEADER(${M})
+ENDFOREACH(M) 
+
+SET (OCL_COPY_MODULES ocl_workitem ocl_atom ocl_async ocl_sync ocl_misc ocl_vload ocl_geometric ocl_image)
+FOREACH(M ${OCL_COPY_MODULES})
+    COPY_THE_HEADER(${M})
+    COPY_THE_SOURCE(${M})
+ENDFOREACH(M) 
+
+
+MACRO(GENERATE_HEADER_PY _mod)
+    STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/include/\\1.h" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/tmpl/\\1.tmpl.h" tmpl_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/script/\\1.def" def_name ${_mod})
+    SET(OCL_HEADER_FILES ${OCL_HEADER_FILES} ${output_name})
+    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
+	#COMMAND echo "cat ${tmpl_name} \\> ${output_name}"
+	COMMAND cat ${tmpl_name} > ${output_name}
+	#COMMAND echo "${LIBOCL_SOURCE_DIR}/script/gen_vector.py ${def_name} ${output_name} 1"
+	COMMAND ${PYTHON_EXECUTABLE} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py ${def_name} ${output_name} 1
+	#COMMAND echo "echo \\#endif \\>\\> ${output_name}"
+	COMMAND echo "\\#endif" >> ${output_name}
+	DEPENDS ${tmpl_name} ${def_name} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py
+	COMMENT "Generate the header by python: ${output_name}"
+	)
+ENDMACRO(GENERATE_HEADER_PY)
+MACRO(GENERATE_SOURCE_PY _mod)
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/tmpl/\\1.tmpl.cl" tmpl_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/script/\\1.def" def_name ${_mod})
+    SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name})
+    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
+	COMMAND cat ${tmpl_name} > ${output_name}
+	COMMAND ${PYTHON_EXECUTABLE} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py ${def_name} ${output_name} 0
+	DEPENDS ${tmpl_name} ${def_name} ${LIBOCL_SOURCE_DIR}/script/gen_vector.py
+	COMMENT "Generate the source by python: ${output_name}"
+	)
+ENDMACRO(GENERATE_SOURCE_PY)
+
+SET (OCL_PY_GENERATED_MODULES ocl_common ocl_relational ocl_integer ocl_math)
+FOREACH(M ${OCL_PY_GENERATED_MODULES})
+    GENERATE_HEADER_PY(${M})
+    GENERATE_SOURCE_PY(${M})
+ENDFOREACH(M) 
+
+
+MACRO(GENERATE_HEADER_BASH _mod)
+    # Run the module's shell script (script/<mod>.sh) with -p to generate the header file.
+    STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/include/\\1.h" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/script/\\1.sh" sh_name ${_mod})
+    SET(OCL_HEADER_FILES ${OCL_HEADER_FILES} ${output_name})
+    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	COMMAND mkdir -p ${OCL_OBJECT_DIR}/include/
+	COMMAND ${sh_name} -p > ${output_name}
+	DEPENDS ${sh_name}
+	COMMENT "Generate the header by script: ${output_name}"
+	)
+ENDMACRO(GENERATE_HEADER_BASH)
+MACRO(GENERATE_SOURCE_BASH _mod)
+    # Run the module's shell script (script/<mod>.sh) and capture its output as the .cl source file.
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.cl" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/script/\\1.sh" sh_name ${_mod})
+    SET(OCL_SOURCE_FILES ${OCL_SOURCE_FILES} ${output_name})
+    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
+	COMMAND ${sh_name} > ${output_name}
+	DEPENDS ${sh_name}
+	COMMENT "Generate the source by script: ${output_name}"
+	)
+ENDMACRO(GENERATE_SOURCE_BASH)
+
+SET (OCL_BASH_GENERATED_MODULES ocl_as ocl_convert)
+FOREACH(M ${OCL_BASH_GENERATED_MODULES})
+    GENERATE_HEADER_BASH(${M})
+    GENERATE_SOURCE_BASH(${M})
+ENDFOREACH(M) 
+
+
+SET (CLANG_OCL_FLAGS -fno-builtin -ffp-contract=off -cl-kernel-arg-info -DGEN7_SAMPLER_CLAMP_BORDER_WORKAROUND "-cl-std=CL1.2")
+MACRO(ADD_CL_TO_BC_TARGET _file)
+    # CMake cannot express pattern rules, so use a MACRO instead.
+    STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" output_name ${_file})
+    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	COMMAND mkdir -p ${OCL_OBJECT_DIR}/
+	#COMMAND echo ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${LIBOCL_BINARY_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
+	COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -I ${OCL_OBJECT_DIR}/include/ -emit-llvm-bc -triple spir -o ${output_name} -x cl ${_file}
+	DEPENDS ${_file} ${OCL_HEADER_FILES}
+	COMMENT "Compiling ${_file}"
+	)
+ENDMACRO(ADD_CL_TO_BC_TARGET)
+
+
+FOREACH(f ${OCL_SOURCE_FILES})
+    ADD_CL_TO_BC_TARGET(${f})
+ENDFOREACH(f) 
+
+FOREACH(f ${OCL_SOURCE_FILES})
+    STRING(REGEX REPLACE "${LIBOCL_BINARY_DIR}/src/\(o.*\)\\.cl" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+    SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
+ENDFOREACH(f) 
+
+
+# handle the ll files
+MACRO(COPY_THE_LL _mod)
+    # Copy the .ll file unchanged from the source src/ directory to the binary src/ directory.
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.ll" output_name ${_mod})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_SOURCE_DIR}/src/\\1.ll" orgin_name ${_mod})
+    IF(orgin_name STREQUAL output_name)
+    ELSE(orgin_name STREQUAL output_name)
+	ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	    COMMAND mkdir -p ${LIBOCL_BINARY_DIR}/src/
+	    #COMMAND echo "cp ${orgin_name} ${output_name}"
+	    COMMAND cp ${orgin_name} ${output_name}
+	    DEPENDS ${orgin_name}
+	    COMMENT "Copy the LL file: ${output_name}"
+	    )
+    ENDIF(orgin_name STREQUAL output_name)
+ENDMACRO(COPY_THE_LL)
+MACRO(ADD_LL_TO_BC_TARGET M)
+    STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/\\1.bc" output_name ${M})
+    STRING(REGEX REPLACE "\(o.*\)" "${LIBOCL_BINARY_DIR}/src/\\1.ll" srcll_name ${M})
+    ADD_CUSTOM_COMMAND(OUTPUT ${output_name}
+	COMMAND mkdir -p ${OCL_OBJECT_DIR}/
+	#COMMAND echo ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
+	COMMAND ${LLVM_INSTALL_DIR}llvm-as -o ${output_name} ${srcll_name}
+	DEPENDS ${srcll_name}
+	COMMENT "Compiling ${srcll_name}"
+	)
+ENDMACRO(ADD_LL_TO_BC_TARGET)
+
+SET (OCL_LL_MODULES ocl_barrier ocl_memcpy ocl_memset)
+FOREACH(f ${OCL_LL_MODULES})
+    COPY_THE_LL(${f})
+    ADD_LL_TO_BC_TARGET(${f})
+    STRING(REGEX REPLACE "\(o.*\)" "${OCL_OBJECT_DIR}/\\1.bc" bc_name ${f})
+    SET(OCL_BC_FILES ${OCL_BC_FILES} ${bc_name})
+ENDFOREACH(f) 
+
+
+ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.bc
+    # Ensure the directory llvm-link writes to exists; lib/ is still created as before.
+    COMMAND mkdir -p ${OCL_OBJECT_DIR}/ ${LIBOCL_BINARY_DIR}/lib/
+    COMMAND ${LLVM_INSTALL_DIR}llvm-link -o ${OCL_OBJECT_DIR}/beignet.bc ${OCL_BC_FILES}
+    DEPENDS ${OCL_BC_FILES}
+    COMMENT "Generate the bitcode file: ${OCL_OBJECT_DIR}/beignet.bc"
+    )
+
+ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.local.pch
+    COMMAND mkdir -p ${OCL_OBJECT_DIR}
+    COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ -emit-pch -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.local.pch
+    DEPENDS ${OCL_HEADER_FILES}
+    COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.local.pch"
+    )
+
+ADD_CUSTOM_COMMAND(OUTPUT ${OCL_OBJECT_DIR}/beignet.pch
+    COMMAND mkdir -p ${OCL_OBJECT_DIR}
+    COMMAND ${LLVM_INSTALL_DIR}clang -cc1 ${CLANG_OCL_FLAGS} -triple spir -I ${OCL_OBJECT_DIR}/include/ --relocatable-pch -emit-pch -isysroot ${LIBOCL_BINARY_DIR} -x cl ${OCL_OBJECT_DIR}/include/ocl.h -o ${OCL_OBJECT_DIR}/beignet.pch
+    DEPENDS ${OCL_HEADER_FILES}
+    COMMENT "Generate the pch file: ${OCL_OBJECT_DIR}/beignet.pch"
+    )
+
+
+add_custom_target(beignet_bitcode ALL DEPENDS ${OCL_OBJECT_DIR}/beignet.bc ${OCL_OBJECT_DIR}/beignet.pch ${OCL_OBJECT_DIR}/beignet.local.pch)
+SET (OCL_OBJECT_DIR ${OCL_OBJECT_DIR} PARENT_SCOPE)
+SET (OCL_HEADER_FILES ${OCL_HEADER_FILES} PARENT_SCOPE)
diff --git a/backend/src/libocl/include/ocl.h b/backend/src/libocl/include/ocl.h
new file mode 100644
index 0000000..e886670
--- /dev/null
+++ b/backend/src/libocl/include/ocl.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_H__
+#define __OCL_H__
+
+#include "ocl_defines.h"
+#include "ocl_types.h"
+#include "ocl_as.h"
+#include "ocl_async.h"
+#include "ocl_atom.h"
+#include "ocl_common.h"
+#include "ocl_convert.h"
+#include "ocl_float.h"
+#include "ocl_geometric.h"
+#include "ocl_image.h"
+#include "ocl_integer.h"
+#include "ocl_math.h"
+#include "ocl_misc.h"
+#include "ocl_printf.h"
+#include "ocl_relational.h"
+#include "ocl_sync.h"
+#include "ocl_vload.h"
+#include "ocl_workitem.h"
+#pragma OPENCL EXTENSION cl_khr_fp64 : disable
+
+#endif
diff --git a/backend/src/libocl/include/ocl_async.h b/backend/src/libocl/include/ocl_async.h
new file mode 100644
index 0000000..dd89942
--- /dev/null
+++ b/backend/src/libocl/include/ocl_async.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_ASYNC_H__
+#define __OCL_ASYNC_H__
+
+#include "ocl_types.h"
+
+#define DEFN(TYPE) \
+OVERLOADABLE event_t async_work_group_copy (local TYPE *dst,  const global TYPE *src, \
+							 size_t num, event_t event); \
+OVERLOADABLE event_t async_work_group_copy (global TYPE *dst,  const local TYPE *src, \
+							  size_t num, event_t event); \
+OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst,  const global TYPE *src, \
+								 size_t num, size_t src_stride, event_t event); \
+OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst,  const local TYPE *src, \
+								  size_t num, size_t dst_stride, event_t event); \
+
+#define DEF(TYPE) \
+  DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+DEF(float)
+DEF(double)
+#undef DEFN
+#undef DEF
+
+void wait_group_events (int num_events, event_t *event_list);
+
+#define DEFN(TYPE) \
+OVERLOADABLE void prefetch(const global TYPE *p, size_t num);
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
+
+#endif
diff --git a/backend/src/libocl/include/ocl_atom.h b/backend/src/libocl/include/ocl_atom.h
new file mode 100644
index 0000000..d0f6b10
--- /dev/null
+++ b/backend/src/libocl/include/ocl_atom.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_ATOM_H__
+#define __OCL_ATOM_H__
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Atomic functions
+/////////////////////////////////////////////////////////////////////////////
+
+OVERLOADABLE uint atomic_add(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_add(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_add(volatile __global int *p, int val);
+OVERLOADABLE int atomic_add(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_sub(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_sub(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_sub(volatile __global int *p, int val);
+OVERLOADABLE int atomic_sub(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_and(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_and(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_and(volatile __global int *p, int val);
+OVERLOADABLE int atomic_and(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_or(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_or(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_or(volatile __global int *p, int val);
+OVERLOADABLE int atomic_or(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_xor(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xor(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xor(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xor(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_xchg(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_xchg(volatile __local uint *p, uint val);
+OVERLOADABLE int atomic_xchg(volatile __global int *p, int val);
+OVERLOADABLE int atomic_xchg(volatile __local int *p, int val);
+
+OVERLOADABLE int atomic_min(volatile __global int *p, int val);
+OVERLOADABLE int atomic_min(volatile __local int *p, int val);
+
+OVERLOADABLE int atomic_max(volatile __global int *p, int val);
+OVERLOADABLE int atomic_max(volatile __local int *p, int val);
+
+OVERLOADABLE uint atomic_min(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_min(volatile __local uint *p, uint val);
+
+OVERLOADABLE uint atomic_max(volatile __global uint *p, uint val);
+OVERLOADABLE uint atomic_max(volatile __local uint *p, uint val);
+
+OVERLOADABLE float atomic_xchg (volatile __global float *p, float val);
+OVERLOADABLE float atomic_xchg (volatile __local float *p, float val);
+
+OVERLOADABLE uint atomic_inc (volatile __global uint *p);
+OVERLOADABLE uint atomic_inc (volatile __local uint *p);
+OVERLOADABLE int atomic_inc (volatile __global int *p);
+OVERLOADABLE int atomic_inc (volatile __local int *p);
+
+OVERLOADABLE uint atomic_dec (volatile __global uint *p);
+OVERLOADABLE uint atomic_dec (volatile __local uint *p);
+OVERLOADABLE int atomic_dec (volatile __global int *p);
+OVERLOADABLE int atomic_dec (volatile __local int *p);
+
+OVERLOADABLE uint atomic_cmpxchg (volatile __global uint *p, uint cmp, uint val);
+OVERLOADABLE uint atomic_cmpxchg (volatile __local uint *p, uint cmp, uint val);
+OVERLOADABLE int atomic_cmpxchg (volatile __global int *p, int cmp, int val);
+OVERLOADABLE int atomic_cmpxchg (volatile __local int *p, int cmp, int val);
+
+
+// XXX for conformance test
+// The following atom_xxx api is on OpenCL spec 1.0.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+
+
+#endif  /* __OCL_ATOM_H__ */
diff --git a/backend/src/libocl/include/ocl_float.h b/backend/src/libocl/include/ocl_float.h
new file mode 100644
index 0000000..65577a3
--- /dev/null
+++ b/backend/src/libocl/include/ocl_float.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_FLOAT_H__
+#define __OCL_FLOAT_H__
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL floating-point macros and pragmas
+/////////////////////////////////////////////////////////////////////////////
+#define FLT_DIG 6
+#define FLT_MANT_DIG 24
+#define FLT_MAX_10_EXP +38
+#define FLT_MAX_EXP +128
+#define FLT_MIN_10_EXP -37
+#define FLT_MIN_EXP -125
+#define FLT_RADIX 2
+#define FLT_ONE 1.0000000000e+00         /* 0x3F800000 */
+#define FLT_MAX 0x1.fffffep127f
+#define FLT_MIN 0x1.0p-126f
+#define FLT_EPSILON 0x1.0p-23f
+
+#define MAXFLOAT     3.40282347e38F
+INLINE_OVERLOADABLE float __ocl_inff(void) {
+  union { uint u; float f; } u;
+  u.u = 0x7F800000;
+  return u.f;
+}
+INLINE_OVERLOADABLE float __ocl_nanf(void) {  /* returns a quiet NaN; backs the NAN macro */
+  union { uint u; float f; } u;
+  u.u = 0x7FC00000;  /* exponent all ones + mantissa MSB set = quiet NaN (0x7F800001 is signalling) */
+  return u.f;
+}
+typedef union
+{
+  float value;
+  uint  word;
+} float_shape_type;
+
+/* Get a 32 bit int from a float.  */
+#ifndef GEN_OCL_GET_FLOAT_WORD
+# define GEN_OCL_GET_FLOAT_WORD(i,d)  \
+do {                                  \
+  float_shape_type gf_u;              \
+  gf_u.value = (d);                   \
+  (i) = gf_u.word;                    \
+} while (0)
+#endif
+/* Set a float from a 32 bit int.  */
+#ifndef GEN_OCL_SET_FLOAT_WORD
+# define GEN_OCL_SET_FLOAT_WORD(d,i)  \
+do {                                  \
+  float_shape_type sf_u;              \
+  sf_u.word = (i);                    \
+  (d) = sf_u.value;                   \
+} while (0)
+#endif
+
+INLINE_OVERLOADABLE int __ocl_finitef (float x){
+  unsigned ix;
+  GEN_OCL_GET_FLOAT_WORD (ix, x);
+  return (ix & 0x7fffffff) < 0x7f800000;
+}
+
+#define HUGE_VALF    (__ocl_inff())
+#define INFINITY     (__ocl_inff())
+#define NAN          (__ocl_nanf())
+#define M_E_F        2.718281828459045F
+#define M_LOG2E_F    1.4426950408889634F
+#define M_LOG10E_F   0.43429448190325176F
+#define M_LN2_F      0.6931471805599453F
+#define M_LN10_F     2.302585092994046F
+#define M_PI_F       3.141592653589793F
+#define M_PI_2_F     1.5707963267948966F
+#define M_PI_4_F     0.7853981633974483F
+#define M_1_PI_F     0.3183098861837907F
+#define M_2_PI_F     0.6366197723675814F
+#define M_2_SQRTPI_F 1.1283791670955126F
+#define M_SQRT2_F    1.4142135623730951F
+#define M_SQRT1_2_F  0.7071067811865476F
+
+
+#endif /* __OCL_FLOAT_H__ */
diff --git a/backend/src/libocl/include/ocl_geometric.h b/backend/src/libocl/include/ocl_geometric.h
new file mode 100644
index 0000000..86d543b
--- /dev/null
+++ b/backend/src/libocl/include/ocl_geometric.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_GEOMETRIC_H__
+#define __OCL_GEOMETRIC_H__
+
+#include "ocl_types.h"
+
+OVERLOADABLE float dot(float p0, float p1);
+OVERLOADABLE float dot(float2 p0, float2 p1);
+OVERLOADABLE float dot(float3 p0, float3 p1);
+OVERLOADABLE float dot(float4 p0, float4 p1);
+OVERLOADABLE float length(float x);
+OVERLOADABLE float length(float2 x);
+OVERLOADABLE float length(float3 x);
+OVERLOADABLE float length(float4 x);
+OVERLOADABLE float distance(float x, float y);
+OVERLOADABLE float distance(float2 x, float2 y);
+OVERLOADABLE float distance(float3 x, float3 y);
+OVERLOADABLE float distance(float4 x, float4 y);
+OVERLOADABLE float normalize(float x);
+OVERLOADABLE float2 normalize(float2 x);
+OVERLOADABLE float3 normalize(float3 x);
+OVERLOADABLE float4 normalize(float4 x);
+
+OVERLOADABLE float fast_length(float x);
+OVERLOADABLE float fast_length(float2 x);
+OVERLOADABLE float fast_length(float3 x);
+OVERLOADABLE float fast_length(float4 x);
+OVERLOADABLE float fast_distance(float x, float y);
+OVERLOADABLE float fast_distance(float2 x, float2 y);
+OVERLOADABLE float fast_distance(float3 x, float3 y);
+OVERLOADABLE float fast_distance(float4 x, float4 y);
+OVERLOADABLE float fast_normalize(float x);
+OVERLOADABLE float2 fast_normalize(float2 x);
+OVERLOADABLE float3 fast_normalize(float3 x);
+OVERLOADABLE float4 fast_normalize(float4 x);
+
+OVERLOADABLE float3 cross(float3 v0, float3 v1);
+OVERLOADABLE float4 cross(float4 v0, float4 v1);
+
+#endif
diff --git a/backend/src/libocl/include/ocl_image.h b/backend/src/libocl/include/ocl_image.h
new file mode 100644
index 0000000..3637d56
--- /dev/null
+++ b/backend/src/libocl/include/ocl_image.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_IMAGE_H__
+#define __OCL_IMAGE_H__
+
+#include "ocl_types.h"
+
+OVERLOADABLE int4 read_imagei(image1d_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE int4 read_imagei(image1d_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE int4 read_imagei(image1d_t cl_image, int coord);
+OVERLOADABLE void write_imagei(image1d_t cl_image, int coord, int4 color);
+OVERLOADABLE void write_imagei(image1d_t cl_image, float coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image1d_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE uint4 read_imageui(image1d_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE uint4 read_imageui(image1d_t cl_image, int coord);
+OVERLOADABLE void write_imageui(image1d_t cl_image, int coord, uint4 color);
+OVERLOADABLE void write_imageui(image1d_t cl_image, float coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image1d_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE float4 read_imagef(image1d_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE float4 read_imagef(image1d_t cl_image, int coord);
+OVERLOADABLE void write_imagef(image1d_t cl_image, int coord, float4 color);
+OVERLOADABLE void write_imagef(image1d_t cl_image, float coord, float4 color);
+OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE int4 read_imagei(image1d_buffer_t cl_image, int coord);
+OVERLOADABLE void write_imagei(image1d_buffer_t cl_image, int coord, int4 color);
+OVERLOADABLE void write_imagei(image1d_buffer_t cl_image, float coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE uint4 read_imageui(image1d_buffer_t cl_image, int coord);
+OVERLOADABLE void write_imageui(image1d_buffer_t cl_image, int coord, uint4 color);
+OVERLOADABLE void write_imageui(image1d_buffer_t cl_image, float coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, const sampler_t sampler, int coord);
+OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, const sampler_t sampler, float coord);
+OVERLOADABLE float4 read_imagef(image1d_buffer_t cl_image, int coord);
+OVERLOADABLE void write_imagef(image1d_buffer_t cl_image, int coord, float4 color);
+OVERLOADABLE void write_imagef(image1d_buffer_t cl_image, float coord, float4 color);
+/* Image-info queries for image1d_t and image1d_buffer_t. */
+OVERLOADABLE int get_image_channel_data_type(image1d_t image);
+OVERLOADABLE int get_image_channel_order(image1d_t image);
+OVERLOADABLE int get_image_width(image1d_t image);
+OVERLOADABLE int get_image_channel_data_type(image1d_buffer_t image);
+OVERLOADABLE int get_image_channel_order(image1d_buffer_t image);
+OVERLOADABLE int get_image_width(image1d_buffer_t image);
+OVERLOADABLE int4 read_imagei(image2d_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE int4 read_imagei(image2d_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE int4 read_imagei(image2d_t cl_image, int2 coord);
+OVERLOADABLE void write_imagei(image2d_t cl_image, int2 coord, int4 color);
+OVERLOADABLE void write_imagei(image2d_t cl_image, float2 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image2d_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE uint4 read_imageui(image2d_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE uint4 read_imageui(image2d_t cl_image, int2 coord);
+OVERLOADABLE void write_imageui(image2d_t cl_image, int2 coord, uint4 color);
+OVERLOADABLE void write_imageui(image2d_t cl_image, float2 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image2d_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE float4 read_imagef(image2d_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE float4 read_imagef(image2d_t cl_image, int2 coord);
+OVERLOADABLE void write_imagef(image2d_t cl_image, int2 coord, float4 color);
+OVERLOADABLE void write_imagef(image2d_t cl_image, float2 coord, float4 color);
+OVERLOADABLE int4 read_imagei(image1d_array_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE int4 read_imagei(image1d_array_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE int4 read_imagei(image1d_array_t cl_image, int2 coord);
+OVERLOADABLE void write_imagei(image1d_array_t cl_image, int2 coord, int4 color);
+OVERLOADABLE void write_imagei(image1d_array_t cl_image, float2 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image1d_array_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE uint4 read_imageui(image1d_array_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE uint4 read_imageui(image1d_array_t cl_image, int2 coord);
+OVERLOADABLE void write_imageui(image1d_array_t cl_image, int2 coord, uint4 color);
+OVERLOADABLE void write_imageui(image1d_array_t cl_image, float2 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image1d_array_t cl_image, const sampler_t sampler, int2 coord);
+OVERLOADABLE float4 read_imagef(image1d_array_t cl_image, const sampler_t sampler, float2 coord);
+OVERLOADABLE float4 read_imagef(image1d_array_t cl_image, int2 coord);
+OVERLOADABLE void write_imagef(image1d_array_t cl_image, int2 coord, float4 color);
+OVERLOADABLE void write_imagef(image1d_array_t cl_image, float2 coord, float4 color);
+/* Image-info queries for image2d_t. */
+OVERLOADABLE int get_image_channel_data_type(image2d_t image);
+OVERLOADABLE int get_image_channel_order(image2d_t image);
+OVERLOADABLE int get_image_width(image2d_t image);
+OVERLOADABLE int get_image_height(image2d_t image);
+OVERLOADABLE int2 get_image_dim(image2d_t image);
+/* Image-info queries for image1d_array_t. */
+OVERLOADABLE int get_image_channel_data_type(image1d_array_t image);
+OVERLOADABLE int get_image_channel_order(image1d_array_t image);
+OVERLOADABLE int get_image_width(image1d_array_t image);
+OVERLOADABLE size_t get_image_array_size(image1d_array_t image);
+OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE int4 read_imagei(image3d_t cl_image, int4 coord);
+OVERLOADABLE void write_imagei(image3d_t cl_image, int4 coord, int4 color);
+OVERLOADABLE void write_imagei(image3d_t cl_image, float4 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE uint4 read_imageui(image3d_t cl_image, int4 coord);
+OVERLOADABLE void write_imageui(image3d_t cl_image, int4 coord, uint4 color);
+OVERLOADABLE void write_imageui(image3d_t cl_image, float4 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE float4 read_imagef(image3d_t cl_image, int4 coord);
+OVERLOADABLE void write_imagef(image3d_t cl_image, int4 coord, float4 color);
+OVERLOADABLE void write_imagef(image3d_t cl_image, float4 coord, float4 color);
+/* The same image3d_t accessors again, taking 3-component coordinates. */
+OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE int4 read_imagei(image3d_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE int4 read_imagei(image3d_t cl_image, int3 coord);
+OVERLOADABLE void write_imagei(image3d_t cl_image, int3 coord, int4 color);
+OVERLOADABLE void write_imagei(image3d_t cl_image, float3 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE uint4 read_imageui(image3d_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE uint4 read_imageui(image3d_t cl_image, int3 coord);
+OVERLOADABLE void write_imageui(image3d_t cl_image, int3 coord, uint4 color);
+OVERLOADABLE void write_imageui(image3d_t cl_image, float3 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE float4 read_imagef(image3d_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE float4 read_imagef(image3d_t cl_image, int3 coord);
+OVERLOADABLE void write_imagef(image3d_t cl_image, int3 coord, float4 color);
+OVERLOADABLE void write_imagef(image3d_t cl_image, float3 coord, float4 color);
+OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, int4 coord);
+OVERLOADABLE void write_imagei(image2d_array_t cl_image, int4 coord, int4 color);
+OVERLOADABLE void write_imagei(image2d_array_t cl_image, float4 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, int4 coord);
+OVERLOADABLE void write_imageui(image2d_array_t cl_image, int4 coord, uint4 color);
+OVERLOADABLE void write_imageui(image2d_array_t cl_image, float4 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, int4 coord);
+OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, float4 coord);
+OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, int4 coord);
+OVERLOADABLE void write_imagef(image2d_array_t cl_image, int4 coord, float4 color);
+OVERLOADABLE void write_imagef(image2d_array_t cl_image, float4 coord, float4 color);
+/* The same image2d_array_t accessors again, taking 3-component coordinates. */
+OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE int4 read_imagei(image2d_array_t cl_image, int3 coord);
+OVERLOADABLE void write_imagei(image2d_array_t cl_image, int3 coord, int4 color);
+OVERLOADABLE void write_imagei(image2d_array_t cl_image, float3 coord, int4 color);
+OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE uint4 read_imageui(image2d_array_t cl_image, int3 coord);
+OVERLOADABLE void write_imageui(image2d_array_t cl_image, int3 coord, uint4 color);
+OVERLOADABLE void write_imageui(image2d_array_t cl_image, float3 coord, uint4 color);
+OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, int3 coord);
+OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, const sampler_t sampler, float3 coord);
+OVERLOADABLE float4 read_imagef(image2d_array_t cl_image, int3 coord);
+OVERLOADABLE void write_imagef(image2d_array_t cl_image, int3 coord, float4 color);
+OVERLOADABLE void write_imagef(image2d_array_t cl_image, float3 coord, float4 color);
+
+OVERLOADABLE int get_image_channel_data_type(image3d_t image);
+OVERLOADABLE int get_image_channel_order(image3d_t image);
+OVERLOADABLE int get_image_width(image3d_t image);
+OVERLOADABLE int get_image_height(image3d_t image);
+OVERLOADABLE int get_image_depth(image3d_t image);
+OVERLOADABLE int4 get_image_dim(image3d_t image);
+
+/* Image-info queries for image2d_array_t. */
+OVERLOADABLE int get_image_channel_data_type(image2d_array_t image);
+OVERLOADABLE int get_image_channel_order(image2d_array_t image);
+OVERLOADABLE int get_image_width(image2d_array_t image);
+OVERLOADABLE int get_image_height(image2d_array_t image);
+OVERLOADABLE int2 get_image_dim(image2d_array_t image);
+OVERLOADABLE size_t get_image_array_size(image2d_array_t image);
+
+#endif
diff --git a/backend/src/libocl/include/ocl_misc.h b/backend/src/libocl/include/ocl_misc.h
new file mode 100644
index 0000000..aa3f504
--- /dev/null
+++ b/backend/src/libocl/include/ocl_misc.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_MISC_H__
+#define __OCL_MISC_H__
+
+#include "ocl_types.h"
+
+#define DEC2(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask);
+/* DEC2..DEC16 each declare one shuffle overload: result TYPE##n, mask MASKTYPE##n. */
+#define DEC4(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##4 shuffle(XTYPE x, MASKTYPE##4 mask);
+
+#define DEC8(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##8 shuffle(XTYPE x, MASKTYPE##8 mask);
+
+#define DEC16(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##16 shuffle(XTYPE x, MASKTYPE##16 mask);
+/* DEFMASK: every source width (2/4/8/16) for every result width. DECn already ends in ';', so none is added. */
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, MASKTYPE) DEC2(TYPE, TYPE##4, MASKTYPE) DEC2(TYPE, TYPE##8, MASKTYPE) DEC2(TYPE, TYPE##16, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, MASKTYPE) DEC4(TYPE, TYPE##4, MASKTYPE) DEC4(TYPE, TYPE##8, MASKTYPE) DEC4(TYPE, TYPE##16, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, MASKTYPE) DEC8(TYPE, TYPE##4, MASKTYPE) DEC8(TYPE, TYPE##8, MASKTYPE) DEC8(TYPE, TYPE##16, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, MASKTYPE) DEC16(TYPE, TYPE##4, MASKTYPE) DEC16(TYPE, TYPE##8, MASKTYPE) DEC16(TYPE, TYPE##16, MASKTYPE)
+/* DEF: repeat DEFMASK for each unsigned integer mask element type. */
+#define DEF(TYPE) \
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##2 mask);
+/* DECn never uses TEMPTYPE in its expansion; DECnX is DECn with both sources fixed to TYPE##16. */
+#define DEC2X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask);
+
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##4 mask);
+
+#define DEC4X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask);
+
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##8 mask);
+
+#define DEC8X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask);
+
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##16 mask);
+
+#define DEC16X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask);
+/* DEFMASK: source widths 2, 4, 8 and 16 for each result width, for one mask element type. */
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC2(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC2(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC2X(TYPE, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC4(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC4(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC4X(TYPE, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC8(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC8(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC8X(TYPE, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC16(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC16(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC16X(TYPE, MASKTYPE)
+/* DEF: repeat DEFMASK for each unsigned integer mask element type. */
+#define DEF(TYPE) \
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC2X
+#undef DEC4
+#undef DEC4X
+#undef DEC8
+#undef DEC8X
+#undef DEC16
+#undef DEC16X
+
+
+/* Temporary home for the SIMD functions. */
+/////////////////////////////////////////////////////////////////////////////
+// SIMD-level functions
+/////////////////////////////////////////////////////////////////////////////
+short __gen_ocl_simd_any(short);
+short __gen_ocl_simd_all(short);
+/* Value returned by __gen_ocl_get_timestamp(). */
+struct time_stamp {
+  // Time tick.
+  ulong tick;
+  // Non-zero if a context switch or frequency change has occurred since the
+  // last read of tm; zero otherwise.
+  uint event;
+};
+
+struct time_stamp __gen_ocl_get_timestamp(void);
+#endif
diff --git a/src/cl_khr_icd.h b/backend/src/libocl/include/ocl_printf.h
similarity index 55%
copy from src/cl_khr_icd.h
copy to backend/src/libocl/include/ocl_printf.h
index 1e206b4..ffeefb9 100644
--- a/src/cl_khr_icd.h
+++ b/backend/src/libocl/include/ocl_printf.h
@@ -1,10 +1,10 @@
-/* 
- * Copyright © 2013 Simon Richter
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -13,22 +13,20 @@
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
  */
-#ifndef __CL_KHR_ICD_H__
-#define __CL_KHR_ICD_H__
-
-#ifdef HAS_OCLIcd
+#ifndef __OCL_PRINTF_H__
+#define __OCL_PRINTF_H__
 
-#define SET_ICD(dispatch) \
-  dispatch = &cl_khr_icd_dispatch;
-#define INIT_ICD(member)  .member = &cl_khr_icd_dispatch,
-#define DEFINE_ICD(member) struct _cl_icd_dispatch const *member;
+#include "ocl_types.h"
 
-extern struct _cl_icd_dispatch const cl_khr_icd_dispatch;
+/* printf is routed to __gen_ocl_printf_stub through the #define below. */
+/* From LLVM 3.4 on, C strings are all in the constant address space. */
+#if 100*__clang_major__ + __clang_minor__ < 304
+int __gen_ocl_printf_stub(const char * format, ...);
 #else
-#define SET_ICD(dispatch)
-#define INIT_ICD(member)
-#define DEFINE_ICD(member)
+int __gen_ocl_printf_stub(constant char * format, ...);
 #endif
+#define printf __gen_ocl_printf_stub
 
 #endif
diff --git a/backend/src/libocl/include/ocl_sync.h b/backend/src/libocl/include/ocl_sync.h
new file mode 100644
index 0000000..ed7c6e4
--- /dev/null
+++ b/backend/src/libocl/include/ocl_sync.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_SYNC_H__
+#define __OCL_SYNC_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Synchronization functions
+/////////////////////////////////////////////////////////////////////////////
+#define CLK_LOCAL_MEM_FENCE  (1 << 0)
+#define CLK_GLOBAL_MEM_FENCE (1 << 1)
+/* NOTE(review): presumably an OR of the CLK_*_MEM_FENCE bits above - confirm. */
+typedef uint cl_mem_fence_flags;
+void barrier(cl_mem_fence_flags flags);
+void mem_fence(cl_mem_fence_flags flags);
+void read_mem_fence(cl_mem_fence_flags flags);
+void write_mem_fence(cl_mem_fence_flags flags);
+
+#endif  /* __OCL_SYNC_H__ */
diff --git a/backend/src/libocl/include/ocl_types.h b/backend/src/libocl/include/ocl_types.h
new file mode 100644
index 0000000..63478c9
--- /dev/null
+++ b/backend/src/libocl/include/ocl_types.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_TYPES_H__
+#define __OCL_TYPES_H__
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "ocl_defines.h"
+
+#define NULL ((void*)0)
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL Common Defines
+/////////////////////////////////////////////////////////////////////////////
+#define INLINE inline __attribute__((always_inline))
+#define OVERLOADABLE __attribute__((overloadable))
+#define PURE __attribute__((pure))
+#define CONST __attribute__((const))
+#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
+// FIXME: clang's OpenCL front end doesn't support static, so the keyword is defined away.
+#define static
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in scalar data types
+/////////////////////////////////////////////////////////////////////////////
+typedef unsigned char uchar;
+typedef unsigned short ushort;
+typedef unsigned int uint;
+typedef unsigned long ulong;
+typedef __typeof__(sizeof(int)) size_t;  /* whatever type sizeof yields */
+typedef __typeof__((int *)0-(int *)0) ptrdiff_t;  /* whatever type pointer subtraction yields */
+typedef signed int intptr_t;  /* NOTE(review): int-sized; presumably device pointers fit in an int - confirm */
+typedef unsigned int uintptr_t;
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL address space
+/////////////////////////////////////////////////////////////////////////////
+// These are built-ins from LLVM 3.3 on, so they are only defined here for 3.2 and older.
+#if 100*__clang_major__ + __clang_minor__ <= 302
+#define __private __attribute__((address_space(0)))
+#define __global __attribute__((address_space(1)))
+#define __constant __attribute__((address_space(2)))
+#define __local __attribute__((address_space(3)))
+#define global __global
+#define local __local
+#define constant __constant
+#define private __private
+#endif
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in vector data types: DEF(type); declares type2/3/4/8/16
+/////////////////////////////////////////////////////////////////////////////
+#define DEF(type) typedef type type##2 __attribute__((ext_vector_type(2)));\
+                  typedef type type##3 __attribute__((ext_vector_type(3)));\
+                  typedef type type##4 __attribute__((ext_vector_type(4)));\
+                  typedef type type##8 __attribute__((ext_vector_type(8)));\
+                  typedef type type##16 __attribute__((ext_vector_type(16))) /* final ';' comes from the DEF(type); use */
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+DEF(double);
+#undef DEF
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in image types
+/////////////////////////////////////////////////////////////////////////////
+// FIXME: transitional hack to bypass the LLVM 3.3 built-in types: each image
+// type is a pointer to an incomplete struct in address space 4, and the public
+// names are macros. See the Khronos SPIR specification for handling of these types.
+#define __texture __attribute__((address_space(4)))
+struct _image1d_t;
+typedef __texture struct _image1d_t* __image1d_t;
+struct _image1d_buffer_t;
+typedef __texture struct _image1d_buffer_t* __image1d_buffer_t;
+struct _image1d_array_t;
+typedef __texture struct _image1d_array_t* __image1d_array_t;
+struct _image2d_t;
+typedef __texture struct _image2d_t* __image2d_t;
+struct _image2d_array_t;
+typedef __texture struct _image2d_array_t* __image2d_array_t;
+struct _image3d_t;
+typedef __texture struct _image3d_t* __image3d_t;
+typedef const ushort __sampler_t;
+#define image1d_t __image1d_t
+#define image1d_buffer_t __image1d_buffer_t
+#define image1d_array_t __image1d_array_t
+#define image2d_t __image2d_t
+#define image2d_array_t __image2d_array_t
+#define image3d_t __image3d_t
+#define sampler_t __sampler_t
+
+/////////////////////////////////////////////////////////////////////////////
+// OpenCL built-in event types (event_t is a macro alias for a size_t typedef)
+/////////////////////////////////////////////////////////////////////////////
+typedef size_t __event_t;
+#define event_t __event_t
+
+#endif /* __OCL_TYPES_H__ */
diff --git a/backend/src/libocl/include/ocl_vload.h b/backend/src/libocl/include/ocl_vload.h
new file mode 100644
index 0000000..b1b1a32
--- /dev/null
+++ b/backend/src/libocl/include/ocl_vload.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_VLOAD_H__
+#define __OCL_VLOAD_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Vector loads and stores
+/////////////////////////////////////////////////////////////////////////////
+
+// These loads and stores will use untyped reads and writes, so we can just
+// cast to vector loads / stores. This is not C99 compliant because of the
+// aliasing rules, but that is fine: TBAA is not activated in the compiler.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p); \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p);
+/* Load-only variant; DECL_UNTYPED_RW_ALL uses it for __constant. */
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+/* Load-only variant of the 3-component declarations. */
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+  DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p);  \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p);
+/* Stores; DECL_BYTE_RW_ALL does not instantiate these for __constant. */
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p); \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p);
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+  DECL_BYTE_RD_SPACE(TYPE, __global) \
+  DECL_BYTE_RD_SPACE(TYPE, __local) \
+  DECL_BYTE_RD_SPACE(TYPE, __private) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant) \
+  DECL_BYTE_WR_SPACE(TYPE, __global) \
+  DECL_BYTE_WR_SPACE(TYPE, __local) \
+  DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+/* The helper macros are private to this header: undefine them again. */
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p); \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p);  \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p);
+/* ROUND is the rounding suffix pasted onto each name (empty, _rte, _rtz, _rtp, _rtn); FUNC is unused. */
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p);  \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p);  \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p); \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p);
+
+#define DECL_HALF_ST_SPACE(SPACE) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE,  , dummy) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, dummy) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, dummy) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, dummy) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, dummy)
+
+DECL_HALF_LD_SPACE(__global)
+DECL_HALF_LD_SPACE(__local)
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__private)
+
+DECL_HALF_ST_SPACE(__global)
+DECL_HALF_ST_SPACE(__local)
+DECL_HALF_ST_SPACE(__private)
+
+/* Drop the helper macros again; they are private to this header. */
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
+/* vloada_half3 is declared above; the other aligned loads alias the plain ones. */
+#define vloada_half vload_half
+#define vloada_half2 vload_half2
+#define vloada_half4 vload_half4
+#define vloada_half8 vload_half8
+#define vloada_half16 vload_half16
+
+#endif  /* __OCL_VLOAD_H__ */
diff --git a/CMake/CMakeConfigTemplate.hpp b/backend/src/libocl/include/ocl_workitem.h
similarity index 55%
copy from CMake/CMakeConfigTemplate.hpp
copy to backend/src/libocl/include/ocl_workitem.h
index 7702c54..7534ee8 100644
--- a/CMake/CMakeConfigTemplate.hpp
+++ b/backend/src/libocl/include/ocl_workitem.h
@@ -1,10 +1,10 @@
-/* 
- * Copyright © 2012 Intel Corporation
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -14,15 +14,19 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
+#ifndef __OCL_WORKITEM_H__
+#define __OCL_WORKITEM_H__
 
-#ifndef CMAKE_CONFIG_HPP
-#define CMAKE_CONFIG_HPP
+#include "ocl_types.h"
 
-#define ON true
-#define OFF false
-#define GEN_INSTALLATION_PATH "${CMAKE_INSTALL_PREFIX}/lib/i965/"
-
-#endif /* CMAKE_CONFIG_HPP */
+uint get_work_dim(void);
+uint get_global_size(uint dimindx);
+uint get_global_id(uint dimindx);
+uint get_local_size(uint dimindx);
+uint get_local_id(uint dimindx);
+uint get_num_groups(uint dimindx);
+uint get_group_id(uint dimindx);
+uint get_global_offset(uint dimindx);
 
+#endif  /* __OCL_WORKITEM_H__ */
diff --git a/backend/src/gen_builtin_vector.py b/backend/src/libocl/script/gen_vector.py
similarity index 80%
rename from backend/src/gen_builtin_vector.py
rename to backend/src/libocl/script/gen_vector.py
index 2d602c8..291dd87 100755
--- a/backend/src/gen_builtin_vector.py
+++ b/backend/src/libocl/script/gen_vector.py
@@ -5,7 +5,7 @@
 # This library is free software; you can redistribute it and/or
 # modify it under the terms of the GNU Lesser General Public
 # License as published by the Free Software Foundation; either
-# version 2 of the License, or (at your option) any later version.
+# version 2.1 of the License, or (at your option) any later version.
 #
 # This library is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -24,9 +24,9 @@ import re
 import sys
 import os
 
-if len(sys.argv) != 3:
+if len(sys.argv) != 4:
     print "Invalid argument {0}".format(sys.argv)
-    print "use {0} spec_file_name output_file_name".format(sys.argv[0])
+    print "use {0} spec_file_name output_file_name just_proto".format(sys.argv[0])
     raise
 
 all_vector = 1,2,3,4,8,16
@@ -197,12 +197,14 @@ class builtinProto():
     paramCount = 0
     outputStr = []
     prefix = ""
+    justproto = 0
 
-    def init(self, sectionHeader, sectionPrefix):
+    def init(self, sectionHeader, sectionPrefix, justproto):
         self.valueTypeStr = ""
         self.functionName = ""
         self.paramTypeStrs = []
         self.paramCount = 0
+        self.justproto = justproto
         if sectionHeader != "":
             self.outputStr = [sectionHeader]
         else:
@@ -264,7 +266,7 @@ class builtinProto():
                 return
 
             if (n == 0):
-                formatStr = 'INLINE_OVERLOADABLE {0}{1} {2} ('.format(vtype[0], vtype[1], self.functionName)
+                formatStr = 'OVERLOADABLE {0}{1} {2} ('.format(vtype[0], vtype[1], self.functionName)
             else:
                 formatStr += ', '
 
@@ -283,9 +285,46 @@ class builtinProto():
                 formatStr += '{0} {1}param{2}'.format(ptype[0], pointerStr, n)
 
         formatStr += ')'
-        formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
-        self.indent = len(formatStr)
-        for j in range(0, vtype[1]):
+        if self.justproto == "1":
+            formatStr += ';'
+            self.append(formatStr)
+            return formatStr
+        if self.functionName != 'select' and ptypeSeqs[0] == ptypeSeqs[self.paramCount-1] and ptype[1] > 4:
+            formatStr += '\n{ \n  union{'
+            formatStr = self.append(formatStr, '    {0} va[{1}];'.format(vtype[0], vtype[1]))
+            formatStr = self.append(formatStr, '    {0}{1} vv{2};'.format(vtype[0], vtype[1], vtype[1]))
+            formatStr += '\n  }uret;'
+            formatStr += '\n  union{'
+            formatStr = self.append(formatStr, '    {0} pa[{1}];'.format(ptype[0], ptype[1]))
+            formatStr = self.append(formatStr, '    {0}{1} pv{2};'.format(ptype[0], ptype[1], ptype[1]))
+            formatStr += '\n  }'
+            for n in range(0, self.paramCount):
+              formatStr += 'usrc{0}'.format(n)
+              if n+1 != self.paramCount:
+                formatStr +=', '
+            formatStr += ';'
+
+            for n in range(0, self.paramCount):
+              formatStr = self.append(formatStr, '  usrc{0}.pv{1} = param{2};'.format(n, ptype[1], n))
+            formatStr = self.append(formatStr, '  for(int i =0; i < {0}; i++)'.format(ptype[1]))
+            formatStr += '\n    uret.va[i] = '
+            if self.prefix == 'relational' and self.functionName != 'bitselect' and self.functionName != 'select':
+              formatStr += '-'
+            formatStr += '{0}('.format(self.functionName)
+
+            for n in range(0, self.paramCount):
+              formatStr += 'usrc{0}.pa[i]'.format(n)
+              if n+1 != self.paramCount:
+                formatStr +=', '
+            formatStr += ');'
+            formatStr = self.append(formatStr, ' return uret.vv{0};'.format(vtype[1]))
+            formatStr += '\n}'
+            formatStr = self.append(formatStr)
+            return formatStr
+        else:
+          formatStr = self.append(formatStr, '{{return ({0}{1})('.format(vtype[0], vtype[1]))
+          self.indent = len(formatStr)
+          for j in range(0, vtype[1]):
             if (j != 0):
                 formatStr += ','
                 if (j + 1) % 2 == 0:
@@ -314,16 +353,16 @@ class builtinProto():
                     formatStr += '({0} {1} *)param{2} + {3:2d}'.format(ptype[2], ptype[0], n, j)
                 else:
                     if (self.functionName == 'select' and n == 2):
-                        formatStr += '({0})(param{1}.s{2:X} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
+                        formatStr += '({0})(param{1}.s{2:x} & (({0})1 << (sizeof({0})*8 - 1)))'.format(ptype[0], n, j)
                     else:
-                        formatStr += 'param{0}.s{1:X}'.format(n, j)
+                        formatStr += 'param{0}.s{1:x}'.format(n, j)
 
             formatStr += ')'
 
-        formatStr += '); }\n'
-        self.append(formatStr)
+          formatStr += '); }\n'
+          self.append(formatStr)
 
-        return formatStr
+          return formatStr
 
     def output(self):
         for line in self.outputStr:
@@ -348,20 +387,13 @@ class builtinProto():
 
         self.append("")
 
-def safeUnlink(filename):
-    try:
-        os.remove(filename)
-    except OSError:
-        pass
-
 # save the prototypes into ocl_vector.h
 specFile = open(sys.argv[1], 'r')
 headerFileName = sys.argv[2]
-tempHeaderFileName = sys.argv[2] + '.tmp'
-safeUnlink(headerFileName)
-tempHeader = open(tempHeaderFileName, 'w')
+tempHeader = open(headerFileName, 'a')
+isJustProto = sys.argv[3]
 
-tempHeader.write("//This file is autogenerated by {0}.\n".format(sys.argv[0]))
+tempHeader.write("//Begin from this part is autogenerated.\n")
 tempHeader.write("//Don't modify it manually.\n")
 
 functionProto = builtinProto()
@@ -373,7 +405,7 @@ for line in specFile:
             sectionHeader = "//{0} builtin functions".format(line[2:].rstrip())
             sectionPrefix=(line[2:].split())[0]
         continue
-    functionProto.init(sectionHeader, sectionPrefix)
+    functionProto.init(sectionHeader, sectionPrefix, isJustProto)
     sectionHeader = ""
     setionPrefix = ""
     functionProto.init_from_line(line)
@@ -381,4 +413,3 @@ for line in specFile:
     functionProto.output(tempHeader)
 
 tempHeader.close()
-os.rename(tempHeaderFileName, headerFileName)
diff --git a/backend/src/libocl/script/ocl_as.sh b/backend/src/libocl/script/ocl_as.sh
new file mode 100755
index 0000000..a432189
--- /dev/null
+++ b/backend/src/libocl/script/ocl_as.sh
@@ -0,0 +1,147 @@
+#! /bin/sh -e
+
+echo '
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#ifndef __OCL_AS_H__"
+    echo "#define __OCL_AS_H__"
+    echo "#include \"ocl_types.h\""
+    echo
+else
+    echo "#include \"ocl_as.h\""
+    echo
+fi
+
+# Supported base types, each written as name:size (size of one element in bytes)
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+# Supported vector lengths
+VECTOR_LENGTHS="1 2 3 4 8 16"
+ROUNDING_MODES="rte rtz rtp rtn"
+
+# Generate list of union sizes
+for type in $TYPES; do
+        size=`IFS=:; set -- dummy $type; echo $3`
+        for vector_length in $VECTOR_LENGTHS; do
+                if test $vector_length -eq 3; then
+                      continue;
+                fi
+                union_sizes="$union_sizes `expr $vector_length \* $size`"
+        done
+done
+union_sizes="`echo $union_sizes | tr ' ' '\n' | sort -n | uniq`"
+
+# For each union size
+for union_size in $union_sizes; do
+
+    if [ $1"a" != "-pa" ]; then
+	# Define a union that contains all vector types that have the same size as the union
+	unionname="union _type_cast_${union_size}_b"
+	echo "$unionname {"
+	for type in $TYPES; do
+	    basetype=`IFS=:; set -- dummy $type; echo $2`
+	    basesize=`IFS=:; set -- dummy $type; echo $3`
+	    for vector_length in $VECTOR_LENGTHS; do
+		if test $vector_length -eq 3; then
+		    vector_size_length="4"
+		else
+		    vector_size_length=$vector_length;
+		fi
+		vector_size_in_union="`expr $vector_size_length \* $basesize`"
+		if test $union_size -ne $vector_size_in_union; then
+		    continue
+		fi
+		if test $vector_length -eq 1; then
+		    vectortype=$basetype
+		else
+		    vectortype=$basetype$vector_length
+		fi
+		echo "  $vectortype _$vectortype;"
+	    done
+
+	done
+	echo "};"
+	echo
+    fi
+
+    # For each tuple of vector types that has the same size as the current union size,
+    # define an as_* function that converts types without changing binary representation.
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	fbasesize=`IFS=:; set -- dummy $ftype; echo $3`
+	for fvector_length in $VECTOR_LENGTHS; do
+	    if test $fvector_length -eq 3; then
+		fvector_size_length="4"
+	    else
+		fvector_size_length=$fvector_length;
+	    fi
+	    fvector_size_in_union="`expr $fvector_size_length \* $fbasesize`"
+	    if test $union_size -ne $fvector_size_in_union; then
+		continue
+	    fi
+	    if test $fvector_length -eq 1; then
+		fvectortype=$fbasetype
+	    else
+		fvectortype=$fbasetype$fvector_length
+	    fi
+	    for ttype in $TYPES; do
+		tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+		tbasesize=`IFS=:; set -- dummy $ttype; echo $3`
+		if test $fbasetype = $tbasetype; then
+		    continue
+		fi
+		for tvector_length in $VECTOR_LENGTHS; do
+		    if test $tvector_length -eq 3; then
+			tvector_size_length="4"
+		    else
+			tvector_size_length=$tvector_length;
+		    fi
+		    tvector_size_in_union="`expr $tvector_size_length \* $tbasesize`"
+		    if test $union_size -ne $tvector_size_in_union; then
+			continue
+		    fi
+		    if test $tvector_length -eq 1; then
+			tvectortype=$tbasetype
+		    else
+			tvectortype=$tbasetype$tvector_length
+		    fi
+
+		    if [ $1"a" = "-pa" ]; then
+			echo "OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v);"
+		    else
+			echo "OVERLOADABLE $tvectortype as_$tvectortype($fvectortype v) {"
+			echo "  $unionname u;"
+			echo "  u._$fvectortype = v;"
+			echo "  return u._$tvectortype;"
+			echo "}"
+			echo
+		    fi
+		done
+	    done
+	done
+
+    done
+
+done
+
+
+if [ $1"a" = "-pa" ]; then
+    echo "#endif /* __OCL_AS_H__ */"
+fi
diff --git a/backend/src/libocl/script/ocl_common.def b/backend/src/libocl/script/ocl_common.def
new file mode 100644
index 0000000..fac5ef5
--- /dev/null
+++ b/backend/src/libocl/script/ocl_common.def
@@ -0,0 +1,22 @@
+##common
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentypef clamp (gentypef x, float minval, float maxval)
+gentyped clamp (gentyped x, double minval, double maxval)
+gentype degrees (gentype radians)
+gentype max (gentype x,  gentype y)
+gentypef max (gentypef x, float y)
+gentyped max (gentyped x, double y)
+gentype min (gentype x,  gentype y)
+gentypef min (gentypef x,  float y)
+gentyped min (gentyped x,  double y)
+gentype mix (gentype x, gentype y, gentype a)
+gentypef mix (gentypef x, gentypef y, float a)
+gentyped mix (gentyped x, gentyped y, double a)
+gentype radians (gentype degrees)
+gentype step (gentype edge, gentype x)
+gentypef step (float edge, gentypef x)
+gentyped step (double edge, gentyped x)
+gentype smoothstep (gentype edge0, gentype edge1, gentype x)
+gentypef smoothstep (float edge0, float edge1, gentypef x)
+gentyped smoothstep (double edge0, double edge1, gentyped x)
+gentype sign (gentype x)
diff --git a/backend/src/libocl/script/ocl_convert.sh b/backend/src/libocl/script/ocl_convert.sh
new file mode 100755
index 0000000..afaacab
--- /dev/null
+++ b/backend/src/libocl/script/ocl_convert.sh
@@ -0,0 +1,676 @@
+#! /bin/sh -e
+
+echo '
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#ifndef __OCL_CONVERT_H__"
+    echo "#define __OCL_CONVERT_H__"
+    echo "#include \"ocl_types.h\""
+    echo
+else
+    echo "#include \"ocl_convert.h\""
+    echo
+fi
+
+# Supported base types, each written as name:size (size of one element in bytes)
+TYPES="long:8 ulong:8 int:4 uint:4 short:2 ushort:2 char:1 uchar:1 double:8 float:4"
+# Supported vector lengths
+VECTOR_LENGTHS="1 2 3 4 8 16"
+ROUNDING_MODES="rte rtz rtp rtn"
+
+# For all vector lengths and types, generate conversion functions
+for vector_length in $VECTOR_LENGTHS; do
+    if test $vector_length -eq 1; then
+	for ftype in $TYPES; do
+	    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	    for ttype in $TYPES; do
+		tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v);"
+		else
+		    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) {"
+		    echo "  return ($tbasetype)v;"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    else
+	for ftype in $TYPES; do
+	    fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	    for ttype in $TYPES; do
+		tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+		if test $fbasetype = $tbasetype; then
+		    if test $vector_length -gt 1; then
+			fvectortype=$fbasetype$vector_length
+			tvectortype=$tbasetype$vector_length
+			if [ $1"a" = "-pa" ]; then
+			    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v);"
+			else
+			    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) { return v; }"
+			fi
+		    else
+			if [ $1"a" = "-pa" ]; then
+			    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v);"
+			else
+			    echo "OVERLOADABLE $tbasetype convert_$tbasetype($fbasetype v) { return v; }"
+			fi
+		    fi
+		    continue
+		fi
+		fvectortype=$fbasetype$vector_length
+		tvectortype=$tbasetype$vector_length
+		construct="($tbasetype)(v.s0)"
+		if test $vector_length -gt 1; then
+		    construct="$construct, ($tbasetype)(v.s1)"
+		fi
+		if test $vector_length -gt 2; then
+		    construct="$construct, ($tbasetype)(v.s2)"
+		fi
+		if test $vector_length -gt 3; then
+		    construct="$construct, ($tbasetype)(v.s3)"
+		fi
+		if test $vector_length -gt 4; then
+		    construct="$construct, ($tbasetype)(v.s4)"
+		    construct="$construct, ($tbasetype)(v.s5)"
+		    construct="$construct, ($tbasetype)(v.s6)"
+		    construct="$construct, ($tbasetype)(v.s7)"
+		fi
+		if test $vector_length -gt 8; then
+		    construct="$construct, ($tbasetype)(v.s8)"
+		    construct="$construct, ($tbasetype)(v.s9)"
+		    construct="$construct, ($tbasetype)(v.sA)"
+		    construct="$construct, ($tbasetype)(v.sB)"
+		    construct="$construct, ($tbasetype)(v.sC)"
+		    construct="$construct, ($tbasetype)(v.sD)"
+		    construct="$construct, ($tbasetype)(v.sE)"
+		    construct="$construct, ($tbasetype)(v.sF)"
+		fi
+
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v);"
+		else
+		    echo "OVERLOADABLE $tvectortype convert_$tvectortype($fvectortype v) {"
+		    echo "  return ($tvectortype)($construct);"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    fi
+done
+
+echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
+DEF(char, uchar);
+DEF(char, short);
+DEF(char, ushort);
+DEF(char, int);
+DEF(char, uint);
+DEF(char, float);
+DEF(uchar, char);
+DEF(uchar, short);
+DEF(uchar, ushort);
+DEF(uchar, int);
+DEF(uchar, uint);
+DEF(uchar, float);
+DEF(short, ushort);
+DEF(short, int);
+DEF(short, uint);
+DEF(short, float);
+DEF(ushort, short);
+DEF(ushort, int);
+DEF(ushort, uint);
+DEF(ushort, float);
+DEF(int, uint);
+DEF(int, float);
+DEF(uint, int);
+DEF(uint, float);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE, MIN, MAX)  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
+}
+'
+fi
+
+echo '
+DEF(char, long, -128, 127);
+DEF(uchar, long, 0, 255);
+DEF(short, long, -32768, 32767);
+DEF(ushort, long, 0, 65535);
+DEF(int, long, -0x7fffffff-1, 0x7fffffff);
+DEF(uint, long, 0, 0xffffffffu);
+DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
+DEF(ulong, float, 0, 1.8446744073709552e+19f);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE, MAX) OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE, MAX) \
+OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x >= MAX ? (DSTTYPE)MAX : x; \
+}
+'
+fi
+
+echo '
+DEF(char, ulong, 127);
+DEF(uchar, ulong, 255);
+DEF(short, ulong, 32767);
+DEF(ushort, ulong, 65535);
+DEF(int, ulong, 0x7fffffff);
+DEF(uint, ulong, 0xffffffffu);
+#undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo  "OVERLOADABLE long convert_long_sat(ulong x);"
+else
+    echo '
+OVERLOADABLE long convert_long_sat(ulong x) {
+  ulong MAX = 0x7ffffffffffffffful;
+  return x >= MAX ? MAX : x;
+}
+'
+fi
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE) OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x <= 0 ? 0 : x; \
+}
+'
+fi
+
+echo '
+  DEF(ushort, char);
+  DEF(uint, char);
+  DEF(uint, short);
+  DEF(ulong, char);
+  DEF(ulong, short);
+  DEF(ulong, int);
+  DEF(ulong, long);
+  #undef DEF
+'
+
+if [ $1"a" = "-pa" ]; then
+    echo "#define DEF(DSTTYPE, SRCTYPE) OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);"
+else
+    echo '
+#define DEF(DSTTYPE, SRCTYPE) \
+  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
+  return x; \
+}
+'
+fi
+
+echo '
+DEF(char, char);
+DEF(uchar, uchar);
+DEF(short, char);
+DEF(short, uchar);
+DEF(short, short);
+DEF(ushort, uchar);
+DEF(ushort, ushort);
+DEF(int, char);
+DEF(int, uchar);
+DEF(int, short);
+DEF(int, ushort);
+DEF(int, int);
+DEF(uint, uchar);
+DEF(uint, ushort);
+DEF(uint, uint);
+DEF(long, char);
+DEF(long, uchar);
+DEF(long, short);
+DEF(long, ushort);
+DEF(long, int);
+DEF(long, uint);
+DEF(long, long);
+DEF(ulong, uchar);
+DEF(ulong, ushort);
+DEF(ulong, uint);
+DEF(ulong, ulong);
+#undef DEF
+'
+
+# vector convert_DSTTYPE_sat function
+for vector_length in $VECTOR_LENGTHS; do
+    if test $vector_length -eq 1; then continue; fi
+
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	if test $fbasetype = "double"; then continue; fi
+
+	for ttype in $TYPES; do
+	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+	    if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+	    fvectortype=$fbasetype$vector_length
+	    tvectortype=$tbasetype$vector_length
+	    conv="convert_${tbasetype}_sat"
+
+	    construct="$conv(v.s0)"
+	    if test $vector_length -gt 1; then
+		construct="$construct, $conv(v.s1)"
+	    fi
+	    if test $vector_length -gt 2; then
+		construct="$construct, $conv(v.s2)"
+	    fi
+	    if test $vector_length -gt 3; then
+		construct="$construct, $conv(v.s3)"
+	    fi
+	    if test $vector_length -gt 4; then
+		construct="$construct, $conv(v.s4)"
+		construct="$construct, $conv(v.s5)"
+		construct="$construct, $conv(v.s6)"
+		construct="$construct, $conv(v.s7)"
+	    fi
+	    if test $vector_length -gt 8; then
+		construct="$construct, $conv(v.s8)"
+		construct="$construct, $conv(v.s9)"
+		construct="$construct, $conv(v.sA)"
+		construct="$construct, $conv(v.sB)"
+		construct="$construct, $conv(v.sC)"
+		construct="$construct, $conv(v.sD)"
+		construct="$construct, $conv(v.sE)"
+		construct="$construct, $conv(v.sF)"
+	    fi
+
+	    if [ $1"a" = "-pa" ]; then
+		echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v);"
+	    else
+		echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat($fvectortype v) {"
+		echo "  return ($tvectortype)($construct);"
+		echo "}"
+		echo
+	    fi
+	done
+    done
+done
+
+if [ $1"a" != "-pa" ]; then
+echo '
+float __gen_ocl_rndz(float x);
+float __gen_ocl_rnde(float x);
+float __gen_ocl_rndu(float x);
+float __gen_ocl_rndd(float x);
+OVERLOADABLE float __convert_float_rtz(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;
+  if((l > x && x > 0) || x >= 0x7fffffc000000000 ||
+    (l < x && x < 0)) {
+    u.u -= 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //can not use u.f < x
+  if(l < x && x < 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u + 1;
+    else
+      u.u = u.u - 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(long x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long l = u.f;  //avoid overflow
+  if(l > x || x >= 0x7fffffc000000000) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtz(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;
+  if(l > x  || x >= 0xffffff8000000000)
+    u.u -= 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(ulong x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong l = u.f;  //can not use u.f < x
+  if(l < x && x < 0xffffff8000000000)
+    u.u = u.u + 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(ulong x)
+{
+  return __convert_float_rtz(x);
+}
+OVERLOADABLE float __convert_float_rtz(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;
+  if((i > x && x > 0) ||
+    (i < x && x < 0)) {
+    u.u -= 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  int i = u.f;
+  if(i < x) {
+    if(x > 0)
+      u.u += 1;
+    else
+      u.u -= 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(int x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  long i = u.f;  //avoid overflow
+  if(i > x) {
+    if(x > 0)
+      u.u = u.u - 1;
+    else
+      u.u = u.u + 1;
+  }
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtz(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  ulong i = u.f;
+  if(i > x)
+    u.u -= 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtp(uint x)
+{
+  union {
+    uint u;
+    float f;
+  } u;
+  u.f = x;
+  uint i = u.f;
+  if(i < x)
+    u.u += 1;
+  return u.f;
+}
+OVERLOADABLE float __convert_float_rtn(uint x)
+{
+    return __convert_float_rtz(x);
+}
+'
+fi
+
+# convert_DSTTYPE_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	if test $fbasetype = "double"; then continue; fi
+
+	for ttype in $TYPES; do
+	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+	    if test $tbasetype = "double"; then continue; fi
+
+	    if test $vector_length -eq 1; then
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x);"
+		else
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rte($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rnde(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtz($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rndz(x); }"
+		    elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+			echo "{ return __convert_${tbasetype}_rtz(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtp($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rndu(x); }"
+		    elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+			echo "{ return __convert_${tbasetype}_rtp(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_rtn($fbasetype x)"
+		    if test $fbasetype = "float" -a $tbasetype != "float"; then
+			echo "{ return __gen_ocl_rndd(x); }"
+		    elif [ "$fbasetype" = "int" -o "$fbasetype" = "uint" -o "$fbasetype" = "long" -o "$fbasetype" = "ulong" ] && [ "$tbasetype" = "float" ]; then
+			echo "{ return __convert_${tbasetype}_rtn(x); }"
+		    else
+			echo "{ return x; }"
+		    fi
+		fi
+
+		continue
+	    fi
+
+	    for rounding in $ROUNDING_MODES; do
+		fvectortype=$fbasetype$vector_length
+		tvectortype=$tbasetype$vector_length
+		conv="convert_${tbasetype}_${rounding}"
+
+		construct="$conv(v.s0)"
+		if test $vector_length -gt 1; then
+		    construct="$construct, $conv(v.s1)"
+		fi
+		if test $vector_length -gt 2; then
+		    construct="$construct, $conv(v.s2)"
+		fi
+		if test $vector_length -gt 3; then
+		    construct="$construct, $conv(v.s3)"
+		fi
+		if test $vector_length -gt 4; then
+		    construct="$construct, $conv(v.s4)"
+		    construct="$construct, $conv(v.s5)"
+		    construct="$construct, $conv(v.s6)"
+		    construct="$construct, $conv(v.s7)"
+		fi
+		if test $vector_length -gt 8; then
+		    construct="$construct, $conv(v.s8)"
+		    construct="$construct, $conv(v.s9)"
+		    construct="$construct, $conv(v.sA)"
+		    construct="$construct, $conv(v.sB)"
+		    construct="$construct, $conv(v.sC)"
+		    construct="$construct, $conv(v.sD)"
+		    construct="$construct, $conv(v.sE)"
+		    construct="$construct, $conv(v.sF)"
+		fi
+
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v);"
+		else
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_${rounding}($fvectortype v) {"
+		    echo "  return ($tvectortype)($construct);"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    done
+done
+
+# convert_DSTTYPE_sat_ROUNDING function
+for vector_length in $VECTOR_LENGTHS; do
+    for ftype in $TYPES; do
+	fbasetype=`IFS=:; set -- dummy $ftype; echo $2`
+	if test $fbasetype = "double"; then continue; fi
+
+	for ttype in $TYPES; do
+	    tbasetype=`IFS=:; set -- dummy $ttype; echo $2`
+	    if test $tbasetype = "double" -o $tbasetype = "float"; then continue; fi
+
+	    if test $vector_length -eq 1; then
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x);"
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x);"
+		else
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rte($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rnde(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtz($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndz(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtp($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndu(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+
+		    echo "OVERLOADABLE $tbasetype convert_${tbasetype}_sat_rtn($fbasetype x)"
+		    if test $fbasetype = "float"; then
+			echo "{ return convert_${tbasetype}_sat(__gen_ocl_rndd(x)); }"
+		    else
+			echo "{ return convert_${tbasetype}_sat(x); }"
+		    fi
+		fi
+		continue
+	    fi
+
+	    for rounding in $ROUNDING_MODES; do
+		fvectortype=$fbasetype$vector_length
+		tvectortype=$tbasetype$vector_length
+		conv="convert_${tbasetype}_sat_${rounding}"
+
+		construct="$conv(v.s0)"
+		if test $vector_length -gt 1; then
+		    construct="$construct, $conv(v.s1)"
+		fi
+		if test $vector_length -gt 2; then
+		    construct="$construct, $conv(v.s2)"
+		fi
+		if test $vector_length -gt 3; then
+		    construct="$construct, $conv(v.s3)"
+		fi
+		if test $vector_length -gt 4; then
+		    construct="$construct, $conv(v.s4)"
+		    construct="$construct, $conv(v.s5)"
+		    construct="$construct, $conv(v.s6)"
+		    construct="$construct, $conv(v.s7)"
+		fi
+		if test $vector_length -gt 8; then
+		    construct="$construct, $conv(v.s8)"
+		    construct="$construct, $conv(v.s9)"
+		    construct="$construct, $conv(v.sA)"
+		    construct="$construct, $conv(v.sB)"
+		    construct="$construct, $conv(v.sC)"
+		    construct="$construct, $conv(v.sD)"
+		    construct="$construct, $conv(v.sE)"
+		    construct="$construct, $conv(v.sF)"
+		fi
+
+		if [ $1"a" = "-pa" ]; then
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v);"
+		else
+		    echo "OVERLOADABLE $tvectortype convert_${tvectortype}_sat_${rounding}($fvectortype v) {"
+		    echo "  return ($tvectortype)($construct);"
+		    echo "}"
+		    echo
+		fi
+	    done
+	done
+    done
+done
+
+if [ $1"a" = "-pa" ]; then
+    echo "#endif /* __OCL_CONVERT_H__ */"
+fi
diff --git a/backend/src/libocl/script/ocl_integer.def b/backend/src/libocl/script/ocl_integer.def
new file mode 100644
index 0000000..c35c242
--- /dev/null
+++ b/backend/src/libocl/script/ocl_integer.def
@@ -0,0 +1,30 @@
+##integer
+ugentype abs (gentype x)
+ugentype abs_diff (gentype x, gentype y)
+gentype add_sat (gentype x,  gentype y)
+gentype hadd (gentype x,  gentype y)
+gentype rhadd (gentype x, gentype y)
+gentype clamp (gentype x, gentype minval, gentype maxval)
+gentype clamp (gentype x, sgentype minval, sgentype maxval)
+gentype clz (gentype x)
+gentype mad_hi (gentype a, gentype b, gentype c)
+gentype mad_sat (gentype a, gentype b, gentype c)
+gentype max (gentype x,  gentype y)
+gentype max (gentype x,  sgentype y)
+gentype min (gentype x,  gentype y)
+gentype min (gentype x,  sgentype y)
+gentype mul_hi (gentype x,  gentype y)
+gentype rotate (gentype v,  gentype i)
+gentype sub_sat (gentype x,  gentype y)
+shortn upsample (charn hi, ucharn lo)
+ushortn upsample (ucharn hi, ucharn lo)
+intn upsample (shortn hi, ushortn lo)
+uintn upsample (ushortn hi, ushortn lo)
+longn upsample (intn hi, uintn lo)
+ulongn upsample (uintn hi, uintn lo)
+
+gentype popcount (gentype x)
+
+##fast_integer
+gentype mad24 (gentype x, gentype y, gentype z)
+gentype mul24 (gentype x, gentype y)
diff --git a/backend/src/libocl/script/ocl_math.def b/backend/src/libocl/script/ocl_math.def
new file mode 100644
index 0000000..5617c09
--- /dev/null
+++ b/backend/src/libocl/script/ocl_math.def
@@ -0,0 +1,164 @@
+##math
+gentype acos (gentype)
+gentype acosh (gentype)
+gentype acospi (gentype x)
+gentype asin (gentype)
+gentype asinh (gentype)
+gentype asinpi (gentype x)
+gentype atan (gentype y_over_x)
+gentype atan2 (gentype y, gentype x)
+gentype atanh (gentype)
+gentype atanpi (gentype x)
+gentype atan2pi (gentype y, gentype x)
+gentype cbrt (gentype)
+gentype ceil (gentype)
+gentype copysign (gentype x, gentype y)
+gentype cos (gentype)
+gentype cosh (gentype)
+gentype cospi (gentype x)
+gentype erfc (gentype)
+gentype erf (gentype)
+gentype exp (gentype x)
+gentype exp2 (gentype)
+gentype exp10 (gentype)
+gentype expm1 (gentype x)
+gentype fabs (gentype)
+gentype fdim (gentype x, gentype y)
+gentype floor (gentype)
+# XXX we use madd for fma
+gentype fma (gentype a, gentype b, gentype c)
+gentype fmax (gentype x, gentype y)
+gentypef fmax (gentypef x, float y)
+gentyped fmax (gentyped x, double y)
+gentype fmin (gentype x, gentype y)
+gentypef fmin (gentypef x, float y)
+gentyped fmin (gentyped x, double y)
+gentype fmod (gentype x, gentype y)
+gentype fract (gentype x, __global gentype *iptr)
+gentype fract (gentype x, __local gentype *iptr)
+gentype fract (gentype x, __private gentype *iptr)
+floatn frexp (floatn x, __global intn *exp)
+floatn frexp (floatn x, __local intn *exp)
+floatn frexp (floatn x, __private intn *exp)
+float frexp (float x, __global int *exp)
+float frexp (float x, __local int *exp)
+float frexp (float x, __private int *exp)
+doublen frexp (doublen x, __global intn *exp)
+doublen frexp (doublen x, __local intn *exp)
+doublen frexp (doublen x, __private intn *exp)
+double frexp (double x, __global int *exp)
+double frexp (double x, __local int *exp)
+double frexp (double x, __private int *exp)
+gentype hypot (gentype x, gentype y)
+intn ilogb (floatn x)
+int ilogb (float x)
+intn ilogb (doublen x)
+int ilogb (double x)
+floatn ldexp (floatn x, intn k)
+floatn ldexp (floatn x, int k)
+float ldexp (float x, int k)
+doublen ldexp (doublen x, intn k)
+doublen ldexp (doublen x, int k)
+double ldexp (double x, int k)
+gentype lgamma (gentype x)
+floatn lgamma_r (floatn x, __global intn *signp)
+floatn lgamma_r (floatn x, __local intn *signp)
+floatn lgamma_r (floatn x, __private intn *signp)
+float lgamma_r (float x, __global int *signp)
+float lgamma_r (float x, __local int *signp)
+float lgamma_r (float x,   __private int *signp)
+#doublen lgamma_r (doublen x, __global intn *signp)
+#doublen lgamma_r (doublen x, __local intn *signp)
+#doublen lgamma_r (doublen x, __private intn *signp)
+#double lgamma_r (double x, __global int *signp)
+#double lgamma_r (double x, __local int *signp)
+#double lgamma_r (double x, __private int *signp)
+gentype log (gentype)
+gentype log2 (gentype)
+gentype log10 (gentype)
+gentype log1p (gentype x)
+gentype logb (gentype x)
+gentype mad (gentype a, gentype b, gentype c)
+gentype maxmag (gentype x, gentype y)
+gentype minmag (gentype x, gentype y)
+gentype modf (gentype x, __global gentype *iptr)
+gentype modf (gentype x, __local gentype *iptr)
+gentype modf (gentype x, __private gentype *iptr)
+floatn nan (uintn nancode)
+float nan (uint nancode)
+doublen nan (ulongn nancode)
+double nan (ulong nancode)
+gentype nextafter (gentype x, gentype y)
+gentype pow (gentype x, gentype y)
+floatn pown (floatn x, intn y)
+float pown (float x, int y)
+doublen pown (doublen x, intn y)
+double pown (double x, int y)
+gentype powr (gentype x, gentype y)
+gentype remainder (gentype x, gentype y)
+floatn remquo (floatn x, floatn y, __global intn *quo)
+floatn remquo (floatn x, floatn y, __local intn *quo)
+floatn remquo (floatn x, floatn y, __private intn *quo)
+float remquo (float x, float y, __global int *quo)
+float remquo (float x, float y, __local int *quo)
+float remquo (float x, float y, __private int *quo)
+doublen remquo (doublen x, doublen y, __global intn *quo)
+doublen remquo (doublen x, doublen y, __local intn *quo)
+doublen remquo (doublen x, doublen y, __private intn *quo)
+double remquo (double x, double y, __global int *quo)
+double remquo (double x, double y, __local int *quo)
+double remquo (double x, double y, __private int *quo)
+gentype rint (gentype)
+floatn rootn (floatn x, intn y)
+float rootn (float x, int y)
+doublen rootn (doublen x, intn y)
+double rootn (double x, int y)
+gentype round (gentype x)
+gentype rsqrt (gentype)
+gentype sin (gentype)
+gentype sincos (gentype x, __global gentype *cosval)
+gentype sincos (gentype x, __local gentype *cosval)
+gentype sincos (gentype x, __private gentype *cosval)
+gentype sinh (gentype)
+gentype sinpi (gentype x)
+gentype sqrt (gentype)
+gentype tan (gentype)
+gentype tanh (gentype)
+gentype tanpi (gentype x)
+gentype tgamma (gentype)
+gentype trunc (gentype)
+
+
+# XXX we already defined all native and non-native
+# functions to the same one.
+gentype native_cos (gentype x)
+gentype native_divide (gentype x, gentype y)
+gentype native_exp (gentype x)
+gentype native_exp2 (gentype x)
+gentype native_exp10 (gentype x)
+gentype native_log (gentype x)
+gentype native_log2 (gentype x)
+gentype native_log10 (gentype x)
+gentype native_powr (gentype x, gentype y)
+gentype native_recip (gentype x)
+gentype native_rsqrt (gentype x)
+gentype native_sin (gentype x)
+gentype native_sqrt (gentype x)
+gentype native_tan (gentype x)
+
+
+##half_native_math
+#gentype half_cos (gentype x)
+#gentype half_divide (gentype x, gentype y)
+#gentype half_exp (gentype x)
+#gentype half_exp2 (gentype x)
+#gentype half_exp10 (gentype x)
+#gentype half_log (gentype x)
+#gentype half_log2 (gentype x)
+#gentype half_log10 (gentype x)
+#gentype half_powr (gentype x, gentype y)
+#gentype half_recip (gentype x)
+#gentype half_rsqrt (gentype x)
+#gentype half_sin (gentype x)
+#gentype half_sqrt (gentype x)
+#gentype half_tan (gentype x)
diff --git a/backend/src/libocl/script/ocl_relational.def b/backend/src/libocl/script/ocl_relational.def
new file mode 100644
index 0000000..379c511
--- /dev/null
+++ b/backend/src/libocl/script/ocl_relational.def
@@ -0,0 +1,34 @@
+##relational
+intn isequal (floatn x, floatn y)
+longn isequal (doublen x, doublen y)
+intn isnotequal (floatn x, floatn y)
+longn isnotequal (doublen x, doublen y)
+intn isgreater (floatn x, floatn y)
+longn isgreater (doublen x, doublen y)
+intn isgreaterequal (floatn x, floatn y)
+longn isgreaterequal (doublen x, doublen y)
+intn isless (floatn x, floatn y)
+longn isless (doublen x, doublen y)
+intn islessequal (floatn x, floatn y)
+longn islessequal (doublen x, doublen y)
+intn islessgreater (floatn x, floatn y)
+longn islessgreater (doublen x, doublen y)
+intn isfinite (floatn)
+longn isfinite (doublen)
+intn isinf (floatn)
+longn isinf (doublen)
+intn isnan (floatn)
+longn isnan (doublen)
+intn isnormal (floatn)
+longn isnormal (doublen)
+intn isordered (floatn x, floatn y)
+longn isordered (doublen x, doublen y)
+intn isunordered (floatn x, floatn y)
+longn isunordered (doublen x, doublen y)
+intn signbit (floatn)
+longn signbit (doublen)
+int any (igentype x)
+int all (igentype x)
+gentype bitselect (gentype a, gentype b, gentype c)
+gentype select (gentype a, gentype b, igentype c)
+gentype select (gentype a, gentype b, ugentype c)
diff --git a/backend/src/libocl/src/ocl_async.cl b/backend/src/libocl/src/ocl_async.cl
new file mode 100644
index 0000000..041aaf2
--- /dev/null
+++ b/backend/src/libocl/src/ocl_async.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "ocl_async.h"
+#include "ocl_sync.h"
+#include "ocl_workitem.h"
+/* BODY: shared copy loop; uses `dst`, `src` and `num` from the enclosing function and multiplies element offsets by the given strides. */
+#define BODY(SRC_STRIDE, DST_STRIDE) \
+  uint size = get_local_size(2) * get_local_size(1) * get_local_size(0); /* product of the three local sizes */ \
+  uint count = num / size;  /* number of full passes */ \
+  uint offset = get_local_id(2) * get_local_size(1) + get_local_id(1);  \
+  offset = offset * get_local_size(0) + get_local_id(0); /* linearised local id */ \
+  for(uint i=0; i<count; i+=1) { \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+    offset += size;                                 \
+  } \
+  if(offset < num) /* tail: elements left over after the full passes */ \
+    *(dst + offset * DST_STRIDE) = *(src + offset * SRC_STRIDE); \
+  return 0;
+/* DEFN(TYPE): async_work_group_copy and async_work_group_strided_copy in both directions (local<-global, global<-local); `event` is never read. */
+#define DEFN(TYPE) \
+OVERLOADABLE event_t async_work_group_copy (local TYPE *dst,  const global TYPE *src, \
+							 size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+OVERLOADABLE event_t async_work_group_copy (global TYPE *dst,  const local TYPE *src, \
+							  size_t num, event_t event) { \
+  BODY(1, 1); \
+} \
+OVERLOADABLE event_t async_work_group_strided_copy (local TYPE *dst,  const global TYPE *src, \
+								 size_t num, size_t src_stride, event_t event) { \
+  BODY(src_stride, 1); \
+} \
+OVERLOADABLE event_t async_work_group_strided_copy (global TYPE *dst,  const local TYPE *src, \
+								  size_t num, size_t dst_stride, event_t event) { \
+  BODY(1, dst_stride); \
+}
+#define DEF(TYPE) /* TYPE plus its 2/3/4/8/16-wide vector forms */ \
+  DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+DEF(float)
+DEF(double)
+#undef BODY
+#undef DEFN
+#undef DEF
+/* wait_group_events: both arguments are unused; it only issues a barrier with global and local memory fences. */
+void wait_group_events (int num_events, event_t *event_list) {
+  barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
+}
+/* prefetch: defined with an empty body for each listed scalar type and its 2/3/4/8/16-wide vectors (double is not in the list). */
+#define DEFN(TYPE) \
+OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
+#define DEF(TYPE) \
+DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
+DEF(char);
+DEF(uchar);
+DEF(short);
+DEF(ushort);
+DEF(int);
+DEF(uint);
+DEF(long);
+DEF(ulong);
+DEF(float);
+#undef DEFN
+#undef DEF
diff --git a/backend/src/libocl/src/ocl_atom.cl b/backend/src/libocl/src/ocl_atom.cl
new file mode 100644
index 0000000..0b6c671
--- /dev/null
+++ b/backend/src/libocl/src/ocl_atom.cl
@@ -0,0 +1,137 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_atom.h"
+#include "ocl_as.h"
+
+OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
+OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
+/* Two-operand atomics: atomic_<NAME>(p, val) casts p to `SPACE uint *`, calls __gen_ocl_<PREFIX><NAME> and casts the result back to TYPE. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) /* one overload per address space */ \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
+
+#define DECL_ATOMIC_OP(NAME) /* uint and int share the same atomic_<NAME> primitive */ \
+  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)        \
+  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
+
+DECL_ATOMIC_OP(add)
+DECL_ATOMIC_OP(sub)
+DECL_ATOMIC_OP(and)
+DECL_ATOMIC_OP(or)
+DECL_ATOMIC_OP(xor)
+DECL_ATOMIC_OP(xchg)
+DECL_ATOMIC_OP_TYPE(min, int, atomic_i)  /* int min/max use the atomic_i* primitives */
+DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
+DECL_ATOMIC_OP_TYPE(min, uint, atomic_u) /* uint min/max use the atomic_u* primitives */
+DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
+
+#undef DECL_ATOMIC_OP_SPACE
+/* float atomic_xchg: val goes through as_uint, the uint primitive's result through as_float. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
+    return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
+  }
+DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
+DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+/* atomic_inc / atomic_dec: pointer-only forms for uint and int, in __global and __local. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(inc)
+DECL_ATOMIC_OP(dec)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+/* atomic_cmpxchg(p, cmp, val): cmp and val are cast to uint for the primitive, the result is cast back to TYPE. */
+#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
+  OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
+    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
+  }
+
+#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
+  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
+
+#define DECL_ATOMIC_OP(NAME) \
+  DECL_ATOMIC_OP_TYPE(NAME, uint) \
+  DECL_ATOMIC_OP_TYPE(NAME, int)
+
+DECL_ATOMIC_OP(cmpxchg)
+
+#undef DECL_ATOMIC_OP
+#undef DECL_ATOMIC_OP_TYPE
+#undef DECL_ATOMIC_OP_SPACE
+
+// XXX for conformance test
+// The atom_xxx names below belong to the OpenCL 1.0 specification,
+// but the conformance test suite still tests them, so map each one to atomic_xxx.
+#define atom_add atomic_add
+#define atom_sub atomic_sub
+#define atom_and atomic_and
+#define atom_or atomic_or
+#define atom_xor atomic_xor
+#define atom_xchg atomic_xchg
+#define atom_min atomic_min
+#define atom_max atomic_max
+#define atom_inc atomic_inc
+#define atom_dec atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
diff --git a/backend/src/ocl_barrier.ll b/backend/src/libocl/src/ocl_barrier.ll
similarity index 100%
rename from backend/src/ocl_barrier.ll
rename to backend/src/libocl/src/ocl_barrier.ll
diff --git a/backend/src/libocl/src/ocl_geometric.cl b/backend/src/libocl/src/ocl_geometric.cl
new file mode 100644
index 0000000..e469ff9
--- /dev/null
+++ b/backend/src/libocl/src/ocl_geometric.cl
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_geometric.h"
+#include "ocl_common.h"
+#include "ocl_relational.h"
+#include "ocl_math.h"
+#include "ocl_float.h"
+
+PURE CONST float __gen_ocl_fabs(float x);
+/* dot: sum of the component-wise products, accumulated left to right in a single expression. */
+OVERLOADABLE float dot(float p0, float p1) {
+  return p0 * p1;
+}
+OVERLOADABLE float dot(float2 p0, float2 p1) {
+  return p0.s0 * p1.s0 + p0.s1 * p1.s1;
+}
+OVERLOADABLE float dot(float3 p0, float3 p1) {
+  return p0.s0 * p1.s0 + p0.s1 * p1.s1 + p0.s2 * p1.s2;
+}
+OVERLOADABLE float dot(float4 p0, float4 p1) {
+  return p0.s0 * p1.s0 + p0.s1 * p1.s1 + p0.s2 * p1.s2 + p0.s3 * p1.s3;
+}
+/* length: |x| for a scalar; vectors are divided by max(1, largest |component|) before the sqrt(dot) -- presumably to avoid overflow in the squares. */
+OVERLOADABLE float length(float x) { return __gen_ocl_fabs(x); }
+#define BODY \
+  if(m == 0) /* every component is zero */ \
+    return 0; \
+  if(isinf(m)) \
+    return INFINITY; \
+  if(m < 1) /* never scale up: the divisor is max(m, 1) */ \
+    m = 1; \
+  x /= m; \
+  return m * sqrt(dot(x,x));
+OVERLOADABLE float length(float2 x) {
+  float m = max(__gen_ocl_fabs(x.s0), __gen_ocl_fabs(x.s1));
+  BODY;
+}
+OVERLOADABLE float length(float3 x) {
+  float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), __gen_ocl_fabs(x.s2)));
+  BODY;
+}
+OVERLOADABLE float length(float4 x) {
+  float m = max(__gen_ocl_fabs(x.s0), max(__gen_ocl_fabs(x.s1), max(__gen_ocl_fabs(x.s2), __gen_ocl_fabs(x.s3))));
+  BODY;
+}
+#undef BODY
+OVERLOADABLE float distance(float x, float y) { const float d = x - y; return length(d); } /* distance = length of the difference */
+OVERLOADABLE float distance(float2 x, float2 y) { const float2 d = x - y; return length(d); }
+OVERLOADABLE float distance(float3 x, float3 y) { const float3 d = x - y; return length(d); }
+OVERLOADABLE float distance(float4 x, float4 y) { const float4 d = x - y; return length(d); }
+OVERLOADABLE float normalize(float x) { /* scalar normalize: 0 for +/-0, NaN for NaN, otherwise +1 or -1 by sign */
+  union { float f; unsigned u; } u;
+  u.f = x;
+  if((u.u & 0x7fffffffu) == 0) /* mask the sign bit so -0.0f is treated as zero, like the vector overloads */
+    return 0.f;
+  if(isnan(x))
+    return NAN;
+  return u.u < 0x7fffffff ? 1.f : -1.f; /* sign bit clear -> positive */
+}
+OVERLOADABLE float2 normalize(float2 x) {
+  const float len = length(x);
+  if(len != 0)
+    return x / len;
+  return 0; /* zero-length input yields an all-zero vector */
+}
+OVERLOADABLE float3 normalize(float3 x) {
+  const float len = length(x);
+  if(len != 0)
+    return x / len;
+  return 0; /* zero-length input yields an all-zero vector */
+}
+OVERLOADABLE float4 normalize(float4 x) {
+  const float len = length(x);
+  if(len != 0)
+    return x / len;
+  return 0; /* zero-length input yields an all-zero vector */
+}
+/* fast_* variants: fast_length is a plain sqrt(dot); fast_distance goes through length(); fast_normalize multiplies by rsqrt(dot). */
+OVERLOADABLE float fast_length(float x) { return __gen_ocl_fabs(x); }
+OVERLOADABLE float fast_length(float2 x) { return sqrt(dot(x, x)); }
+OVERLOADABLE float fast_length(float3 x) { return sqrt(dot(x, x)); }
+OVERLOADABLE float fast_length(float4 x) { return sqrt(dot(x, x)); }
+OVERLOADABLE float fast_distance(float x, float y) { const float d = x - y; return length(d); }
+OVERLOADABLE float fast_distance(float2 x, float2 y) { const float2 d = x - y; return length(d); }
+OVERLOADABLE float fast_distance(float3 x, float3 y) { const float3 d = x - y; return length(d); }
+OVERLOADABLE float fast_distance(float4 x, float4 y) { const float4 d = x - y; return length(d); }
+OVERLOADABLE float fast_normalize(float x) { if(x > 0) return 1.f; if(x < 0) return -1.f; return 0.f; }
+OVERLOADABLE float2 fast_normalize(float2 x) { return rsqrt(dot(x, x)) * x; }
+OVERLOADABLE float3 fast_normalize(float3 x) { return rsqrt(dot(x, x)) * x; }
+OVERLOADABLE float4 fast_normalize(float4 x) { return rsqrt(dot(x, x)) * x; }
+/* cross: 3-component cross product; the float4 overload ignores the inputs' .w and returns 0 in .w. */
+OVERLOADABLE float3 cross(float3 v0, float3 v1) {
+   return v0.yzx * v1.zxy - v0.zxy * v1.yzx;
+}
+OVERLOADABLE float4 cross(float4 v0, float4 v1) {
+   return (float4)(cross(v0.xyz, v1.xyz), 0.f);
+}
diff --git a/backend/src/libocl/src/ocl_image.cl b/backend/src/libocl/src/ocl_image.cl
new file mode 100644
index 0000000..fd421bf
--- /dev/null
+++ b/backend/src/libocl/src/ocl_image.cl
@@ -0,0 +1,429 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_image.h"
+#include "ocl_math.h"
+#include "ocl_integer.h"
+#include "ocl_common.h"
+
+// 1D read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
+
+// 2D & 1D Array read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
+
+// 3D & 2D Array read
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
+OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
+
+// 1D write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, float4 color);
+
+// 2D & 1D Array write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
+
+// 3D & 2D Array write
+OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
+OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
+OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
+
+int __gen_ocl_get_image_width(uint surface_id);
+int __gen_ocl_get_image_height(uint surface_id);
+int __gen_ocl_get_image_channel_data_type(uint surface_id);
+int __gen_ocl_get_image_channel_order(uint surface_id);
+int __gen_ocl_get_image_depth(uint surface_id);
+/* GEN_FIX_1 is 1 when GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND is defined and 0 otherwise; GET_IMAGE declares a uint surface id from an image handle. */
+// 2D 3D Image Common Macro
+#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
+#define GEN_FIX_1 1
+#else
+#define GEN_FIX_1 0
+#endif
+
+#define GET_IMAGE(cl_image, surface_id) /* declares `surface_id` in the caller's scope */ \
+    uint surface_id = (uint)cl_image
+OVERLOADABLE float __gen_compute_array_index(const float index, image1d_array_t image)
+{ /* float index: rint() first, then clamp to [0, depth - 1] */
+  GET_IMAGE(image, surface_id);
+  const float last = (float)__gen_ocl_get_image_depth(surface_id) - 1.f;
+  return clamp(rint(index), 0.f, last);
+}
+
+OVERLOADABLE float __gen_compute_array_index(float index, image2d_array_t image)
+{ /* float index: rint() first, then clamp to [0, depth - 1] */
+  GET_IMAGE(image, surface_id);
+  const float last = (float)__gen_ocl_get_image_depth(surface_id) - 1.f;
+  return clamp(rint(index), 0.f, last);
+}
+
+OVERLOADABLE int __gen_compute_array_index(int index, image1d_array_t image)
+{ /* int index: clamp to [0, depth - 1] */
+  GET_IMAGE(image, surface_id);
+  const int last = __gen_ocl_get_image_depth(surface_id) - 1;
+  return clamp(index, 0, last);
+}
+
+OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image)
+{ /* int index: clamp to [0, depth - 1] */
+  GET_IMAGE(image, surface_id);
+  const int last = __gen_ocl_get_image_depth(surface_id) - 1;
+  return clamp(index, 0, last);
+}
+/* DECL_READ_IMAGE0: sampler-based read_image<suffix>; with int_clamping_fix and a CLAMP + NEAREST sampler it uses EXPEND_READ_COORD, otherwise EXPEND_READ_COORDF plus a trailing 0. */
+#define DECL_READ_IMAGE0(int_clamping_fix,                                   \
+                        image_type, type, suffix, coord_type, n)             \
+  OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               const sampler_t sampler,      \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai);                          \
+    if (int_clamping_fix &&                                                  \
+        ((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) &&             \
+        ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST))               \
+            return   __gen_ocl_read_image ##suffix(                          \
+                        EXPEND_READ_COORD(surface_id, sampler, coord));      \
+    return  __gen_ocl_read_image ##suffix(                                   \
+                    EXPEND_READ_COORDF(surface_id, sampler, coord), 0);      \
+  }
+/* DECL_READ_IMAGE1: sampler-based read_image<suffix>; for CLAMP + NEAREST samplers it may apply FIXUP_FLOAT_COORD (unnormalized coords) and, with int_clamping_fix, read through EXPEND_READ_COORDI. */
+#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,         \
+                        image_type, type, suffix, coord_type, n)             \
+  OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               const sampler_t sampler,      \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, float, ai)                         \
+    coord_type tmpCoord = coord;                                             \
+    if (float_coord_rounding_fix | int_clamping_fix) {                       \
+      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
+          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
+        if (float_coord_rounding_fix                                         \
+            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
+          FIXUP_FLOAT_COORD(tmpCoord);                                       \
+        }                                                                    \
+        if (int_clamping_fix) {                                              \
+            coord_type intCoord;                                             \
+            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
+              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
+            } else                                                           \
+              intCoord = tmpCoord;                                           \
+            return   __gen_ocl_read_image ##suffix(                          \
+                       EXPEND_READ_COORDI(surface_id, sampler, intCoord));\
+       }                                                                     \
+      }                                                                      \
+    }                                                                        \
+    return  __gen_ocl_read_image ##suffix(                                   \
+                        EXPEND_READ_COORDF(surface_id, sampler, tmpCoord), 0);\
+  }
+/* DECL_READ_IMAGE_NOSAMPLER: sampler-less read_image<suffix>; passes the fixed value CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST as the sampler. */
+#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n)   \
+  OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
+                                               coord_type coord)             \
+  {                                                                          \
+    GET_IMAGE(cl_image, surface_id);                                         \
+    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai)                           \
+    return __gen_ocl_read_image ##suffix(                                    \
+           EXPEND_READ_COORDF(surface_id,                                    \
+                             CLK_NORMALIZED_COORDS_FALSE                     \
+                             | CLK_ADDRESS_NONE                              \
+                             | CLK_FILTER_NEAREST, (float)coord), 0);        \
+  }
+/* DECL_WRITE_IMAGE: write_image<suffix> forwards to __gen_ocl_write_image<suffix> with the argument list built by EXPEND_WRITE_COORD. */
+#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
+  OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
+  {\
+    GET_IMAGE(cl_image, surface_id);\
+    __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
+  }
+/* DECL_IMAGE_INFO_COMMON: get_image_channel_data_type, get_image_channel_order and get_image_width, each forwarding to the matching __gen_ocl_get_image_* query. */
+#define DECL_IMAGE_INFO_COMMON(image_type)    \
+  OVERLOADABLE  int get_image_channel_data_type(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_data_type(surface_id); \
+  }\
+  OVERLOADABLE  int get_image_channel_order(image_type image)\
+  { \
+    GET_IMAGE(image, surface_id);\
+    return __gen_ocl_get_image_channel_order(surface_id); \
+  } \
+  OVERLOADABLE int get_image_width(image_type image) \
+  { \
+    GET_IMAGE(image, surface_id); \
+    return __gen_ocl_get_image_width(surface_id);  \
+  }
+
+// 1D: DECL_IMAGE stamps out the read and write overloads for one (image type, pixel type) pair with scalar coordinates
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix)                       \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int, 1)               \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float, 1) /* first argument is always GEN_FIX_1 */ \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int, 1)                        \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int) /* write with an int coordinate */ \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float) /* write with a float coordinate */
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord, 1 /* coordinate passed through unchanged */
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord /* coordinate cast to float */
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord < 0 ? -1 : coord), 1 /* negative coordinates become -1 before the int cast */
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord = srcCoord * __gen_ocl_get_image_width(id); /* scale by the image width */
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord, color
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d) /* expands to nothing in this section */
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* a coordinate in the open range (-2^-20, 0) is moved further negative by 2^-9 */ \
+  {                                                            \
+    if (tmpCoord < 0 && tmpCoord > -0x1p-20f)                  \
+      tmpCoord += -0x1p-9f;                                     \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i) // integer pixel types pass GEN_FIX_1 as int_clamping_fix
+DECL_IMAGE(GEN_FIX_1, image1d_t, uint4, ui)
+DECL_IMAGE(0, image1d_t, float4, f) // float4 passes 0 as int_clamping_fix
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, int4, i)
+DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, uint4, ui)
+DECL_IMAGE(0, image1d_buffer_t, float4, f)
+
+// 1D Info: only the common queries (channel data type, channel order, width)
+DECL_IMAGE_INFO_COMMON(image1d_t)
+DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef DECL_IMAGE
+// End of 1D (GET_IMAGE_ARRAY_SIZE is not undefined here; its empty definition is reused by the 2D section)
+
+#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n) /* vector-coordinate variant: coordinate types are int##n and float##n */ \
+  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int ##n, n)              \
+  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) /* first argument is always GEN_FIX_1 */ \
+  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n)                       \
+  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                                  \
+  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
+// 2D: coordinates use .s0/.s1; GET_IMAGE_ARRAY_SIZE is still the empty definition made in the 1D section
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* per component: a value in the open range (-2^-20, 0) is moved further negative by 2^-9 */ \
+  {                                                            \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9f;                                  \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)            \
+      tmpCoord.s1 += -0x1p-9f;                                 \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2) // coordinates are int2 / float2
+DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
+DECL_IMAGE(0, image2d_t, float4, f, 2)
+
+// 1D Array: .s0 addresses the texel; .s1 is converted to the array index ai by __gen_compute_array_index
+#undef GET_IMAGE_ARRAY_SIZE
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, (int)0, ai, 2 /* ai is not a parameter: it is the variable declared via GET_IMAGE_ARRAY_SIZE in the expanding function */
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), 0, (int)ai, 2
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); /* only x is scaled */
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, __gen_compute_array_index(coord.s1, cl_image), color /* cl_image is not a parameter: it is the image argument name used by DECL_WRITE_IMAGE */
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) /* declares ai; the trailing ';' is part of the expansion */ \
+  coord_type ai = __gen_compute_array_index(coord.s1, image);
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* only .s0 is adjusted in this section */ \
+  {                                                            \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
+      tmpCoord.s0 += -0x1p-9f;                                  \
+  }
+
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2) // coordinates are int2 / float2
+DECL_IMAGE(GEN_FIX_1, image1d_array_t, uint4, ui, 2)
+DECL_IMAGE(0, image1d_array_t, float4, f, 2)
+
+// 2D Info: common queries plus height and (width, height) dimensions
+DECL_IMAGE_INFO_COMMON(image2d_t)
+OVERLOADABLE int get_image_height(image2d_t image) // forwards to __gen_ocl_get_image_height for this image's surface
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+OVERLOADABLE int2 get_image_dim(image2d_t image) // returns (width, height)
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+
+// 1D Array info: common queries plus the array size
+DECL_IMAGE_INFO_COMMON(image1d_array_t)
+OVERLOADABLE size_t get_image_array_size(image1d_array_t image) // the array size is read through __gen_ocl_get_image_depth
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDI
+#undef EXPEND_READ_COORDF
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 2D and 1D Array: the per-dimension helpers are dropped here and redefined by the 3D section
+
+// 3D: coordinates use .s0/.s1/.s2; overloads are generated for both 4- and 3-component coordinate vectors
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2, 1
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)coord.s2
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2), 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
+                                                  dstCoord.z = srcCoord.z * __gen_ocl_get_image_depth(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* per component: a value in the open range (-2^-20, 0) is moved further negative by 2^-9 */ \
+  {                                                             \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)              \
+      tmpCoord.s0 += -0x1p-9f;                                   \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)              \
+      tmpCoord.s1 += -0x1p-9f;                                   \
+    if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)              \
+      tmpCoord.s2 += -0x1p-9f;                                   \
+  }
+#define GET_IMAGE_ARRAY_SIZE(a,b,c,d) /* expands to nothing in this section */
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4) // coordinates are int4 / float4
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
+DECL_IMAGE(0, image3d_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3) // coordinates are int3 / float3
+DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
+DECL_IMAGE(0, image3d_t, float4, f, 3)
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+
+#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, ai, 1 /* 2D array helpers: ai (declared via GET_IMAGE_ARRAY_SIZE) takes the place of coord.s2 */
+#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)ai
+#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
+                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)ai, 1
+#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
+                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
+#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, __gen_compute_array_index(coord.s2, cl_image), color /* cl_image is not a parameter: it is the image argument name used by DECL_WRITE_IMAGE */
+
+#define FIXUP_FLOAT_COORD(tmpCoord) /* only .s0 and .s1 are adjusted in this section */ \
+  {                                                             \
+    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)              \
+      tmpCoord.s0 += -0x1p-9f;                                   \
+    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)              \
+      tmpCoord.s1 += -0x1p-9f;                                   \
+  }
+#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) /* declares ai from coord.s2; the trailing ';' is part of the expansion */ \
+  coord_type ai = __gen_compute_array_index(coord.s2, image);
+
+// 2D Array: overloads are generated for both 4- and 3-component coordinate vectors
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 4)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 4)
+DECL_IMAGE(0, image2d_array_t, float4, f, 4)
+
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 3)
+DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 3)
+DECL_IMAGE(0, image2d_array_t, float4, f, 3)
+
+// 3D Info: common queries plus height, depth and (width, height, depth, 0) dimensions
+DECL_IMAGE_INFO_COMMON(image3d_t)
+OVERLOADABLE int get_image_height(image3d_t image) // forwards to __gen_ocl_get_image_height for this image's surface
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+OVERLOADABLE int get_image_depth(image3d_t image) // forwards to __gen_ocl_get_image_depth for this image's surface
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+OVERLOADABLE int4 get_image_dim(image3d_t image) // returns (width, height, depth, 0); the fourth component is always 0
+{
+  return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0};
+}
+
+// 2D Array Info: common queries plus height, (width, height) dimensions and array size
+DECL_IMAGE_INFO_COMMON(image2d_array_t)
+OVERLOADABLE int get_image_height(image2d_array_t image) // forwards to __gen_ocl_get_image_height for this image's surface
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_height(surface_id);
+}
+OVERLOADABLE int2 get_image_dim(image2d_array_t image) // returns (width, height); the array size is not part of the result
+{
+  return (int2){get_image_width(image), get_image_height(image)};
+}
+OVERLOADABLE size_t get_image_array_size(image2d_array_t image) // the array size is read through __gen_ocl_get_image_depth
+{
+  GET_IMAGE(image, surface_id);
+  return __gen_ocl_get_image_depth(surface_id);
+}
+
+#undef EXPEND_READ_COORD
+#undef EXPEND_READ_COORDF
+#undef EXPEND_READ_COORDI
+#undef DENORMALIZE_COORD
+#undef EXPEND_WRITE_COORD
+#undef FIXUP_FLOAT_COORD
+#undef GET_IMAGE_ARRAY_SIZE
+// End of 3D and 2D Array
+
+#undef DECL_IMAGE
+#undef DECL_READ_IMAGE0 // the read helpers used by DECL_IMAGE are DECL_READ_IMAGE0 and DECL_READ_IMAGE1
+#undef DECL_READ_IMAGE1
+#undef DECL_READ_IMAGE_NOSAMPLER
+#undef DECL_WRITE_IMAGE
+#undef DECL_IMAGE_INFO_COMMON
+#undef GEN_FIX_1
+// End of Image
+#undef GET_IMAGE
diff --git a/backend/src/ocl_memcpy.ll b/backend/src/libocl/src/ocl_memcpy.ll
similarity index 62%
rename from backend/src/ocl_memcpy.ll
rename to backend/src/libocl/src/ocl_memcpy.ll
index 476033e..fbc44d1 100644
--- a/backend/src/ocl_memcpy.ll
+++ b/backend/src/libocl/src/ocl_memcpy.ll
@@ -1,7 +1,7 @@
 ;The memcpy's source code.
-; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+; INLINE_OVERLOADABLE void __gen_memcpy_align(uchar* dst, uchar* src, size_t size) {
 ;   size_t index = 0;
-;   while((index + 4) >= size) {
+;   while((index + 4) <= size) {
 ;     *((uint *)(dst + index)) = *((uint *)(src + index));
 ;     index += 4;
 ;   }
@@ -11,14 +11,14 @@
 ;   }
 ; }
 
-define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_gg_align(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -47,14 +47,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_gp_align(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -83,14 +83,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_gl_align(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -119,14 +119,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_pg_align(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -155,14 +155,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_pp_align(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -191,14 +191,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_pl_align(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -227,14 +227,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_lg_align(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -263,14 +263,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_lp_align(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -299,14 +299,14 @@ while.end7:                                       ; preds = %while.cond3
   ret void
 }
 
-define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
+define void @__gen_memcpy_ll_align(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline {
 entry:
   br label %while.cond
 
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond3, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -334,3 +334,219 @@ while.body5:                                      ; preds = %while.cond3
 while.end7:                                       ; preds = %while.cond3
   ret void
 }
+
+; C source of the byte-wise __gen_memcpy_* variants defined below (no alignment assumed).
+; INLINE_OVERLOADABLE void __gen_memcpy(uchar* dst, uchar* src, size_t size) {
+;   size_t index = 0;
+;   while(index < size) {
+;     dst[index] = src[index];
+;     index++;
+;   }
+; }
+
+define void @__gen_memcpy_gg(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(1), dst in addrspace(1)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(1)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_gp(i8 addrspace(1)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(0), dst in addrspace(1)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(0)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_gl(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(3), dst in addrspace(1)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(3)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(1)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(1)*
+  store i8 %3, i8 addrspace(1)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_pg(i8 addrspace(0)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(1), dst in addrspace(0)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(1)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_pp(i8 addrspace(0)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(0), dst in addrspace(0)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(0)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_pl(i8 addrspace(0)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(3), dst in addrspace(0)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(3)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(0)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(0)*
+  store i8 %3, i8 addrspace(0)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_lg(i8 addrspace(3)* %dst, i8 addrspace(1)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(1), dst in addrspace(3)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(1)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  %3 = load i8 addrspace(1)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_lp(i8 addrspace(3)* %dst, i8 addrspace(0)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(0), dst in addrspace(3)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(0)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  %3 = load i8 addrspace(0)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memcpy_ll(i8 addrspace(3)* %dst, i8 addrspace(3)* %src, i32 %size) nounwind alwaysinline { ; byte-wise copy of %size bytes: src in addrspace(3), dst in addrspace(3)
+entry:
+  %cmp4 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp4, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.05 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(3)* %src to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.05
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  %3 = load i8 addrspace(3)* %2, align 1          ; src[index]
+  %4 = ptrtoint i8 addrspace(3)* %dst to i32
+  %5 = add i32 %4, %index.05
+  %6 = inttoptr i32 %5 to i8 addrspace(3)*
+  store i8 %3, i8 addrspace(3)* %6, align 1       ; dst[index] = src[index]
+  %inc = add i32 %index.05, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
diff --git a/backend/src/ocl_memset.ll b/backend/src/libocl/src/ocl_memset.ll
similarity index 64%
rename from backend/src/ocl_memset.ll
rename to backend/src/libocl/src/ocl_memset.ll
index addf9f5..665eac4 100644
--- a/backend/src/ocl_memset.ll
+++ b/backend/src/libocl/src/ocl_memset.ll
@@ -1,5 +1,5 @@
 ;The memset's source code.
-; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+; INLINE_OVERLOADABLE void __gen_memset_align(uchar* dst, uchar val, size_t size) {
 ;   size_t index = 0;
 ;   uint v = (val << 24) | (val << 16) | (val << 8) | val;
 ;   while((index + 4) >= size) {
@@ -12,7 +12,7 @@
 ;  }
 ; }
 
-define void @__gen_memset_p(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_p_align(i8* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
   %conv = zext i8 %val to i32
   %shl = shl nuw i32 %conv, 24
@@ -26,7 +26,7 @@ entry:
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond10, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -50,7 +50,7 @@ while.end14:                                      ; preds = %while.cond10
   ret void
 }
 
-define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_g_align(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
   %conv = zext i8 %val to i32
   %shl = shl nuw i32 %conv, 24
@@ -64,7 +64,7 @@ entry:
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond10, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -88,7 +88,7 @@ while.end14:                                      ; preds = %while.cond10
   ret void
 }
 
-define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
+define void @__gen_memset_l_align(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline {
 entry:
   %conv = zext i8 %val to i32
   %shl = shl nuw i32 %conv, 24
@@ -102,7 +102,7 @@ entry:
 while.cond:                                       ; preds = %while.body, %entry
   %index.0 = phi i32 [ 0, %entry ], [ %add, %while.body ]
   %add = add i32 %index.0, 4
-  %cmp = icmp ult i32 %add, %size
+  %cmp = icmp ugt i32 %add, %size
   br i1 %cmp, label %while.cond10, label %while.body
 
 while.body:                                       ; preds = %while.cond
@@ -125,3 +125,69 @@ while.body13:                                     ; preds = %while.cond10
 while.end14:                                      ; preds = %while.cond10
   ret void
 }
+
+; C source of the byte-wise __gen_memset_* variants defined below (no alignment assumed).
+; INLINE_OVERLOADABLE void __gen_memset(uchar* dst, uchar val, size_t size) {
+;   size_t index = 0;
+;   while(index < size) {
+;     dst[index] = val;
+;     index++;
+;  }
+; }
+
+define void @__gen_memset_p(i8 addrspace(0)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { ; byte-wise fill of %size bytes with %val: dst in addrspace(0)
+entry:
+  %cmp3 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(0)* %dst to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(0)*
+  store i8 %val, i8 addrspace(0)* %2, align 1     ; dst[index] = val
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memset_g(i8 addrspace(1)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { ; byte-wise fill of %size bytes with %val: dst in addrspace(1)
+entry:
+  %cmp3 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(1)* %dst to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(1)*
+  store i8 %val, i8 addrspace(1)* %2, align 1     ; dst[index] = val
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
+
+define void @__gen_memset_l(i8 addrspace(3)* %dst, i8 zeroext %val, i32 %size) nounwind alwaysinline { ; byte-wise fill of %size bytes with %val: dst in addrspace(3)
+entry:
+  %cmp3 = icmp eq i32 %size, 0                    ; size == 0: skip the loop entirely
+  br i1 %cmp3, label %while.end, label %while.body
+
+while.body:                                       ; preds = %entry, %while.body
+  %index.04 = phi i32 [ %inc, %while.body ], [ 0, %entry ] ; byte index: 0 from entry, %inc from the back edge
+  %0 = ptrtoint i8 addrspace(3)* %dst to i32      ; element address is computed with 32-bit integer arithmetic
+  %1 = add i32 %0, %index.04
+  %2 = inttoptr i32 %1 to i8 addrspace(3)*
+  store i8 %val, i8 addrspace(3)* %2, align 1     ; dst[index] = val
+  %inc = add i32 %index.04, 1
+  %cmp = icmp ult i32 %inc, %size                 ; keep looping while index + 1 is below size (unsigned)
+  br i1 %cmp, label %while.body, label %while.end
+
+while.end:                                        ; preds = %while.body, %entry
+  ret void
+}
diff --git a/backend/src/libocl/src/ocl_misc.cl b/backend/src/libocl/src/ocl_misc.cl
new file mode 100644
index 0000000..7f40054
--- /dev/null
+++ b/backend/src/libocl/src/ocl_misc.cl
@@ -0,0 +1,231 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_misc.h"
+
+#define DEC2(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask) { \
+    TYPE *src = (TYPE *) &x; /* scalar view of the source vector */ \
+    mask &= (MASKTYPE##2)(vec_step(x) - 1); /* keep only the bits that index into x */ \
+    return (TYPE##2)(src[mask.s0], \
+                     src[mask.s1]); \
+  }
+
+#define DEC4(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##4 shuffle(XTYPE x, MASKTYPE##4 mask) { \
+    TYPE *src = (TYPE *) &x; /* scalar view of the source vector */ \
+    mask &= (MASKTYPE##4)(vec_step(x) - 1); /* keep only the bits that index into x */ \
+    return (TYPE##4)(src[mask.s0], \
+                     src[mask.s1], \
+                     src[mask.s2], \
+                     src[mask.s3]); \
+  }
+
+#define DEC8(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##8 shuffle(XTYPE x, MASKTYPE##8 mask) { \
+    TYPE *src = (TYPE *) &x; /* scalar view of the source vector */ \
+    mask &= (MASKTYPE##8)(vec_step(x) - 1); /* keep only the bits that index into x */ \
+    return (TYPE##8)(src[mask.s0], \
+                     src[mask.s1], \
+                     src[mask.s2], \
+                     src[mask.s3], \
+                     src[mask.s4], \
+                     src[mask.s5], \
+                     src[mask.s6], \
+                     src[mask.s7]); \
+  }
+
+#define DEC16(TYPE, XTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##16 shuffle(XTYPE x, MASKTYPE##16 mask) { \
+    TYPE *src = (TYPE *) &x; /* scalar view of the source vector */ \
+    mask &= (MASKTYPE##16)(vec_step(x) - 1); /* keep only the bits that index into x */ \
+    return (TYPE##16)(src[mask.s0], \
+                      src[mask.s1], \
+                      src[mask.s2], \
+                      src[mask.s3], \
+                      src[mask.s4], \
+                      src[mask.s5], \
+                      src[mask.s6], \
+                      src[mask.s7], \
+                      src[mask.s8], \
+                      src[mask.s9], \
+                      src[mask.sa], \
+                      src[mask.sb], \
+                      src[mask.sc], \
+                      src[mask.sd], \
+                      src[mask.se], \
+                      src[mask.sf]); \
+  }
+
+#define DEFMASK(TYPE, MASKTYPE) /* one shuffle() per result width (2..16) and source width (2..16); each DECn expands to a function definition, so no ';' separators */ \
+  DEC2(TYPE, TYPE##2, MASKTYPE) DEC2(TYPE, TYPE##4, MASKTYPE) DEC2(TYPE, TYPE##8, MASKTYPE) DEC2(TYPE, TYPE##16, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, MASKTYPE) DEC4(TYPE, TYPE##4, MASKTYPE) DEC4(TYPE, TYPE##8, MASKTYPE) DEC4(TYPE, TYPE##16, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, MASKTYPE) DEC8(TYPE, TYPE##4, MASKTYPE) DEC8(TYPE, TYPE##8, MASKTYPE) DEC8(TYPE, TYPE##16, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, MASKTYPE) DEC16(TYPE, TYPE##4, MASKTYPE) DEC16(TYPE, TYPE##8, MASKTYPE) DEC16(TYPE, TYPE##16, MASKTYPE)
+
+#define DEF(TYPE) \
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEC2(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##2 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); /* join x and y into one TEMPTYPE vector and reuse single-source shuffle() */ \
+  }
+
+#define DEC2X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
+    TYPE##2 z; /* per mask lane: bit 4 picks y over x, bits 0-3 pick the element; higher bits are ignored as OpenCL requires */ \
+    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
+    return z; \
+  }
+
+#define DEC4(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##4 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); /* join x and y into one TEMPTYPE vector and reuse single-source shuffle() */ \
+  }
+
+#define DEC4X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
+    TYPE##4 z; /* per mask lane: bit 4 picks y over x, bits 0-3 pick the element; higher bits are ignored as OpenCL requires */ \
+    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
+    z.s2 = (mask.s2 & 16) ? ((TYPE *)&y)[mask.s2 & 15] : ((TYPE *)&x)[mask.s2 & 15]; \
+    z.s3 = (mask.s3 & 16) ? ((TYPE *)&y)[mask.s3 & 15] : ((TYPE *)&x)[mask.s3 & 15]; \
+    return z; \
+  }
+
+#define DEC8(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##8 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); /* join x and y into one TEMPTYPE vector and reuse single-source shuffle() */ \
+  }
+
+#define DEC8X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
+    TYPE##8 z; /* per mask lane: bit 4 picks y over x, bits 0-3 pick the element; higher bits are ignored as OpenCL requires */ \
+    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
+    z.s2 = (mask.s2 & 16) ? ((TYPE *)&y)[mask.s2 & 15] : ((TYPE *)&x)[mask.s2 & 15]; \
+    z.s3 = (mask.s3 & 16) ? ((TYPE *)&y)[mask.s3 & 15] : ((TYPE *)&x)[mask.s3 & 15]; \
+    z.s4 = (mask.s4 & 16) ? ((TYPE *)&y)[mask.s4 & 15] : ((TYPE *)&x)[mask.s4 & 15]; \
+    z.s5 = (mask.s5 & 16) ? ((TYPE *)&y)[mask.s5 & 15] : ((TYPE *)&x)[mask.s5 & 15]; \
+    z.s6 = (mask.s6 & 16) ? ((TYPE *)&y)[mask.s6 & 15] : ((TYPE *)&x)[mask.s6 & 15]; \
+    z.s7 = (mask.s7 & 16) ? ((TYPE *)&y)[mask.s7 & 15] : ((TYPE *)&x)[mask.s7 & 15]; \
+    return z; \
+  }
+
+#define DEC16(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##16 mask) { \
+    return shuffle((TEMPTYPE)(x, y), mask); /* join x and y into one TEMPTYPE vector and reuse single-source shuffle() */ \
+  }
+
+#define DEC16X(TYPE, MASKTYPE) \
+  OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
+    TYPE##16 z; /* per mask lane: bit 4 picks y over x, bits 0-3 pick the element; higher bits are ignored as OpenCL requires */ \
+    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
+    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
+    z.s2 = (mask.s2 & 16) ? ((TYPE *)&y)[mask.s2 & 15] : ((TYPE *)&x)[mask.s2 & 15]; \
+    z.s3 = (mask.s3 & 16) ? ((TYPE *)&y)[mask.s3 & 15] : ((TYPE *)&x)[mask.s3 & 15]; \
+    z.s4 = (mask.s4 & 16) ? ((TYPE *)&y)[mask.s4 & 15] : ((TYPE *)&x)[mask.s4 & 15]; \
+    z.s5 = (mask.s5 & 16) ? ((TYPE *)&y)[mask.s5 & 15] : ((TYPE *)&x)[mask.s5 & 15]; \
+    z.s6 = (mask.s6 & 16) ? ((TYPE *)&y)[mask.s6 & 15] : ((TYPE *)&x)[mask.s6 & 15]; \
+    z.s7 = (mask.s7 & 16) ? ((TYPE *)&y)[mask.s7 & 15] : ((TYPE *)&x)[mask.s7 & 15]; \
+    z.s8 = (mask.s8 & 16) ? ((TYPE *)&y)[mask.s8 & 15] : ((TYPE *)&x)[mask.s8 & 15]; \
+    z.s9 = (mask.s9 & 16) ? ((TYPE *)&y)[mask.s9 & 15] : ((TYPE *)&x)[mask.s9 & 15]; \
+    z.sa = (mask.sa & 16) ? ((TYPE *)&y)[mask.sa & 15] : ((TYPE *)&x)[mask.sa & 15]; \
+    z.sb = (mask.sb & 16) ? ((TYPE *)&y)[mask.sb & 15] : ((TYPE *)&x)[mask.sb & 15]; \
+    z.sc = (mask.sc & 16) ? ((TYPE *)&y)[mask.sc & 15] : ((TYPE *)&x)[mask.sc & 15]; \
+    z.sd = (mask.sd & 16) ? ((TYPE *)&y)[mask.sd & 15] : ((TYPE *)&x)[mask.sd & 15]; \
+    z.se = (mask.se & 16) ? ((TYPE *)&y)[mask.se & 15] : ((TYPE *)&x)[mask.se & 15]; \
+    z.sf = (mask.sf & 16) ? ((TYPE *)&y)[mask.sf & 15] : ((TYPE *)&x)[mask.sf & 15]; \
+    return z; \
+  }
+
+#define DEFMASK(TYPE, MASKTYPE) \
+  DEC2(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC2(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC2(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC2X(TYPE, MASKTYPE) \
+  DEC4(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC4(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC4(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC4X(TYPE, MASKTYPE) \
+  DEC8(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC8(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC8(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC8X(TYPE, MASKTYPE) \
+  DEC16(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
+  DEC16(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
+  DEC16(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
+  DEC16X(TYPE, MASKTYPE)
+
+#define DEF(TYPE) \
+  DEFMASK(TYPE, uchar) \
+  DEFMASK(TYPE, ushort) \
+  DEFMASK(TYPE, uint) \
+  DEFMASK(TYPE, ulong)
+
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(float)
+DEF(long)
+DEF(ulong)
+#undef DEF
+#undef DEFMASK
+#undef DEC2
+#undef DEC2X
+#undef DEC4
+#undef DEC4X
+#undef DEC8
+#undef DEC8X
+#undef DEC16
+#undef DEC16X
+
+uint __gen_ocl_read_tm(void);
+uint __gen_ocl_region(ushort offset, uint data);
+
+struct time_stamp __gen_ocl_get_timestamp(void) {
+  /* Read the timestamp once, then unpack it with __gen_ocl_region(). */
+  const uint tm = __gen_ocl_read_tm();
+  struct time_stamp val;
+  /* Region 1 supplies the upper 32 bits of the tick, region 0 the lower. */
+  val.tick = ((ulong)__gen_ocl_region(1, tm) << 32) | __gen_ocl_region(0, tm);
+  val.event = __gen_ocl_region(2, tm);
+  return val;
+}
diff --git a/CMake/CMakeConfigTemplate.hpp b/backend/src/libocl/src/ocl_sync.cl
similarity index 60%
copy from CMake/CMakeConfigTemplate.hpp
copy to backend/src/libocl/src/ocl_sync.cl
index 7702c54..d008639 100644
--- a/CMake/CMakeConfigTemplate.hpp
+++ b/backend/src/libocl/src/ocl_sync.cl
@@ -1,10 +1,10 @@
-/* 
- * Copyright © 2012 Intel Corporation
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -14,15 +14,18 @@
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library. If not, see <http://www.gnu.org/licenses/>.
  *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
  */
+#include "ocl_sync.h"
 
-#ifndef CMAKE_CONFIG_HPP
-#define CMAKE_CONFIG_HPP
+void __gen_ocl_barrier_local(void);
+void __gen_ocl_barrier_global(void);
+void __gen_ocl_barrier_local_and_global(void);
 
-#define ON true
-#define OFF false
-#define GEN_INSTALLATION_PATH "${CMAKE_INSTALL_PREFIX}/lib/i965/"
+void mem_fence(cl_mem_fence_flags flags) { /* empty body: flags is ignored and nothing is emitted here */
+}
 
-#endif /* CMAKE_CONFIG_HPP */
+void read_mem_fence(cl_mem_fence_flags flags) { /* empty body: flags is ignored and nothing is emitted here */
+}
 
+void write_mem_fence(cl_mem_fence_flags flags) { /* empty body: flags is ignored and nothing is emitted here */
+}
diff --git a/backend/src/libocl/src/ocl_vload.cl b/backend/src/libocl/src/ocl_vload.cl
new file mode 100644
index 0000000..fa5e04f
--- /dev/null
+++ b/backend/src/libocl/src/ocl_vload.cl
@@ -0,0 +1,274 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+#include "ocl_vload.h"
+#include "ocl_relational.h"
+
+// These loads and stores use untyped reads and writes, so we can simply cast
+// to vector loads / stores. The casts are not C99 compliant (they break the
+// aliasing rules); that is acceptable because TBAA is not enabled in the compiler.
+#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return ((SPACE TYPE##DIM *) &p[offset * DIM])[0]; /* read DIM scalars through a vector-typed pointer */ \
+} \
+OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
+  ((SPACE TYPE##DIM *) &p[offset * DIM])[0] = v; /* write DIM scalars through a vector-typed pointer */ \
+}
+
+#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
+OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
+  return ((SPACE TYPE##DIM *) &p[offset * DIM])[0]; /* read DIM scalars through a vector-typed pointer */ \
+}
+
+#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) { \
+  p[3 * offset] = v.s0; /* 3-vectors are stored packed, three scalars per offset step */ \
+  p[3 * offset + 1] = v.s1; \
+  p[3 * offset + 2] = v.s2; \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(p[3 * offset], p[3 * offset + 1], p[3 * offset + 2]); /* gathered scalar by scalar */ \
+}
+
+#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(p[3 * offset], p[3 * offset + 1], p[3 * offset + 2]); /* packed 3-vector, gathered scalar by scalar */ \
+}
+
+#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
+  DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
+  DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
+
+#define DECL_UNTYPED_RW_ALL(TYPE) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
+  DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
+  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
+
+#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
+OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##2)(p[2 * offset], p[2 * offset + 1]); /* element-wise reads, no vector-typed access */ \
+} \
+OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##3)(p[3 * offset], p[3 * offset + 1], p[3 * offset + 2]); \
+} \
+OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##4)(vload2(2 * offset, p), vload2(2 * offset + 1, p)); /* two adjacent 2-vectors */ \
+} \
+OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##8)(vload4(2 * offset, p), vload4(2 * offset + 1, p)); /* two adjacent 4-vectors */ \
+} \
+OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
+  return (TYPE##16)(vload8(2 * offset, p), vload8(2 * offset + 1, p)); /* two adjacent 8-vectors */ \
+}
+
+#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
+OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) { \
+  p[2 * offset] = v.s0; /* element-wise writes, no vector-typed access */ \
+  p[2 * offset + 1] = v.s1; \
+} \
+OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) { \
+  p[3 * offset] = v.s0; \
+  p[3 * offset + 1] = v.s1; \
+  p[3 * offset + 2] = v.s2; \
+} \
+OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
+  vstore2(v.s01, 2 * offset, p); /* lower half, then the 2-vector right after it */ \
+  vstore2(v.s23, 2 * offset + 1, p); \
+} \
+OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
+  vstore4(v.s0123, 2 * offset, p); /* lower half, then the 4-vector right after it */ \
+  vstore4(v.s4567, 2 * offset + 1, p); \
+} \
+OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
+  vstore8(v.s01234567, 2 * offset, p); /* lower half, then the 8-vector right after it */ \
+  vstore8(v.s89abcdef, 2 * offset + 1, p); \
+}
+
+#define DECL_BYTE_RW_ALL(TYPE) \
+  DECL_BYTE_RD_SPACE(TYPE, __global) \
+  DECL_BYTE_RD_SPACE(TYPE, __local) \
+  DECL_BYTE_RD_SPACE(TYPE, __private) \
+  DECL_BYTE_RD_SPACE(TYPE, __constant) \
+  DECL_BYTE_WR_SPACE(TYPE, __global) \
+  DECL_BYTE_WR_SPACE(TYPE, __local) \
+  DECL_BYTE_WR_SPACE(TYPE, __private)
+
+DECL_BYTE_RW_ALL(char)
+DECL_BYTE_RW_ALL(uchar)
+DECL_BYTE_RW_ALL(short)
+DECL_BYTE_RW_ALL(ushort)
+DECL_UNTYPED_RW_ALL(int)
+DECL_UNTYPED_RW_ALL(uint)
+DECL_UNTYPED_RW_ALL(long)
+DECL_UNTYPED_RW_ALL(ulong)
+DECL_UNTYPED_RW_ALL(float)
+DECL_UNTYPED_RW_ALL(double)
+
+#undef DECL_UNTYPED_RW_ALL
+#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_UNTYPED_RD_ALL_SPACE
+#undef DECL_UNTYPED_RW_SPACE_N
+#undef DECL_UNTYPED_RD_SPACE_N
+#undef DECL_UNTYPED_V3_SPACE
+#undef DECL_UNTYPED_RDV3_SPACE
+#undef DECL_BYTE_RD_SPACE
+#undef DECL_BYTE_WR_SPACE
+#undef DECL_BYTE_RW_ALL
+
+PURE CONST float __gen_ocl_f16to32(short h);
+PURE CONST short __gen_ocl_f32to16(float f);
+
+OVERLOADABLE short f32to16_rtp(float f) {
+  /* Convert with the intrinsic, then widen the result back so it can be
+   * compared against the input. */
+  const short half_bits = __gen_ocl_f32to16(f);
+  const float back = __gen_ocl_f16to32(half_bits);
+  /* Result below f: step the encoding by +1 when the sign bit of f is clear
+   * and by -1 when it is set; otherwise keep the intrinsic's answer. */
+  return (f > back) ? half_bits + 1 - 2 * signbit(f) : half_bits;
+}
+
+OVERLOADABLE short f32to16_rtn(float f) {
+  /* Convert with the intrinsic, then widen the result back so it can be
+   * compared against the input. */
+  const short half_bits = __gen_ocl_f32to16(f);
+  const float back = __gen_ocl_f16to32(half_bits);
+  /* Result above f: step the encoding by -1 when the sign bit of f is clear
+   * and by +1 when it is set; otherwise keep the intrinsic's answer. */
+  return (back > f) ? half_bits - 1 + 2 * signbit(f) : half_bits;
+}
+
+OVERLOADABLE short f32to16_rtz(float f) {
+  const short half_bits = __gen_ocl_f32to16(f);
+  const float back = __gen_ocl_f16to32(half_bits);
+  /* The intrinsic's result moved away from zero when it lies above a
+   * sign-bit-clear f or below a sign-bit-set f. */
+  const int overshoot = signbit(f) ? (back < f) : (back > f);
+  /* In that case the encoding is lowered by one; otherwise it is returned
+   * unchanged. */
+  return overshoot ? half_bits - 1 : half_bits;
+}
+
+#define DECL_HALF_LD_SPACE(SPACE) \
+OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
+  return __gen_ocl_f16to32(*(SPACE short *) &p[offset]); /* fetch the 16-bit pattern and widen it to float */ \
+} \
+OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
+  const size_t first = 2 * offset; \
+  return (float2)(vload_half(first, p), vload_half(first + 1, p)); \
+} \
+OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
+  const size_t first = 3 * offset; /* packed: three halves per offset step */ \
+  return (float3)(vload_half(first, p), vload_half(first + 1, p), \
+                  vload_half(first + 2, p)); \
+} \
+OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
+  const size_t first = 4 * offset; /* aligned variant: four halves per offset step, the fourth is not read */ \
+  return (float3)(vload_half(first, p), vload_half(first + 1, p), \
+                  vload_half(first + 2, p)); \
+} \
+OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
+  const size_t first = 2 * offset; /* counted in 2-vectors */ \
+  return (float4)(vload_half2(first, p), vload_half2(first + 1, p)); \
+} \
+OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
+  const size_t first = 2 * offset; /* counted in 4-vectors */ \
+  return (float8)(vload_half4(first, p), vload_half4(first + 1, p)); \
+} \
+OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
+  const size_t first = 2 * offset; /* counted in 8-vectors */ \
+  return (float16)(vload_half8(first, p), vload_half8(first + 1, p)); \
+}
+
+#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
+OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  *(SPACE short *) &p[offset] = FUNC(data); /* FUNC yields the 16-bit pattern that is stored */ \
+} \
+OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data, offset, p); /* same addressing as the unaligned form */ \
+} \
+OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, 2 * offset, p); \
+  vstore_half##ROUND(data.s1, 2 * offset + 1, p); \
+} \
+OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data, offset, p); /* same addressing as the unaligned form */ \
+} \
+OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, 3 * offset, p); /* packed: three halves per offset step */ \
+  vstore_half##ROUND(data.s1, 3 * offset + 1, p); \
+  vstore_half##ROUND(data.s2, 3 * offset + 2, p); \
+} \
+OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
+  vstore_half##ROUND(data.s0, 4 * offset, p); /* aligned variant: four halves per offset step, the fourth is not written */ \
+  vstore_half##ROUND(data.s1, 4 * offset + 1, p); \
+  vstore_half##ROUND(data.s2, 4 * offset + 2, p); \
+} \
+OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half2##ROUND(data.s01, 2 * offset, p); \
+  vstore_half2##ROUND(data.s23, 2 * offset + 1, p); \
+} \
+OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data, offset, p); /* same addressing as the unaligned form */ \
+} \
+OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half4##ROUND(data.s0123, 2 * offset, p); \
+  vstore_half4##ROUND(data.s4567, 2 * offset + 1, p); \
+} \
+OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data, offset, p); /* same addressing as the unaligned form */ \
+} \
+OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half8##ROUND(data.s01234567, 2 * offset, p); \
+  vstore_half8##ROUND(data.s89abcdef, 2 * offset + 1, p); \
+} \
+OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
+  vstore_half16##ROUND(data, offset, p); /* same addressing as the unaligned form */ \
+}
+
+#define DECL_HALF_ST_SPACE(SPACE) /* the unsuffixed and _rte stores use the intrinsic directly; _rtz/_rtp/_rtn use the f32to16_* helpers */ \
+  DECL_HALF_ST_SPACE_ROUND(SPACE,  , __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
+  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn)
+
+DECL_HALF_LD_SPACE(__global)
+DECL_HALF_LD_SPACE(__local)
+DECL_HALF_LD_SPACE(__constant)
+DECL_HALF_LD_SPACE(__private)
+
+DECL_HALF_ST_SPACE(__global)
+DECL_HALF_ST_SPACE(__local)
+DECL_HALF_ST_SPACE(__private)
+
+//#undef DECL_UNTYPED_RW_ALL_SPACE
+#undef DECL_HALF_LD_SPACE
+#undef DECL_HALF_ST_SPACE
+#undef DECL_HALF_ST_SPACE_ROUND
diff --git a/backend/src/libocl/src/ocl_workitem.cl b/backend/src/libocl/src/ocl_workitem.cl
new file mode 100644
index 0000000..f4629f8
--- /dev/null
+++ b/backend/src/libocl/src/ocl_workitem.cl
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_workitem.h"
+
+PURE CONST uint __gen_ocl_get_work_dim(void);
+uint get_work_dim(void)
+{
+  return __gen_ocl_get_work_dim(); /* forwards to __gen_ocl_get_work_dim, whose prototype precedes this function */
+}
+
+
+#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
+PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
+PURE CONST unsigned int __gen_ocl_##NAME##2(void);
+DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
+DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
+DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
+DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
+#undef DECL_INTERNAL_WORK_ITEM_FN
+
+#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET) \
+unsigned NAME(unsigned int dim) { \
+  /* dims 0..2 go to the matching __gen_ocl_ query; any other dim yields OTHER_RET */ \
+  return dim == 0 ? __gen_ocl_##NAME##0() \
+       : dim == 1 ? __gen_ocl_##NAME##1() \
+       : dim == 2 ? __gen_ocl_##NAME##2() : (OTHER_RET); \
+}
+
+DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
+DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
+DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
+#undef DECL_PUBLIC_WORK_ITEM_FN
+
+uint get_global_id(uint dim) {
+  return get_global_offset(dim) + get_group_id(dim) * get_local_size(dim) + get_local_id(dim); /* offset + start of this group + position inside it */
+}
diff --git a/backend/src/libocl/tmpl/ocl_common.tmpl.cl b/backend/src/libocl/tmpl/ocl_common.tmpl.cl
new file mode 100644
index 0000000..db7b0d8
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_common.tmpl.cl
@@ -0,0 +1,65 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_common.h"
+#include "ocl_float.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Common Functions
+/////////////////////////////////////////////////////////////////////////////
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
+
+OVERLOADABLE float step(float edge, float x) {
+  return x < edge ? 0.0f : 1.0f; /* 0 below the edge, 1 otherwise; float literals keep double constants out of a float function */
+}
+
+OVERLOADABLE float max(float a, float b) {
+  return __gen_ocl_fmax(a, b); /* thin wrapper: arguments are forwarded in the same order */
+}
+OVERLOADABLE float min(float a, float b) {
+  return __gen_ocl_fmin(a, b); /* thin wrapper: arguments are forwarded in the same order */
+}
+OVERLOADABLE float mix(float x, float y, float a) {
+  return x + a * (y - x); /* linear blend: x when a is 0, moving toward y as a grows */
+}
+OVERLOADABLE float clamp(float v, float l, float u) {
+  return max(min(v, u), l); /* cap v at u first, then raise the result to at least l */
+}
+
+
+OVERLOADABLE float degrees(float radians) {
+  return radians * (180 / M_PI_F); /* scale by the constant factor 180/pi */
+}
+OVERLOADABLE float radians(float degrees) {
+  return degrees * (M_PI_F / 180); /* scale by the constant factor pi/180 */
+}
+
+OVERLOADABLE float smoothstep(float e0, float e1, float x) {
+  const float t = clamp((x - e0) / (e1 - e0), 0.f, 1.f); /* position of x between the edges, limited to [0, 1] */
+  return t * t * (3 - 2 * t); /* cubic 3t^2 - 2t^3 */
+}
+
+OVERLOADABLE float sign(float x) {
+  /* OpenCL sign(): 1 for x > 0, -1 for x < 0, x itself for +/-0, and +0 for NaN. */
+  if (x > 0)
+    return 1;
+  if (x < 0)
+    return -1;
+  /* Hand a zero back unchanged: `x == -0.f` also matches +0.0f, so testing it
+   * and returning -0.f would turn +0.0f into -0.0f. NaN fails the compare. */
+  return (x == 0.f) ? x : 0.f;
+}
diff --git a/backend/src/libocl/tmpl/ocl_common.tmpl.h b/backend/src/libocl/tmpl/ocl_common.tmpl.h
new file mode 100644
index 0000000..4a9379d
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_common.tmpl.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_COMMON_H__
+#define __OCL_COMMON_H__
+
+#include "ocl_types.h"
+
+/////////////////////////////////////////////////////////////////////////////
+// Common Functions
+/////////////////////////////////////////////////////////////////////////////
+OVERLOADABLE float step(float edge, float x);
+OVERLOADABLE float max(float a, float b);
+OVERLOADABLE float min(float a, float b);
+OVERLOADABLE float mix(float x, float y, float a);
+OVERLOADABLE float clamp(float v, float l, float u);
+
+OVERLOADABLE float degrees(float radians);
+OVERLOADABLE float radians(float degrees);
+OVERLOADABLE float smoothstep(float e0, float e1, float x);
+
+OVERLOADABLE float sign(float x);
diff --git a/backend/src/libocl/tmpl/ocl_defines.tmpl.h b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
new file mode 100644
index 0000000..4e210be
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_defines.tmpl.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_COMMON_DEF_H__
+#define __OCL_COMMON_DEF_H__
+
+#define __OPENCL_VERSION__ 120
+#define __CL_VERSION_1_0__ 100
+#define __CL_VERSION_1_1__ 110
+#define __CL_VERSION_1_2__ 120
+#define __ENDIAN_LITTLE__ 1
+#define __IMAGE_SUPPORT__ 1
+#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
+                                        __attribute__((vec_type_hint(TYPE)))
+#define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
+#define cl_khr_global_int32_base_atomics
+#define cl_khr_global_int32_extended_atomics
+#define cl_khr_local_int32_base_atomics
+#define cl_khr_local_int32_extended_atomics
+#define cl_khr_byte_addressable_store
+#define cl_khr_icd
+#define cl_khr_gl_sharing
+
+#endif /* end of __OCL_COMMON_DEF_H__ */
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.cl b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
new file mode 100644
index 0000000..992727f
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.cl
@@ -0,0 +1,398 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_integer.h"
+
+/* Bit-scan/bit-count intrinsics; only declared here, implemented elsewhere.
+ * NOTE(review): the clz wrappers below use __gen_ocl_fbh as "number of
+ * leading zeros of a non-zero 32-bit value" and the popcount wrappers use
+ * __gen_ocl_cbit as "number of set bits" -- confirm against the backend.
+ * __gen_ocl_fbl is not used in the visible part of this file. */
+PURE CONST uint __gen_ocl_fbh(uint);
+PURE CONST uint __gen_ocl_fbl(uint);
+PURE CONST uint __gen_ocl_cbit(uint);
+
+/* clz(char): 0 when the sign bit is set, 8 for zero; otherwise the 32-bit
+ * __gen_ocl_fbh result with the 24 widening bits subtracted. */
+OVERLOADABLE char clz(char x) {
+  if (x < 0) {
+    return 0;
+  } else if (x == 0) {
+    return 8;
+  }
+  return __gen_ocl_fbh(x) - 24;
+}
+
+/* clz(uchar): 8 for zero, otherwise the 32-bit result rebased by 24 bits. */
+OVERLOADABLE uchar clz(uchar x) {
+  return (x == 0) ? 8 : __gen_ocl_fbh(x) - 24;
+}
+
+/* clz(short): 0 when negative, 16 for zero, otherwise rebased by 16 bits. */
+OVERLOADABLE short clz(short x) {
+  if (x < 0) {
+    return 0;
+  } else if (x == 0) {
+    return 16;
+  }
+  return __gen_ocl_fbh(x) - 16;
+}
+
+/* clz(ushort): 16 for zero, otherwise the 32-bit result rebased by 16 bits. */
+OVERLOADABLE ushort clz(ushort x) {
+  return (x == 0) ? 16 : __gen_ocl_fbh(x) - 16;
+}
+
+/* clz(int): 0 when negative, 32 for zero, otherwise the raw intrinsic result. */
+OVERLOADABLE int clz(int x) {
+  if (x < 0) {
+    return 0;
+  } else if (x == 0) {
+    return 32;
+  }
+  return __gen_ocl_fbh(x);
+}
+
+/* clz(uint): 32 for zero, otherwise the raw intrinsic result. */
+OVERLOADABLE uint clz(uint x) {
+  return (x == 0) ? 32 : __gen_ocl_fbh(x);
+}
+
+/* clz(long): the value is split into two 32-bit words through a union; the
+ * code treats i[1] as the word holding the sign bit. The low word is only
+ * counted when the high word is entirely zero. */
+OVERLOADABLE long clz(long x) {
+  union { int i[2]; long x; } words;
+  words.x = x;
+  int hi = words.i[1];
+  int lo = words.i[0];
+  if (hi & 0x80000000u)
+    return 0;
+  if (hi == 0 && lo == 0)
+    return 64;
+  uint count = clz(hi);
+  if (count == 32)
+    count += clz(lo);
+  return count;
+}
+
+/* clz(ulong): 64 for zero; otherwise count the high word and, if that word
+ * is all zeros, add the count of the low word. */
+OVERLOADABLE ulong clz(ulong x) {
+  if (x == 0)
+    return 64;
+  union { uint i[2]; ulong x; } words;
+  words.x = x;
+  uint count = clz(words.i[1]);
+  if (count == 32)
+    count += clz(words.i[0]);
+  return count;
+}
+
+/* popcount(char): a negative value is sign-extended to 32 bits before the
+ * intrinsic sees it, so the 24 extra one-bits are subtracted again. */
+OVERLOADABLE char popcount(char x) {
+  if (x == 0)
+    return 0;
+  if (x < 0)
+    return __gen_ocl_cbit(x) - 24;
+  return __gen_ocl_cbit(x);
+}
+/* popcount(short): same idea, with 16 sign-extension bits to discount. */
+OVERLOADABLE short popcount(short x) {
+  if (x == 0)
+    return 0;
+  if (x < 0)
+    return __gen_ocl_cbit(x) - 16;
+  return __gen_ocl_cbit(x);
+}
+/* Types that need no correction: zero short-circuits, everything else is
+ * handed straight to the intrinsic. */
+#define DEF_POPCOUNT(T)        \
+OVERLOADABLE T popcount(T x) { \
+  return x == 0 ? 0 : __gen_ocl_cbit(x); \
+}
+DEF_POPCOUNT(uchar)
+DEF_POPCOUNT(ushort)
+DEF_POPCOUNT(int)
+DEF_POPCOUNT(uint)
+#undef DEF_POPCOUNT
+
+/* popcount(long): sum of the bit counts of the two 32-bit words. */
+OVERLOADABLE long popcount(long x) {
+  union { int i[2]; long x; } words;
+  words.x = x;
+  uint hi_bits = popcount(words.i[1]);
+  uint lo_bits = popcount(words.i[0]);
+  return hi_bits + lo_bits;
+}
+
+/* popcount(ulong): sum of the bit counts of the two 32-bit words. */
+OVERLOADABLE ulong popcount(ulong x) {
+  union { uint i[2]; ulong x; } words;
+  words.x = x;
+  uint hi_bits = popcount(words.i[1]);
+  uint lo_bits = popcount(words.i[0]);
+  return hi_bits + lo_bits;
+}
+
+/* Saturating add/sub. The ocl_[su]{add,sub}_sat functions are only declared
+ * in this file; add_sat/sub_sat are the public wrappers around them. */
+
+/* char and short: plain forwarding to the signed helpers. */
+#define DEF_SIGNED_SAT(T)                      \
+OVERLOADABLE T ocl_sadd_sat(T x, T y);         \
+OVERLOADABLE T ocl_ssub_sat(T x, T y);         \
+OVERLOADABLE T add_sat(T x, T y) {             \
+  return ocl_sadd_sat(x, y);                   \
+}                                              \
+OVERLOADABLE T sub_sat(T x, T y) {             \
+  return ocl_ssub_sat(x, y);                   \
+}
+DEF_SIGNED_SAT(char)
+DEF_SIGNED_SAT(short)
+#undef DEF_SIGNED_SAT
+
+/* int: add forwards directly; sub special-cases y == 0x80000000 by computing
+ * sat(sat(0x7fffffff + x) + 1) instead of calling the subtract helper. */
+OVERLOADABLE int ocl_sadd_sat(int x, int y);
+OVERLOADABLE int add_sat(int x, int y) {
+  return ocl_sadd_sat(x, y);
+}
+OVERLOADABLE int ocl_ssub_sat(int x, int y);
+OVERLOADABLE int sub_sat(int x, int y) {
+  if (y == 0x80000000u)
+    return ocl_sadd_sat(ocl_sadd_sat(0x7fffffff, x), 1);
+  return ocl_ssub_sat(x, y);
+}
+
+/* long add: operands whose sign bits differ are added directly; only
+ * same-sign operands go through the saturating helper. */
+OVERLOADABLE long ocl_sadd_sat(long x, long y);
+OVERLOADABLE long add_sat(long x, long y) {
+  union {long l; uint i[2];} lhs, rhs;
+  lhs.l = x;
+  rhs.l = y;
+  const uint signs_differ = (lhs.i[1] ^ rhs.i[1]) & 0x80000000u;
+  return signs_differ ? x + y : ocl_sadd_sat(x, y);
+}
+
+/* long sub: the mirror image -- only operands whose sign bits differ go
+ * through the saturating helper; same-sign operands are subtracted directly. */
+OVERLOADABLE long ocl_ssub_sat(long x, long y);
+OVERLOADABLE long sub_sat(long x, long y) {
+  union {long l; uint i[2];} lhs, rhs;
+  lhs.l = x;
+  rhs.l = y;
+  const uint signs_differ = (lhs.i[1] ^ rhs.i[1]) & 0x80000000u;
+  return signs_differ ? ocl_ssub_sat(x, y) : x - y;
+}
+
+/* Unsigned types: plain forwarding to the unsigned helpers. */
+#define DEF_UNSIGNED_SAT(T)                    \
+OVERLOADABLE T ocl_uadd_sat(T x, T y);         \
+OVERLOADABLE T ocl_usub_sat(T x, T y);         \
+OVERLOADABLE T add_sat(T x, T y) {             \
+  return ocl_uadd_sat(x, y);                   \
+}                                              \
+OVERLOADABLE T sub_sat(T x, T y) {             \
+  return ocl_usub_sat(x, y);                   \
+}
+DEF_UNSIGNED_SAT(uchar)
+DEF_UNSIGNED_SAT(ushort)
+DEF_UNSIGNED_SAT(uint)
+DEF_UNSIGNED_SAT(ulong)
+#undef DEF_UNSIGNED_SAT
+
+
+/* 32- and 64-bit mul_hi are delegated to these intrinsics, which are only
+ * declared in this file. */
+OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
+OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
+OVERLOADABLE long __gen_ocl_mul_hi(long x, long y);
+OVERLOADABLE ulong __gen_ocl_mul_hi(ulong x, ulong y);
+
+/* mul_hi(x, y): upper half of the double-width product x * y.
+ * 8- and 16-bit operands are promoted to int, multiplied, and shifted down
+ * by the operand width. */
+OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
+OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
+OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
+/* ushort operands can reach 0xFFFF * 0xFFFF = 0xFFFE0001, which does not fit
+ * in the signed int that integer promotion would use; multiply as uint so
+ * the product never overflows a signed type. */
+OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return ((uint)x * (uint)y) >> 16; }
+OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
+OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
+OVERLOADABLE long mul_hi(long x, long y) {
+  return __gen_ocl_mul_hi(x, y);
+}
+OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
+  return __gen_ocl_mul_hi(x, y);
+}
+
+/* mad_hi(a, b, c) = mul_hi(a, b) + c, instantiated for every integer width. */
+#define DEF_MAD_HI(T)                     \
+OVERLOADABLE T mad_hi(T a, T b, T c) {    \
+  return mul_hi(a, b) + c;                \
+}
+DEF_MAD_HI(char)
+DEF_MAD_HI(uchar)
+DEF_MAD_HI(short)
+DEF_MAD_HI(ushort)
+DEF_MAD_HI(int)
+DEF_MAD_HI(uint)
+DEF_MAD_HI(long)
+DEF_MAD_HI(ulong)
+#undef DEF_MAD_HI
+
+/* mul24(int): each operand is shifted left then right by 8 so that only its
+ * low 24 bits (sign-extended) take part in the multiplication. */
+OVERLOADABLE int mul24(int a, int b) {
+  int a24 = (a << 8) >> 8;
+  int b24 = (b << 8) >> 8;
+  return a24 * b24;
+}
+/* mul24(uint): each operand is masked to its low 24 bits before multiplying. */
+OVERLOADABLE uint mul24(uint a, uint b) {
+  uint a24 = a & 0xFFFFFF;
+  uint b24 = b & 0xFFFFFF;
+  return a24 * b24;
+}
+
+/* mad24(a, b, c) = mul24(a, b) + c. */
+OVERLOADABLE int mad24(int a, int b, int c) {
+  int product = mul24(a, b);
+  return product + c;
+}
+OVERLOADABLE uint mad24(uint a, uint b, uint c) {
+  uint product = mul24(a, b);
+  return product + c;
+}
+
+/* mad_sat(a, b, c): a * b + c evaluated in a wider type, then clamped to the
+ * range of the operand type. */
+
+OVERLOADABLE char mad_sat(char a, char b, char c) {
+  const int wide = (int)a * (int)b + (int)c;
+  if (wide > 127)
+    return 127;
+  if (wide < -128)
+    return -128;
+  return wide;
+}
+
+OVERLOADABLE uchar mad_sat(uchar a, uchar b, uchar c) {
+  const uint wide = (uint)a * (uint)b + (uint)c;
+  if (wide > 255)
+    return 255;
+  return wide;
+}
+
+OVERLOADABLE short mad_sat(short a, short b, short c) {
+  const int wide = (int)a * (int)b + (int)c;
+  if (wide > 32767)
+    return 32767;
+  if (wide < -32768)
+    return -32768;
+  return wide;
+}
+
+OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
+  const uint wide = (uint)a * (uint)b + (uint)c;
+  if (wide > 65535)
+    return 65535;
+  return wide;
+}
+
+OVERLOADABLE int mad_sat(int a, int b, int c) {
+  const long wide = (long)a * (long)b + (long)c;
+  if (wide > 0x7FFFFFFF)
+    return 0x7FFFFFFF;
+  if (wide < -0x7FFFFFFF-1)
+    return -0x7FFFFFFF-1;
+  return (int)wide;
+}
+
+OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
+  const ulong wide = (ulong)a * (ulong)b + (ulong)c;
+  if (wide > 0xFFFFFFFFu)
+    return 0xFFFFFFFFu;
+  return (uint)wide;
+}
+
+/* 64-bit variants have no wider type to compute in; they forward to
+ * intrinsics that are only declared in this file. */
+OVERLOADABLE long __gen_ocl_mad_sat(long a, long b, long c);
+OVERLOADABLE ulong __gen_ocl_mad_sat(ulong a, ulong b, ulong c);
+
+OVERLOADABLE long mad_sat(long a, long b, long c) {
+  return __gen_ocl_mad_sat(a, b, c);
+}
+
+OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
+  return __gen_ocl_mad_sat(a, b, c);
+}
+
+/* __rotate_left(x, y): OR of x shifted left by y and x shifted right by
+ * (width - y). The signed overloads reinterpret their arguments as the
+ * unsigned type of the same width and reuse the unsigned overload. */
+OVERLOADABLE uchar __rotate_left(uchar x, uchar y) {
+  int upper = x << y;
+  int lower = x >> (8 - y);
+  return upper | lower;
+}
+OVERLOADABLE char __rotate_left(char x, char y) {
+  return __rotate_left((uchar)x, (uchar)y);
+}
+OVERLOADABLE ushort __rotate_left(ushort x, ushort y) {
+  int upper = x << y;
+  int lower = x >> (16 - y);
+  return upper | lower;
+}
+OVERLOADABLE short __rotate_left(short x, short y) {
+  return __rotate_left((ushort)x, (ushort)y);
+}
+OVERLOADABLE uint __rotate_left(uint x, uint y) {
+  uint upper = x << y;
+  uint lower = x >> (32 - y);
+  return upper | lower;
+}
+OVERLOADABLE int __rotate_left(int x, int y) {
+  return __rotate_left((uint)x, (uint)y);
+}
+OVERLOADABLE ulong __rotate_left(ulong x, ulong y) {
+  ulong upper = x << y;
+  ulong lower = x >> (64 - y);
+  return upper | lower;
+}
+OVERLOADABLE long __rotate_left(long x, long y) {
+  return __rotate_left((ulong)x, (ulong)y);
+}
+/* Public rotate(): the shift count is first masked with (width - 1). */
+#define DEF_ROTATE(T, MASK)               \
+OVERLOADABLE T rotate(T x, T y) {         \
+  return __rotate_left(x, (T)(y & MASK)); \
+}
+DEF_ROTATE(char, 7)
+DEF_ROTATE(uchar, 7)
+DEF_ROTATE(short, 15)
+DEF_ROTATE(ushort, 15)
+DEF_ROTATE(int, 31)
+DEF_ROTATE(uint, 31)
+DEF_ROTATE(long, 63)
+DEF_ROTATE(ulong, 63)
+#undef DEF_ROTATE
+
+/* upsample(hi, lo): both halves are converted to the signed type of twice
+ * the width and handed to the __gen_ocl_upsample intrinsic, which is only
+ * declared in this file. */
+OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
+OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
+OVERLOADABLE long __gen_ocl_upsample(long hi, long lo);
+OVERLOADABLE short upsample(char hi, uchar lo) {
+  short wide_hi = (short)hi;
+  short wide_lo = (short)lo;
+  return __gen_ocl_upsample(wide_hi, wide_lo);
+}
+OVERLOADABLE ushort upsample(uchar hi, uchar lo) {
+  short wide_hi = (short)hi;
+  short wide_lo = (short)lo;
+  return __gen_ocl_upsample(wide_hi, wide_lo);
+}
+OVERLOADABLE int upsample(short hi, ushort lo) {
+  int wide_hi = (int)hi;
+  int wide_lo = (int)lo;
+  return __gen_ocl_upsample(wide_hi, wide_lo);
+}
+OVERLOADABLE uint upsample(ushort hi, ushort lo) {
+  int wide_hi = (int)hi;
+  int wide_lo = (int)lo;
+  return __gen_ocl_upsample(wide_hi, wide_lo);
+}
+OVERLOADABLE long upsample(int hi, uint lo) {
+  long wide_hi = (long)hi;
+  long wide_lo = (long)lo;
+  return __gen_ocl_upsample(wide_hi, wide_lo);
+}
+OVERLOADABLE ulong upsample(uint hi, uint lo) {
+  long wide_hi = (long)hi;
+  long wide_lo = (long)lo;
+  return __gen_ocl_upsample(wide_hi, wide_lo);
+}
+
+/* 32-bit unsigned halving-add intrinsics, only declared in this file. */
+OVERLOADABLE uint __gen_ocl_hadd(uint x, uint y);
+OVERLOADABLE uint __gen_ocl_rhadd(uint x, uint y);
+/* 8- and 16-bit hadd/rhadd: the operands are promoted to int, so the sum
+ * (plus one for the rounding variant) is formed at int width and shifted
+ * right by one. */
+#define DEF_NARROW_HADD(T)                                       \
+OVERLOADABLE T hadd(T x, T y) { return (x + y) >> 1; }           \
+OVERLOADABLE T rhadd(T x, T y) { return (x + y + 1) >> 1; }
+DEF_NARROW_HADD(char)
+DEF_NARROW_HADD(uchar)
+DEF_NARROW_HADD(short)
+DEF_NARROW_HADD(ushort)
+#undef DEF_NARROW_HADD
+/*
+ * Signed 32-bit halving add: (x + y) >> 1 without losing the carry.
+ *
+ * When the sign bits of x and y differ the plain sum cannot overflow, so it
+ * is computed directly. Otherwise the operands are handed to the unsigned
+ * intrinsic, whose 33-bit sum shifted right by one has the same bit pattern
+ * as the signed result when both operands have the same sign.
+ *
+ * The sign test is (x ^ y) < 0 rather than a comparison of each operand
+ * against zero: zero counts as non-negative, so a pair such as (0, -2) must
+ * take the direct path. Routing it to the unsigned intrinsic would give
+ * 0x7FFFFFFF instead of -1.
+ */
+OVERLOADABLE int hadd(int x, int y) {
+  return ((x ^ y) < 0) ?
+         ((x + y) >> 1) :
+         __gen_ocl_hadd((uint)x, (uint)y);
+}
+/* Unsigned 32-bit halving add: forwarded to the intrinsic. */
+OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
+/*
+ * Signed 32-bit rounding halving add: (x + y + 1) >> 1.
+ * Same dispatch as hadd(int, int); with differing sign bits x + y + 1 cannot
+ * overflow either, because x + y is at most INT_MAX - 1 in that case.
+ */
+OVERLOADABLE int rhadd(int x, int y) {
+  return ((x ^ y) < 0) ?
+         ((x + y + 1) >> 1) :
+         __gen_ocl_rhadd((uint)x, (uint)y);
+}
+/* Unsigned 32-bit rounding halving add: forwarded to the intrinsic. */
+OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
+/* 64-bit unsigned halving-add intrinsics, only declared in this file. */
+OVERLOADABLE ulong __gen_ocl_hadd(ulong x, ulong y);
+OVERLOADABLE ulong __gen_ocl_rhadd(ulong x, ulong y);
+/*
+ * Signed 64-bit halving add: (x + y) >> 1 without losing the carry.
+ *
+ * Operands whose sign bits differ are summed directly (no overflow is
+ * possible); same-sign operands go through the unsigned intrinsic. The test
+ * is (x ^ y) < 0 so that zero paired with a negative value takes the direct
+ * path; comparing each operand against zero would misroute that case to the
+ * unsigned intrinsic and return a large positive value.
+ */
+OVERLOADABLE long hadd(long x, long y) {
+  return ((x ^ y) < 0) ?
+         ((x + y) >> 1) :
+         __gen_ocl_hadd((ulong)x, (ulong)y);
+}
+/* Unsigned 64-bit halving add: forwarded to the intrinsic. */
+OVERLOADABLE ulong hadd(ulong x, ulong y) {
+  return __gen_ocl_hadd(x, y);
+}
+/* Signed 64-bit rounding halving add: (x + y + 1) >> 1, dispatched exactly
+ * like hadd(long, long). */
+OVERLOADABLE long rhadd(long x, long y) {
+  return ((x ^ y) < 0) ?
+         ((x + y + 1) >> 1) :
+         __gen_ocl_rhadd((ulong)x, (ulong)y);
+}
+/* Unsigned 64-bit rounding halving add: forwarded to the intrinsic. */
+OVERLOADABLE ulong rhadd(ulong x, ulong y) {
+  return __gen_ocl_rhadd(x, y);
+}
+
+/* 32-bit absolute-value intrinsic, only declared in this file. */
+int __gen_ocl_abs(int x);
+/* Signed 8/16/32-bit abs: call the int intrinsic and convert the result to
+ * the unsigned type of the same width. */
+#define DEF_SIGNED_ABS(T, UT)        \
+OVERLOADABLE UT abs(T x) {           \
+  return (UT) __gen_ocl_abs(x);      \
+}
+DEF_SIGNED_ABS(int, uint)
+DEF_SIGNED_ABS(short, ushort)
+DEF_SIGNED_ABS(char, uchar)
+#undef DEF_SIGNED_ABS
+/* 64-bit abs is written out by hand. */
+OVERLOADABLE ulong abs(long x) {
+  if (x < 0)
+    return -x;
+  return x;
+}
+/* Unsigned inputs are returned unchanged. */
+#define DEF_UNSIGNED_ABS(T) OVERLOADABLE T abs(T x) { return x; }
+DEF_UNSIGNED_ABS(uint)
+DEF_UNSIGNED_ABS(ushort)
+DEF_UNSIGNED_ABS(uchar)
+DEF_UNSIGNED_ABS(ulong)
+#undef DEF_UNSIGNED_ABS
+
+/* abs_diff for 8- and 16-bit types: both operands are widened to int first,
+ * so the subtraction cannot wrap around. */
+#define DEF_NARROW_ABS_DIFF(T, UT)          \
+OVERLOADABLE UT abs_diff(T x, T y) {        \
+  return (UT) (abs((int)x - (int)y));       \
+}
+DEF_NARROW_ABS_DIFF(char, uchar)
+DEF_NARROW_ABS_DIFF(uchar, uchar)
+DEF_NARROW_ABS_DIFF(short, ushort)
+DEF_NARROW_ABS_DIFF(ushort, ushort)
+#undef DEF_NARROW_ABS_DIFF
+
+/* Unsigned 32-bit: subtract the smaller operand from the larger one. */
+OVERLOADABLE uint abs_diff (uint x, uint y) {
+    if (y > x)
+        return y - x;
+    return x - y;
+}
+
+/* Signed 32-bit: when the operands lie on strictly opposite sides of zero
+ * the distance is |x| + |y|; in every other case x - y is formed directly
+ * and its absolute value returned. */
+OVERLOADABLE uint abs_diff (int x, int y) {
+    const int opposite_signs = (x < 0 && y > 0) || (x > 0 && y < 0);
+    if (opposite_signs)
+        return (abs(x) + abs(y));
+    return abs(x - y);
+}
+
+/* Signed 64-bit: same scheme as the 32-bit signed overload. */
+OVERLOADABLE ulong abs_diff (long x, long y) {
+  const int opposite_signs = (x < 0 && y > 0) || (x > 0 && y < 0);
+  if (opposite_signs)
+    return abs(x) + abs(y);
+  return abs(x - y);
+}
+/* Unsigned 64-bit: subtract the smaller operand from the larger one. */
+OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
+  if (y > x)
+    return y - x;
+  return x - y;
+}
+
+
+/* max / min / clamp for every scalar integer type.
+ * clamp(v, l, u) first caps v at u and then raises the result to at least l. */
+#define DEF_MIN_MAX_CLAMP(T) \
+OVERLOADABLE T max(T a, T b) { \
+  if (a > b) \
+    return a; \
+  return b; \
+} \
+OVERLOADABLE T min(T a, T b) { \
+  if (a < b) \
+    return a; \
+  return b; \
+} \
+OVERLOADABLE T clamp(T v, T l, T u) { \
+  T capped = min(v, u); \
+  return max(capped, l); \
+}
+DEF_MIN_MAX_CLAMP(int)
+DEF_MIN_MAX_CLAMP(short)
+DEF_MIN_MAX_CLAMP(char)
+DEF_MIN_MAX_CLAMP(uint)
+DEF_MIN_MAX_CLAMP(unsigned short)
+DEF_MIN_MAX_CLAMP(unsigned char)
+DEF_MIN_MAX_CLAMP(long)
+DEF_MIN_MAX_CLAMP(ulong)
+#undef DEF_MIN_MAX_CLAMP
diff --git a/backend/src/libocl/tmpl/ocl_integer.tmpl.h b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
new file mode 100644
index 0000000..f067b8d
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_integer.tmpl.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_INTEGER_H__
+#define __OCL_INTEGER_H__
+
+#include "ocl_types.h"
+
+/* Integer limit macros.
+ * CHAR_MAX / CHAR_MIN alias the SCHAR_* values defined a few lines below
+ * (macro expansion is lazy, so the ordering is harmless); i.e. plain char is
+ * given the signed range here. The *_MIN values are written as
+ * "-(MAX) - 1" expressions rather than as a single negative literal. */
+#define CHAR_BIT    8
+#define CHAR_MAX    SCHAR_MAX
+#define CHAR_MIN    SCHAR_MIN
+#define INT_MAX     2147483647
+#define INT_MIN     (-2147483647 - 1)
+#define LONG_MAX    0x7fffffffffffffffL
+#define LONG_MIN    (-0x7fffffffffffffffL - 1)
+#define SCHAR_MAX   127
+#define SCHAR_MIN   (-127 - 1)
+#define SHRT_MAX    32767
+#define SHRT_MIN    (-32767 - 1)
+#define UCHAR_MAX   255
+#define USHRT_MAX   65535
+#define UINT_MAX    0xffffffff
+#define ULONG_MAX   0xffffffffffffffffUL
+
+/* clz: one overload per scalar integer type; the return type matches the
+ * argument type. */
+OVERLOADABLE char clz(char x);
+OVERLOADABLE uchar clz(uchar x);
+OVERLOADABLE short clz(short x);
+OVERLOADABLE ushort clz(ushort x);
+OVERLOADABLE int clz(int x);
+OVERLOADABLE uint clz(uint x);
+OVERLOADABLE long clz(long x);
+OVERLOADABLE ulong clz(ulong x);
+
+/* popcount: one overload per scalar integer type. */
+OVERLOADABLE char popcount(char x);
+OVERLOADABLE uchar popcount(uchar x);
+OVERLOADABLE short popcount(short x);
+OVERLOADABLE ushort popcount(ushort x);
+OVERLOADABLE int popcount(int x);
+OVERLOADABLE uint popcount(uint x);
+OVERLOADABLE long popcount(long x);
+OVERLOADABLE ulong popcount(ulong x);
+
+/* mul_hi: one overload per scalar integer type. */
+OVERLOADABLE char mul_hi(char x, char y);
+OVERLOADABLE uchar mul_hi(uchar x, uchar y);
+OVERLOADABLE short mul_hi(short x, short y);
+OVERLOADABLE ushort mul_hi(ushort x, ushort y);
+OVERLOADABLE int mul_hi(int x, int y);
+OVERLOADABLE uint mul_hi(uint x, uint y);
+OVERLOADABLE long mul_hi(long x, long y);
+OVERLOADABLE ulong mul_hi(ulong x, ulong y);
+
+/* add_sat / sub_sat prototypes. SDEF and UDEF expand to the same pair of
+ * declarations; they are instantiated for the signed and the unsigned types
+ * respectively. */
+#define SDEF(TYPE)        \
+OVERLOADABLE TYPE add_sat(TYPE x, TYPE y);   \
+OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y);
+SDEF(char);
+SDEF(short);
+SDEF(int);
+SDEF(long);
+#undef SDEF
+#define UDEF(TYPE)  \
+OVERLOADABLE TYPE add_sat(TYPE x, TYPE y);   \
+OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y);
+UDEF(uchar);
+UDEF(ushort);
+UDEF(uint);
+UDEF(ulong);
+#undef UDEF
+
+/* mad_hi prototypes, one per scalar integer type. */
+#define DEF(type) OVERLOADABLE type mad_hi(type a, type b, type c);
+DEF(char)
+DEF(uchar)
+DEF(short)
+DEF(ushort)
+DEF(int)
+DEF(uint)
+DEF(long)
+DEF(ulong)
+#undef DEF
+
+/* mul24 / mad24 exist only for the 32-bit types. */
+OVERLOADABLE int mul24(int a, int b);
+OVERLOADABLE uint mul24(uint a, uint b);
+
+OVERLOADABLE int mad24(int a, int b, int c);
+OVERLOADABLE uint mad24(uint a, uint b, uint c);
+
+/* mad_sat prototypes, one per scalar integer type. */
+OVERLOADABLE char mad_sat(char a, char b, char c) ;
+OVERLOADABLE uchar mad_sat(uchar a, uchar b, uchar c);
+OVERLOADABLE short mad_sat(short a, short b, short c);
+OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c);
+OVERLOADABLE int mad_sat(int a, int b, int c);
+OVERLOADABLE uint mad_sat(uint a, uint b, uint c);
+OVERLOADABLE long mad_sat(long a, long b, long c);
+OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c);
+
+/* rotate prototypes. The second macro argument is not used by this
+ * declaration-only expansion. */
+#define DEF(type, m) OVERLOADABLE type rotate(type x, type y);
+DEF(char, 7)
+DEF(uchar, 7)
+DEF(short, 15)
+DEF(ushort, 15)
+DEF(int, 31)
+DEF(uint, 31)
+DEF(long, 63)
+DEF(ulong, 63)
+#undef DEF
+
+/* upsample: the result type is twice as wide as the arguments; the low half
+ * is always passed as an unsigned value. */
+OVERLOADABLE short upsample(char hi, uchar lo);
+OVERLOADABLE ushort upsample(uchar hi, uchar lo);
+OVERLOADABLE int upsample(short hi, ushort lo);
+OVERLOADABLE uint upsample(ushort hi, ushort lo);
+OVERLOADABLE long upsample(int hi, uint lo);
+OVERLOADABLE ulong upsample(uint hi, uint lo);
+
+/* hadd / rhadd prototypes. DEC lists the 8- and 16-bit types once; DEF is
+ * redefined before each use of DEC to choose which function is declared. */
+#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
+#define DEF(type) OVERLOADABLE type hadd(type x, type y);
+DEC
+#undef DEF
+#define DEF(type) OVERLOADABLE type rhadd(type x, type y);
+DEC
+#undef DEF
+#undef DEC
+/* 32- and 64-bit hadd / rhadd are declared explicitly. */
+OVERLOADABLE int hadd(int x, int y);
+OVERLOADABLE uint hadd(uint x, uint y);
+OVERLOADABLE int rhadd(int x, int y);
+OVERLOADABLE uint rhadd(uint x, uint y);
+OVERLOADABLE long hadd(long x, long y);
+OVERLOADABLE ulong hadd(ulong x, ulong y);
+OVERLOADABLE long rhadd(long x, long y);
+OVERLOADABLE ulong rhadd(ulong x, ulong y);
+
+/* abs on a signed type returns the unsigned type of the same width
+ * (u##TYPE pastes the "u" prefix onto the type name). */
+#define DEC(TYPE) OVERLOADABLE u##TYPE abs(TYPE x);
+DEC(int)
+DEC(short)
+DEC(char)
+#undef DEC
+OVERLOADABLE ulong abs(long x);
+/* abs on an unsigned type keeps the argument type. */
+#define DEC(TYPE) OVERLOADABLE TYPE abs(TYPE x);
+DEC(uint)
+DEC(ushort)
+DEC(uchar)
+DEC(ulong)
+#undef DEC
+
+/* abs_diff for char and short types; the result is always the unsigned type
+ * of the same width. */
+#define DEC(TYPE, UTYPE) OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y);
+DEC(char, uchar)
+DEC(uchar, uchar)
+DEC(short, ushort)
+DEC(ushort, ushort)
+#undef DEC
+
+/* abs_diff for the 32- and 64-bit types. */
+OVERLOADABLE uint abs_diff (uint x, uint y);
+OVERLOADABLE uint abs_diff (int x, int y);
+OVERLOADABLE ulong abs_diff (long x, long y);
+OVERLOADABLE ulong abs_diff (ulong x, ulong y);
+
+
+/* max / min / clamp prototypes for every scalar integer type. */
+#define DECL_MIN_MAX_CLAMP(TYPE) \
+OVERLOADABLE TYPE max(TYPE a, TYPE b);  \
+OVERLOADABLE TYPE min(TYPE a, TYPE b);  \
+OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u);
+DECL_MIN_MAX_CLAMP(int)
+DECL_MIN_MAX_CLAMP(short)
+DECL_MIN_MAX_CLAMP(char)
+DECL_MIN_MAX_CLAMP(uint)
+DECL_MIN_MAX_CLAMP(unsigned short)
+DECL_MIN_MAX_CLAMP(unsigned char)
+DECL_MIN_MAX_CLAMP(long)
+DECL_MIN_MAX_CLAMP(ulong)
+#undef DECL_MIN_MAX_CLAMP
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.cl b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
new file mode 100644
index 0000000..236fa0b
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.cl
@@ -0,0 +1,3442 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_math.h"
+#include "ocl_float.h"
+#include "ocl_relational.h"
+#include "ocl_common.h"
+
+/* NOTE(review): presumably selects between the "fast path" and the
+ * high-accuracy implementations further down; its readers are not visible
+ * in this part of the file -- confirm against the dispatching code. */
+constant int __ocl_math_fastpath_flag = 1;
+
+/* Scalar float intrinsics; only declared here, implemented elsewhere.
+ * The native_* wrappers below use __gen_ocl_log as a base-2 logarithm
+ * (native_log2) and __gen_ocl_exp as a base-2 exponential (native_exp2).
+ * NOTE(review): rndz/rnde/rndu/rndd look like round toward zero / to even /
+ * up / down; only rndd's use as floor is established in this file. */
+PURE CONST float __gen_ocl_fabs(float x);
+PURE CONST float __gen_ocl_sin(float x);
+PURE CONST float __gen_ocl_cos(float x);
+PURE CONST float __gen_ocl_sqrt(float x);
+PURE CONST float __gen_ocl_rsqrt(float x);
+PURE CONST float __gen_ocl_log(float x);
+PURE CONST float __gen_ocl_exp(float x);
+PURE CONST float __gen_ocl_pow(float x, float y);
+PURE CONST float __gen_ocl_rcp(float x);
+PURE CONST float __gen_ocl_rndz(float x);
+PURE CONST float __gen_ocl_rnde(float x);
+PURE CONST float __gen_ocl_rndu(float x);
+PURE CONST float __gen_ocl_rndd(float x);
+
+
+/* native_* functions: thin wrappers over the intrinsics declared above. */
+OVERLOADABLE float native_cos(float x) {
+  return __gen_ocl_cos(x);
+}
+OVERLOADABLE float native_sin(float x) {
+  return __gen_ocl_sin(x);
+}
+OVERLOADABLE float native_sqrt(float x) {
+  return __gen_ocl_sqrt(x);
+}
+OVERLOADABLE float native_rsqrt(float x) {
+  return __gen_ocl_rsqrt(x);
+}
+OVERLOADABLE float native_log2(float x) {
+  return __gen_ocl_log(x);
+}
+/* Natural log: base-2 log scaled by a constant. */
+OVERLOADABLE float native_log(float x) {
+  const float log2_x = native_log2(x);
+  return log2_x * 0.6931472002f;
+}
+/* Base-10 log: base-2 log scaled by a constant. */
+OVERLOADABLE float native_log10(float x) {
+  const float log2_x = native_log2(x);
+  return log2_x * 0.3010299956f;
+}
+OVERLOADABLE float native_powr(float x, float y) {
+  return __gen_ocl_pow(x,y);
+}
+OVERLOADABLE float native_recip(float x) {
+  return __gen_ocl_rcp(x);
+}
+/* Tangent as sine over cosine. */
+OVERLOADABLE float native_tan(float x) {
+  const float sine = native_sin(x);
+  const float cosine = native_cos(x);
+  return sine / cosine;
+}
+OVERLOADABLE float native_exp2(float x) {
+  return __gen_ocl_exp(x);
+}
+/* e^x through the base-2 exponential of M_LOG2E_F * x. */
+OVERLOADABLE float native_exp(float x) {
+  return __gen_ocl_exp(M_LOG2E_F*x);
+}
+OVERLOADABLE float native_exp10(float x) {
+  return __gen_ocl_pow(10, x);
+}
+OVERLOADABLE float native_divide(float x, float y) {
+  return x/y;
+}
+
+/* Fast path: closed-form expressions built from the native_* wrappers and
+ * the intrinsics. */
+OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x) {
+    const float root_hi = native_sqrt(x + 1);
+    const float root_lo = native_sqrt(x - 1);
+    return native_log(x + root_hi * root_lo);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x) {
+    const float root = native_sqrt(x * x + 1);
+    return native_log(x + root);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x) {
+    const float ratio = (1 + x) / (1 - x);
+    return 0.5f * native_log(ratio);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x) {
+    return __gen_ocl_pow(x, 0.3333333333f);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x) {
+    return native_cos(x);
+}
+/* (1 + e^-2x) / (2 e^-x) */
+OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x) {
+    const float numer = 1 + native_exp(-2 * x);
+    const float denom = 2 * native_exp(-x);
+    return numer / denom;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x) {
+    return __gen_ocl_cos(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x) {
+    return native_exp(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x) {
+    return native_exp10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x) {
+    const float e_to_x = __gen_ocl_pow(M_E_F, x);
+    return e_to_x - 1;
+}
+/* x minus y times the quotient rounded with __gen_ocl_rndz. */
+OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y) {
+    const float quotient = __gen_ocl_rndz(x/y);
+    return x-y*quotient;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y) {
+    return __gen_ocl_sqrt(x*x + y*y);
+}
+OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x) {
+    const float log2_x = native_log2(x);
+    return __gen_ocl_rndd(log2_x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n) {
+    const float scale = __gen_ocl_pow(2, n);
+    return scale * x;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x) {
+    return native_log(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x) {
+    return native_log2(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x) {
+    return native_log10(x);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x) {
+    return native_log(x + 1);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x) {
+    const float log2_x = native_log2(x);
+    return __gen_ocl_rndd(log2_x);
+}
+/* x minus y times the quotient rounded with __gen_ocl_rnde. */
+OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y) {
+    const float quotient = __gen_ocl_rnde(x/y);
+    return x-y*quotient;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n) {
+    const float exponent = 1.f / n;
+    return __gen_ocl_pow(x, exponent);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x) {
+    return native_sin(x);
+}
+/* sincos: the sine is returned, the cosine is stored through cosval. One
+ * overload per address space of the output pointer. */
+OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __global float *cosval) {
+    const float sine = native_sin(x);
+    *cosval = native_cos(x);
+    return sine;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __local float *cosval) {
+    const float sine = native_sin(x);
+    *cosval = native_cos(x);
+    return sine;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __private float *cosval) {
+    const float sine = native_sin(x);
+    *cosval = native_cos(x);
+    return sine;
+}
+/* (1 - e^-2x) / (2 e^-x) */
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x) {
+    const float numer = 1 - native_exp(-2 * x);
+    const float denom = 2 * native_exp(-x);
+    return numer / denom;
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x) {
+    return __gen_ocl_sin(x * M_PI_F);
+}
+OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x) {
+    return native_tan(x);
+}
+/* (1 - e^-2x) / (1 + e^-2x) */
+OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x) {
+    const float e_m2x = native_exp(-2 * x);
+    return (1 - e_m2x) / (1 + e_m2x);
+}
+
+
+/* Internal (high accuracy) implementations. */
+
+/* floor: forwarded to the __gen_ocl_rndd intrinsic. */
+OVERLOADABLE float __gen_ocl_internal_floor(float x) {
+  return __gen_ocl_rndd(x);
+}
+/* copysign: combine the low 31 bits of x with bit 31 of y. */
+OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
+  union { unsigned u; float f; } mag, sgn;
+  mag.f = x;
+  sgn.f = y;
+  const unsigned magnitude_bits = mag.u & 0x7fffffff;
+  const unsigned sign_bit = sgn.u & 0x80000000u;
+  mag.u = magnitude_bits | sign_bit;
+  return mag.f;
+}
+
+/*
+ * Natural logarithm of x in single precision (fdlibm-derived, see the notice
+ * inside the function).
+ *
+ * Early exits: +-0 gives -inf, negative inputs give NaN, positive inputs
+ * below 2**-126 give -INFINITY (the subnormal rescaling of the original code
+ * is commented out), and inputs whose bit pattern is >= 0x7f800000 return
+ * x+x. Otherwise the result is assembled from k*ln2 (ln2 split into
+ * ln2_hi + ln2_lo) and a polynomial in the reduced argument f.
+ */
+OVERLOADABLE float __gen_ocl_internal_log(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union { unsigned int i; float f; } u;
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /* 3E924925 */
+  Lg4 = 2.2222198546e-01, /* 3E638E29 */
+  Lg5 = 1.8183572590e-01, /* 3E3A3325 */
+  Lg6 = 1.5313838422e-01, /* 3E1CD04F */
+  Lg7 = 1.4798198640e-01; /* 3E178897 */
+
+  const float zero   =  0.0;
+  float hfsq,f,s,z,R,w,t1,t2,dk;
+  int k,ix,i,j;
+
+  /* ix holds the raw bit pattern of x, read as a signed int */
+  u.f = x;  ix = u.i;
+  k=0;
+  if (ix < 0x00800000) {      /* x < 2**-126  */
+      if ((ix&0x7fffffff)==0)
+    return -two25/zero;   /* log(+-0)=-inf */
+      if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
+      return -INFINITY;  /* Gen does not support subnormal number now */
+      //k -= 25; x *= two25; /* subnormal number, scale up x */
+      //u.f = x;  ix = u.i;
+  }
+  if (ix >= 0x7f800000) return x+x;
+  k += (ix>>23)-127;          /* exponent field minus the bias */
+  ix &= 0x007fffff;           /* keep only the mantissa bits */
+  i = (ix+(0x95f64<<3))&0x800000;
+  u.i = ix|(i^0x3f800000); x = u.f;
+  k += (i>>23);
+  f = x-(float)1.0;
+  if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
+      if(f==zero) {
+        if(k==0) return zero;
+        else {
+          dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
+        }
+      }
+      R = f*f*((float)0.5-(float)0.33333333333333333*f);
+      if(k==0)
+        return f-R;
+      else {
+        dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
+      }
+  }
+  s = f/((float)2.0+f);
+  dk = (float)k;
+  z = s*s;
+  i = ix-(0x6147a<<3);
+  w = z*z;
+  j = (0x6b851<<3)-ix;
+  /* polynomial in z = s*s, split into two interleaved sums */
+  t1= w*(Lg2+w*(Lg4+w*Lg6));
+  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+  i |= j;
+  R = t2+t1;
+  if(i>0) {
+      hfsq=(float)0.5*f*f;
+      if(k==0) return f-(hfsq-s*(hfsq+R)); else
+         return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
+  } else {
+      if(k==0) return f-s*(f-R); else
+         return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
+  }
+}
+
+
+/*
+ * Base-10 logarithm of x in single precision (fdlibm-derived, see the notice
+ * inside the function).
+ *
+ * Early exits: +-0 gives -inf, negative inputs give NaN, positive inputs
+ * below 2**-126 give -INFINITY (subnormal rescaling is commented out), and
+ * inputs whose bit pattern is >= 0x7f800000 return x+x. Otherwise the
+ * exponent is split off as y, the remaining value goes through
+ * __gen_ocl_internal_log scaled by ivln10, and y*log10(2) (split into
+ * log10_2hi + log10_2lo) is added.
+ */
+OVERLOADABLE float __gen_ocl_internal_log10(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  union {float f; unsigned i; }u;
+  const float
+  zero       = 0.0,
+  two25      =  3.3554432000e+07, /* 0x4c000000 */
+  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
+  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
+  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
+
+  float y,z;
+  int i,k,hx;
+
+  /* hx holds the raw bit pattern of x, read as a signed int */
+  u.f = x; hx = u.i;
+  k=0;
+  if (hx < 0x00800000) {                  /* x < 2**-126  */
+    if ((hx&0x7fffffff)==0)
+      return -two25/zero;             /* log(+-0)=-inf */
+    if (hx<0) return NAN;        /* log(-#) = NaN */
+    return -INFINITY;      /* Gen does not support subnormal now */
+    //k -= 25; x *= two25; /* subnormal number, scale up x */
+    //u.f = x; hx = u.i;
+  }
+  if (hx >= 0x7f800000) return x+x;
+  k += (hx>>23)-127;                  /* exponent field minus the bias */
+  i  = ((unsigned)k&0x80000000)>>31;  /* 1 when k is negative, else 0 */
+  hx = (hx&0x007fffff)|((0x7f-i)<<23);
+  y  = (float)(k+i);
+  u.i = hx; x = u.f;
+  z  = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
+  return  z+y*log10_2hi;
+}
+
+
+/*
+ * Base-2 logarithm of x in single precision (fdlibm-derived, see the notice
+ * inside the function).
+ *
+ * Early exits:
+ *   - +-0 returns -inf,
+ *   - negative inputs return NaN,
+ *   - positive inputs below 2**-126 (subnormals) return -INFINITY; no
+ *     rescaling is attempted,
+ *   - inputs whose bit pattern is >= 0x7f800000 return x+x.
+ * Otherwise the result is the integer exponent dk plus a polynomial
+ * approximation of the natural log of the reduced argument divided by ln2.
+ */
+OVERLOADABLE float __gen_ocl_internal_log2(float x) {
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ *  adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float zero   =  0.0,
+  ln2 = 0.69314718055994530942,
+  two25 =    3.355443200e+07, /** 0x4c000000 */
+  Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
+  Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
+  Lg3 = 2.8571429849e-01, /** 3E924925 */
+  Lg4 = 2.2222198546e-01, /** 3E638E29 */
+  Lg5 = 1.8183572590e-01, /** 3E3A3325 */
+  Lg6 = 1.5313838422e-01, /** 3E1CD04F */
+  Lg7 = 1.4798198640e-01; /** 3E178897 */
+
+  float hfsq,f,s,z,R,w,t1,t2,dk;
+  int k,ix,i,j;
+
+  /* ix holds the raw bit pattern of x, read as a signed int */
+  union {float f; int i; }u;
+  u.f = x; ix = u.i;
+
+  k=0;
+  if (ix < 0x00800000) {           /** x < 2**-126  */
+      if ((ix&0x7fffffff)==0)
+        return -two25/(x-x);        /** log(+-0)=-inf */
+
+      if (ix<0) return (x-x)/(x-x);    /** log(-#) = NaN */
+      /* Subnormal input: return -inf outright. The statements that used to
+       * follow this return (scaling x up by two25) could never execute and
+       * have been dropped. */
+      return -INFINITY;
+  }
+
+  if (ix >= 0x7f800000) return x+x;
+
+  k += (ix>>23)-127;               /* exponent field minus the bias */
+  ix &= 0x007fffff;                /* keep only the mantissa bits */
+  i = (ix+(0x95f64<<3))&0x800000;
+
+  u.i = ix|(i^0x3f800000); x = u.f;    /** normalize x or x/2 */
+  k += (i>>23);
+  dk = (float)k;
+  f = x-(float)1.0;
+
+  if((0x007fffff&(15+ix))<16) {    /** |f| < 2**-20 */
+      if(f==zero) return dk;
+
+      R = f*f*((float)0.5-(float)0.33333333333333333*f);
+      return dk-(R-f)/ln2;
+  }
+
+  s = f/((float)2.0+f);
+  z = s*s;
+  i = ix-(0x6147a<<3);
+  w = z*z;
+  j = (0x6b851<<3)-ix;
+  /* polynomial in z = s*s, split into two interleaved sums */
+  t1= w*(Lg2+w*(Lg4+w*Lg6));
+  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
+  i |= j;
+  R = t2+t1;
+
+  if(i>0) {
+      hfsq=(float)0.5*f*f;
+      return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
+  } else {
+      return dk-((s*(f-R))-f)/ln2;
+  }
+}
+
+
+/*
+ * Scale x by 2**n by adding n to the exponent field of x.
+ *
+ * +-0 and values with an all-ones exponent (NaN/Inf) are returned via the
+ * early exits below. Results that fall outside the normal exponent range
+ * are produced by multiplying huge*huge or tiny*tiny (with the sign of x),
+ * or by building a value 25 exponent steps higher and multiplying it by
+ * twom25.
+ * GEN_OCL_GET_FLOAT_WORD / GEN_OCL_SET_FLOAT_WORD are defined outside this
+ * file; they are used here to read and write the raw bits of x.
+ */
+float __gen_ocl_scalbnf (float x, int n){
+  /* copy from fdlibm */
+  float two25 = 3.355443200e+07,	/* 0x4c000000 */
+  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
+  huge = 1.0e+30,
+  tiny = 1.0e-30;
+  int k,ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  k = (ix&0x7f800000)>>23; /* extract exponent */
+  if (k==0) {	/* 0 or subnormal x */
+    if ((ix&0x7fffffff)==0) return x; /* +-0 */
+    x *= two25;
+    GEN_OCL_GET_FLOAT_WORD(ix,x);
+    k = ((ix&0x7f800000)>>23) - 25;
+  }
+  if (k==0xff) return x+x;	/* NaN or Inf */
+  if (n< -50000)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  if (n> 50000 || k+n > 0xfe)
+    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
+  /* Now k and n are bounded we know that k = k+n does not overflow. */
+  k = k+n;
+  if (k > 0) { /* normal result */
+    /* keep sign and mantissa bits, replace the exponent field with k */
+    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+    return x;
+  }
+  if (k <= -25)
+    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
+  k += 25;				/* subnormal result */
+  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
+  return x*twom25;
+}
+
+
+/* Table of 11 floats in decreasing magnitude whose leading terms add up to
+ * roughly pi/2 (1.5703125 + 4.5776e-04 + 2.5988e-05 + ...). The hex value
+ * next to each entry is its float bit pattern; in every one the low 16 bits
+ * are zero.
+ * NOTE(review): presumably the split-pi/2 table used by the argument
+ * reduction code below -- its readers are not visible here. */
+__constant const float PIo2[] = {
+  1.5703125000e+00, /* 0x3fc90000 */
+  4.5776367188e-04, /* 0x39f00000 */
+  2.5987625122e-05, /* 0x37da0000 */
+  7.5437128544e-08, /* 0x33a20000 */
+  6.0026650317e-11, /* 0x2e840000 */
+  7.3896444519e-13, /* 0x2b500000 */
+  5.3845816694e-15, /* 0x27c20000 */
+  5.6378512969e-18, /* 0x22d00000 */
+  8.3009228831e-20, /* 0x1fc40000 */
+  3.2756352257e-22, /* 0x1bc60000 */
+  6.3331015649e-25, /* 0x17440000 */
+};
+
+
+/*
+ * Multi-precision core of the pi/2 argument reduction (ported from fdlibm).
+ *
+ *   x     : input, nx float chunks (x[0..nx-1] are read)
+ *   y     : output remainder; y[0] for prec 0, y[0..1] for prec 1 and 2,
+ *           y[0..2] for prec 3
+ *   e0    : exponent of the input chunks; selects the starting table index jv
+ *           and the residual shift q0
+ *   nx    : number of chunks in x
+ *   prec  : selects the term count jk from init_jk {4,7,9} and the output
+ *           layout in the final switch
+ *   ipio2 : integer table that is multiplied against x[]; the caller in this
+ *           file passes two_over_pi
+ *
+ * Returns n&7, the low three bits of the integer part of the product.  When
+ * the fractional part reaches 0.5 (ih>0) n is incremented, the fraction is
+ * replaced by 1-fraction and the outputs are negated.
+ *
+ * NOTE(review): init_jk has 3 entries, so prec == 3 would index past it even
+ * though the final switch has a case 3; the only caller visible here passes
+ * prec == 2.
+ */
+int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const __constant int *ipio2)
+{
+  /* copied from fdlibm */
+const float
+zero   = 0.0,
+one    = 1.0,
+two8   =  2.5600000000e+02, /* 0x43800000 */
+twon8  =  3.9062500000e-03; /* 0x3b800000 */
+
+  int init_jk[3]; /* initial value for jk */
+  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
+  float z,fw,f[20],fq[20],q[20];
+  init_jk[0] = 4; init_jk[1] = 7; init_jk[2] = 9;
+    /* initialize jk*/
+  jk = init_jk[prec];
+  jp = jk;
+
+    /* determine jx,jv,q0, note that 3>q0 */
+  jx =  nx-1;
+  jv = (e0-3)/8; if(jv<0) jv=0;
+  q0 =  e0-8*(jv+1);
+
+    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
+  j = jv-jx; m = jx+jk;
+  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (float) ipio2[j];
+
+    /* compute q[0],q[1],...q[jk] */
+  for (i=0;i<=jk;i++) {
+      /* the assignment to q[i] runs once per i, after the inner loop */
+      for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
+  }
+
+  jz = jk;
+recompute:
+    /* distill q[] into iq[] reversingly */
+  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
+      fw    =  (float)((int)(twon8* z));
+      iq[i] =  (int)(z-two8*fw);
+      z     =  q[j-1]+fw;
+  }
+
+    /* compute n */
+  z  = __gen_ocl_scalbnf(z,q0);   /* actual value of z */
+  z -= (float)8.0*__gen_ocl_internal_floor(z*(float)0.125); /* trim off integer >= 8 */
+  n  = (int) z;
+  z -= (float)n;
+  ih = 0;
+  if(q0>0) {  /* need iq[jz-1] to determine n */
+      i  = (iq[jz-1]>>(8-q0)); n += i;
+      iq[jz-1] -= i<<(8-q0);
+      ih = iq[jz-1]>>(7-q0);
+  }
+  else if(q0==0) ih = iq[jz-1]>>8;
+  else if(z>=(float)0.5) ih=2;
+
+  if(ih>0) {  /* q > 0.5 */
+      n += 1; carry = 0;
+      for(i=0;i<jz ;i++) {  /* compute 1-q */
+    j = iq[i];
+    if(carry==0) {
+        if(j!=0) {
+      carry = 1; iq[i] = 0x100- j;
+        }
+    } else  iq[i] = 0xff - j;
+      }
+      if(q0>0) {    /* rare case: chance is 1 in 12 */
+          switch(q0) {
+          case 1:
+           iq[jz-1] &= 0x7f; break;
+        case 2:
+           iq[jz-1] &= 0x3f; break;
+          }
+      }
+      if(ih==2) {
+    z = one - z;
+    if(carry!=0) z -= __gen_ocl_scalbnf(one,q0);
+      }
+  }
+
+    /* check if recomputation is needed */
+  if(z==zero) {
+      j = 0;
+      for (i=jz-1;i>=jk;i--) j |= iq[i];
+      if(j==0) { /* need recomputation */
+    for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
+
+    for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
+        f[jx+i] = (float) ipio2[jv+i];
+        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
+        q[i] = fw;
+    }
+    jz += k;
+    goto recompute;
+      }
+  }
+
+    /* chop off zero terms */
+  if(z==(float)0.0) {
+      jz -= 1; q0 -= 8;
+      while(iq[jz]==0) { jz--; q0-=8;}
+  } else { /* break z into 8-bit if necessary */
+      z = __gen_ocl_scalbnf(z,-q0);
+      if(z>=two8) {
+    fw = (float)((int)(twon8*z));
+    iq[jz] = (int)(z-two8*fw);
+    jz += 1; q0 += 8;
+    iq[jz] = (int) fw;
+      } else iq[jz] = (int) z ;
+  }
+
+    /* convert integer "bit" chunk to floating-point value */
+  fw = __gen_ocl_scalbnf(one,q0);
+  for(i=jz;i>=0;i--) {
+      q[i] = fw*(float)iq[i]; fw*=twon8;
+  }
+
+    /* compute PIo2[0,...,jp]*q[jz,...,0] */
+  for(i=jz;i>=0;i--) {
+      for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
+      fq[jz-i] = fw;
+  }
+
+    /* compress fq[] into y[] */
+  switch(prec) {
+      case 0:
+    fw = 0.0;
+    for (i=jz;i>=0;i--) fw += fq[i];
+    y[0] = (ih==0)? fw: -fw;
+    break;
+      case 1:
+      case 2:
+    /* y[0] is the rounded sum, y[1] the part of the sum lost to rounding */
+    fw = 0.0;
+    for (i=jz;i>=0;i--) fw += fq[i];
+    y[0] = (ih==0)? fw: -fw;
+    fw = fq[0]-fw;
+    for (i=1;i<=jz;i++) fw += fq[i];
+    y[1] = (ih==0)? fw: -fw;
+    break;
+      case 3: /* painful */
+    for (i=jz;i>0;i--) {
+        fw      = fq[i-1]+fq[i];
+        fq[i]  += fq[i-1]-fw;
+        fq[i-1] = fw;
+    }
+    for (i=jz;i>1;i--) {
+        fw      = fq[i-1]+fq[i];
+        fq[i]  += fq[i-1]-fw;
+        fq[i-1] = fw;
+    }
+    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
+    if(ih==0) {
+        y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
+    } else {
+        y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
+    }
+  }
+  return n&7;
+
+}
+
+/*
+ * 32 bit patterns with the low 8 bits cleared.  __ieee754_rem_pio2f compares
+ * (ix & 0xffffff00) against npio2_hw[n-1] to decide whether its quick
+ * single-step reduction can be trusted.
+ * NOTE(review): the entries look like the float encodings of k*pi/2 for
+ * k = 1..32 (0x3fc90f00 ~ pi/2, 0x40490f00 ~ pi) -- confirm against fdlibm.
+ */
+__constant const int npio2_hw[32] = {
+0x3fc90f00, 0x40490f00, 0x4096cb00, 0x40c90f00, 0x40fb5300, 0x4116cb00,
+0x412fed00, 0x41490f00, 0x41623100, 0x417b5300, 0x418a3a00, 0x4196cb00,
+0x41a35c00, 0x41afed00, 0x41bc7e00, 0x41c90f00, 0x41d5a000, 0x41e23100,
+0x41eec200, 0x41fb5300, 0x4203f200, 0x420a3a00, 0x42108300, 0x4216cb00,
+0x421d1400, 0x42235c00, 0x4229a500, 0x422fed00, 0x42363600, 0x423c7e00,
+0x4242c700, 0x42490f00
+};
+
+/*
+ * 22*9 = 198 entries, each holding one 8-bit value.  Read in order they spell
+ * the hexadecimal expansion of 2/pi (0.A2F9836E4E44...).  The table is passed
+ * to __kernel_rem_pio2f as its ipio2 argument by __ieee754_rem_pio2f.
+ */
+__constant const int two_over_pi[22*9] = {
+0xA2, 0xF9, 0x83, 0x6E, 0x4E, 0x44, 0x15, 0x29, 0xFC,
+0x27, 0x57, 0xD1, 0xF5, 0x34, 0xDD, 0xC0, 0xDB, 0x62,
+0x95, 0x99, 0x3C, 0x43, 0x90, 0x41, 0xFE, 0x51, 0x63,
+0xAB, 0xDE, 0xBB, 0xC5, 0x61, 0xB7, 0x24, 0x6E, 0x3A,
+0x42, 0x4D, 0xD2, 0xE0, 0x06, 0x49, 0x2E, 0xEA, 0x09,
+0xD1, 0x92, 0x1C, 0xFE, 0x1D, 0xEB, 0x1C, 0xB1, 0x29,
+0xA7, 0x3E, 0xE8, 0x82, 0x35, 0xF5, 0x2E, 0xBB, 0x44,
+0x84, 0xE9, 0x9C, 0x70, 0x26, 0xB4, 0x5F, 0x7E, 0x41,
+0x39, 0x91, 0xD6, 0x39, 0x83, 0x53, 0x39, 0xF4, 0x9C,
+0x84, 0x5F, 0x8B, 0xBD, 0xF9, 0x28, 0x3B, 0x1F, 0xF8,
+0x97, 0xFF, 0xDE, 0x05, 0x98, 0x0F, 0xEF, 0x2F, 0x11,
+0x8B, 0x5A, 0x0A, 0x6D, 0x1F, 0x6D, 0x36, 0x7E, 0xCF,
+0x27, 0xCB, 0x09, 0xB7, 0x4F, 0x46, 0x3F, 0x66, 0x9E,
+0x5F, 0xEA, 0x2D, 0x75, 0x27, 0xBA, 0xC7, 0xEB, 0xE5,
+0xF1, 0x7B, 0x3D, 0x07, 0x39, 0xF7, 0x8A, 0x52, 0x92,
+0xEA, 0x6B, 0xFB, 0x5F, 0xB1, 0x1F, 0x8D, 0x5D, 0x08,
+0x56, 0x03, 0x30, 0x46, 0xFC, 0x7B, 0x6B, 0xAB, 0xF0,
+0xCF, 0xBC, 0x20, 0x9A, 0xF4, 0x36, 0x1D, 0xA9, 0xE3,
+0x91, 0x61, 0x5E, 0xE6, 0x1B, 0x08, 0x65, 0x99, 0x85,
+0x5F, 0x14, 0xA0, 0x68, 0x40, 0x8D, 0xFF, 0xD8, 0x80,
+0x4D, 0x73, 0x27, 0x31, 0x06, 0x06, 0x15, 0x56, 0xCA,
+0x73, 0xA8, 0xC9, 0x60, 0xE2, 0x7B, 0xC0, 0x8C, 0x6B,
+};
+
+
+/*
+ * Reduce x modulo pi/2.
+ *
+ *   x : argument
+ *   y : output; y[0] receives the reduced argument and y[1] its
+ *       low-order correction
+ *
+ * Returns the multiple n of pi/2 that was removed (negated for negative x).
+ * Paths, selected on the bit pattern of |x|:
+ *   - |x| ~<= pi/4        : y[0] = x, y[1] = 0, returns 0
+ *   - |x| <  3pi/4        : subtract/add one pi/2, returns +1 / -1
+ *   - |x| ~<= 2^7*(pi/2)  : n = (int)(|x|*invpio2 + 0.5), up to three
+ *                           refinement rounds with pio2_1..pio2_3
+ *   - Inf or NaN          : y[0] = y[1] = x-x, returns 0
+ *   - everything else     : |x| is split into three chunks and handed to
+ *                           __kernel_rem_pio2f with prec 2 and two_over_pi
+ */
+int __ieee754_rem_pio2f(float x, float *y) {
+  /* copied from fdlibm */
+  float z,w,t,r,fn;
+  float tx[3];
+
+const float half_value = 5.0000000e-1;
+const float zero =  0.0000000000;
+const float two8 =  2.5600000000e+02;
+const float invpio2 =  6.3661980629e-01;
+const float pio2_1  =  1.5707855225e+00;
+const float pio2_1t =  1.0804334124e-05;
+const float pio2_2  =  1.0804273188e-05;
+const float pio2_2t =  6.0770999344e-11;
+const float pio2_3  =  6.0770943833e-11;
+const float pio2_3t =  6.1232342629e-17;
+  int e0,i,j,nx,n,ix,hx;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff; /* bit pattern of |x| */
+  if(ix<=0x3f490fd8)   /* |x| ~<= pi/4 , no need for reduction */
+      {y[0] = x; y[1] = 0; return 0;}
+  if(ix<0x4016cbe4) {  /* |x| < 3pi/4, special case with n=+-1 */
+      if(hx>0) {
+    z = x - pio2_1;
+    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+        y[0] = z - pio2_1t;
+        y[1] = (z-y[0])-pio2_1t;
+    } else {    /* near pi/2, use 24+24+24 bit pi */
+        z -= pio2_2;
+        y[0] = z - pio2_2t;
+        y[1] = (z-y[0])-pio2_2t;
+    }
+    return 1;
+      } else {  /* negative x */
+    z = x + pio2_1;
+    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
+        y[0] = z + pio2_1t;
+        y[1] = (z-y[0])+pio2_1t;
+    } else {    /* near pi/2, use 24+24+24 bit pi */
+        z += pio2_2;
+        y[0] = z + pio2_2t;
+        y[1] = (z-y[0])+pio2_2t;
+    }
+    return -1;
+      }
+  }
+  if(ix<=0x43490f80) { /* |x| ~<= 2^7*(pi/2), medium size */
+      t  = __gen_ocl_fabs(x);
+      n  = (int) (t*invpio2+half_value);
+      fn = (float)n;
+      r  = t-fn*pio2_1;
+      w  = fn*pio2_1t;  /* 1st round good to 40 bit */
+      if(n<32&&(ix&0xffffff00)!=npio2_hw[n-1]) {
+    y[0] = r-w; /* quick check no cancellation */
+      } else {
+          uint high;
+          j  = ix>>23; /* biased exponent of |x| */
+          y[0] = r-w;
+    GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+          i = j-((high>>23)&0xff); /* exponent drop = bits lost to cancellation */
+          if(i>8) {  /* 2nd iteration needed, good to 57 */
+        t  = r;
+        w  = fn*pio2_2;
+        r  = t-w;
+        w  = fn*pio2_2t-((t-r)-w);
+        y[0] = r-w;
+        GEN_OCL_GET_FLOAT_WORD(high,y[0]);
+        i = j-((high>>23)&0xff);
+        if(i>25)  { /* 3rd iteration need, 74 bits acc */
+          t  = r; /* will cover all possible cases */
+          w  = fn*pio2_3;
+          r  = t-w;
+          w  = fn*pio2_3t-((t-r)-w);
+          y[0] = r-w;
+        }
+    }
+      }
+      y[1] = (r-y[0])-w;
+      if(hx<0)  {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+      else   return n;
+  }
+    /*
+     * all other (large) arguments
+     */
+  if(ix>=0x7f800000) {    /* x is inf or NaN */
+      y[0]=y[1]=x-x; return 0;
+  }
+    /* set z = scalbn(|x|,ilogb(x)-7) */
+  e0  = (ix>>23)-134;   /* e0 = ilogb(z)-7; */
+  GEN_OCL_SET_FLOAT_WORD(z, ix - ((int)(e0<<23)));
+  /* peel off integer parts; each remainder is scaled by 2^8 for the next */
+  for(i=0;i<2;i++) {
+    tx[i] = (float)((int)(z));
+    z     = (z-tx[i])*two8;
+  }
+  tx[2] = z;
+  nx = 3;
+  while(tx[nx-1]==zero) nx--; /* skip zero term */
+  n  =  __kernel_rem_pio2f(tx,y,e0,nx,2,two_over_pi);
+  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
+  return n;
+}
+
+/*
+ * Polynomial sine kernel for an already reduced argument.
+ *
+ *   x  : reduced argument (head)
+ *   y  : tail of the reduced argument; only read when iy != 0
+ *   iy : 0 means "ignore y", any other value folds y into the result
+ *
+ * For |x| < 2**-27 whose integer conversion is 0, x itself is returned.
+ * Otherwise an odd polynomial in x with coefficients S1..S6 is evaluated.
+ */
+OVERLOADABLE float __kernel_sinf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+const float
+half_value =  5.0000000000e-01,/* 0x3f000000 */
+S1  = -1.6666667163e-01, /* 0xbe2aaaab */
+S2  =  8.3333337680e-03, /* 0x3c088889 */
+S3  = -1.9841270114e-04, /* 0xb9500d01 */
+S4  =  2.7557314297e-06, /* 0x3638ef1b */
+S5  = -2.5050759689e-08, /* 0xb2d72f34 */
+S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
+  float z,r,v;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* high word of x */
+  if(ix<0x32000000)     /* |x| < 2**-27 */
+     {if((int)x==0) return x;}    /* generate inexact */
+  z =  x*x;
+  v =  z*x;             /* x^3 */
+  r =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
+  if(iy==0) return x+v*(S1+z*r);
+  else      return x-((z*(half_value*y-v*r)-y)-v*S1);
+}
+
+/*
+ * Polynomial cosine kernel for an already reduced argument.
+ *
+ *   x : reduced argument (head)
+ *   y : tail of the reduced argument
+ *
+ * For |x| < 2**-27 whose integer conversion is 0, 1 is returned.  The
+ * argument is folded to x >= 0 (negating y as well).  If |x| exceeds pi/4
+ * (bit pattern > 0x3f490fdb) the result is computed as
+ * -__kernel_sinf(x - pi/2, y, 1), with pi/2 subtracted in three pieces.
+ * Otherwise an even polynomial with coefficients C1..C6 is evaluated, using
+ * a rearranged form for |x| >= 0.3.
+ */
+float __kernel_cosf(float x, float y)
+{
+  /* copied from fdlibm */
+  const float
+  one =  1.0000000000e+00, /* 0x3f800000 */
+  C1  =  4.1666667908e-02, /* 0x3d2aaaab */
+  C2  = -1.3888889225e-03, /* 0xbab60b61 */
+  C3  =  2.4801587642e-05, /* 0x37d00d01 */
+  C4  = -2.7557314297e-07, /* 0xb493f27c */
+  C5  =  2.0875723372e-09, /* 0x310f74f6 */
+  C6  = -1.1359647598e-11; /* 0xad47d74e */
+  const float pio2_hi = 0x1.92p0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
+  float a,hz,z,r,qx;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff;     /* ix = |x|'s high word*/
+  if(ix<0x32000000) {     /* if |x| < 2**-27 */
+      if(((int)x)==0) return one;   /* generate inexact */
+  }
+
+  if(x < 0.0f) { x= -x; y = -y; }   /* cos is even: work on |x| */
+  if(ix > 0x3f490fdb) { /* |x|>pi/4*/
+    return -__kernel_sinf(x-pio2_hi-pio2_mid-pio2_low, y, 1);
+  }
+  z  = x*x;
+  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
+  if(ix < 0x3e99999a)       /* if |x| < 0.3 */
+      return one - ((float)0.5*z - (z*r - x*y));
+  else {
+      /* lowering the exponent field by 2 turns the bits of |x| into |x|/4 */
+      GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
+      hz = (float)0.5*z-qx;
+      a  = one-qx;
+      return a - (hz - (z*r-x*y));
+  }
+}
+
+/*
+ * sin(x) for float (fdlibm algorithm).
+ *
+ * When __ocl_math_fastpath_flag is set the fast-path implementation is used.
+ * Otherwise: |x| ~<= pi/4 goes straight to the sine kernel, Inf/NaN yield
+ * x-x, and everything else is reduced modulo pi/2 and dispatched on the low
+ * two bits of the quadrant count.
+ */
+OVERLOADABLE float sin(float x) {
+  float rem[2];
+  int bits, quadrant;
+
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_sin(x);
+
+  GEN_OCL_GET_FLOAT_WORD(bits, x);
+  bits &= 0x7fffffff;               /* drop the sign bit */
+
+  /* |x| ~< pi/4: no reduction needed, tail is zero */
+  if (bits <= 0x3f490fd8)
+    return __kernel_sinf(x, 0.0f, 0);
+
+  /* sin(Inf or NaN) is NaN */
+  if (bits >= 0x7f800000)
+    return x - x;
+
+  /* argument reduction needed */
+  quadrant = __ieee754_rem_pio2f(x, rem) & 3;
+  if (quadrant == 0)
+    return  __kernel_sinf(rem[0], rem[1], 1);
+  if (quadrant == 1)
+    return  __kernel_cosf(rem[0], rem[1]);
+  if (quadrant == 2)
+    return -__kernel_sinf(rem[0], rem[1], 1);
+  return -__kernel_cosf(rem[0], rem[1]);
+}
+
+/*
+ * cos(x) for float (fdlibm algorithm).
+ *
+ * When __ocl_math_fastpath_flag is set the fast-path implementation is used.
+ * Otherwise: |x| ~<= pi/4 goes straight to the cosine kernel, Inf/NaN yield
+ * x-x, and everything else is reduced modulo pi/2 and dispatched on the low
+ * two bits of the quadrant count.
+ */
+OVERLOADABLE float cos(float x) {
+  float rem[2];
+  int bits, quadrant;
+
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_cos(x);
+
+  GEN_OCL_GET_FLOAT_WORD(bits, x);
+  bits &= 0x7fffffff;               /* drop the sign bit */
+
+  /* |x| ~< pi/4: no reduction needed, tail is zero */
+  if (bits <= 0x3f490fd8)
+    return __kernel_cosf(x, 0.0f);
+
+  /* cos(Inf or NaN) is NaN */
+  if (bits >= 0x7f800000)
+    return x - x;
+
+  /* argument reduction needed */
+  quadrant = __ieee754_rem_pio2f(x, rem) & 3;
+  if (quadrant == 0)
+    return  __kernel_cosf(rem[0], rem[1]);
+  if (quadrant == 1)
+    return -__kernel_sinf(rem[0], rem[1], 1);
+  if (quadrant == 2)
+    return -__kernel_cosf(rem[0], rem[1]);
+  return __kernel_sinf(rem[0], rem[1], 1);
+}
+
+/*
+ * Polynomial tangent kernel for an already reduced argument.
+ *
+ *   x  : reduced argument (head)
+ *   y  : tail of the reduced argument
+ *   iy : 1 returns the tangent itself; otherwise the negative reciprocal
+ *        -1/(x+r) is returned (callers in this file pass 1 or -1)
+ *
+ * For |x| >= 0.6744 the argument is first replaced by pi/4 - |x| (using
+ * pio4 + pio4lo) and the result is reconstructed from the polynomial value
+ * at that smaller argument, with the sign of the original x applied at the
+ * end.
+ */
+float __kernel_tanf(float x, float y, int iy)
+{
+  /* copied from fdlibm */
+        float z,r,v,w,s;
+        int ix,hx;
+        const float
+        one   =  1.0000000000e+00, /* 0x3f800000 */
+        pio4  =  7.8539812565e-01, /* 0x3f490fda */
+        pio4lo=  3.7748947079e-08; /* 0x33222168 */
+        float T[13];// =  {
+         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
+         T[1] = 1.3333334029e-01; /* 0x3e088889 */
+         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
+         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
+         T[4] = 8.8632395491e-03; /* 0x3c11371f */
+         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
+         T[6] = 1.4562094584e-03; /* 0x3abede48 */
+         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
+         T[8] = 2.4646313977e-04; /* 0x398137b9 */
+         T[9] = 7.8179444245e-05; /* 0x38a3f445 */
+         T[10] = 7.1407252108e-05; /* 0x3895c07a */
+         T[11] = -1.8558637748e-05; /* 0xb79bae5f */
+         T[12] = 2.5907305826e-05; /* 0x37d95384 */
+
+
+        GEN_OCL_GET_FLOAT_WORD(hx,x);
+        ix = hx&0x7fffffff;     /* high word of |x| */
+        if(ix<0x31800000)                       /* x < 2**-28 */
+            {if((int)x==0) {                    /* generate inexact */
+                /* ix == 0 and iy == -1: reciprocal of zero */
+                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
+                else return (iy==1)? x: -one/x;
+            }
+            }
+        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
+            if(hx<0) {x = -x; y = -y;}
+
+
+            z = pio4-x;
+            w = pio4lo-y;
+            x = z+w; y = 0.0;
+        }
+        z       =  x*x;
+        w       =  z*z;
+    /* Break x^5*(T[1]+x^2*T[2]+...) into
+     *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
+     *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
+     */
+        r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
+        v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
+        s = z*x;
+        r = y + z*(s*(r+v)+y);
+        r += T[0]*s;
+        w = x+r;
+        if(ix>=0x3f2ca140) {
+            v = (float)iy;
+            /* (1-((hx>>30)&2)) is +1 for non-negative input x, -1 for negative */
+            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r)));
+        }
+        if(iy==1) return w;
+        else {          /* if allow error up to 2 ulp
+                           simply return -1.0/(x+r) here */
+     /*  compute -1.0/(x+r) accurately */
+            float a,t;
+            int i;
+            z  = w;
+            GEN_OCL_GET_FLOAT_WORD(i,z);
+            GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000); /* z = w with low 12 bits cleared */
+            v  = r-(z - x);     /* z+v = r+x */
+            t = a  = -(float)1.0/w;     /* a = -1.0/w */
+            GEN_OCL_GET_FLOAT_WORD(i,t);
+            GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000); /* t = a with low 12 bits cleared */
+            s  = (float)1.0+t*z;
+            return t+a*(s+t*v);
+        }
+}
+
+/*
+ * tan(x) for float (fdlibm algorithm).
+ *
+ * When __ocl_math_fastpath_flag is set the fast-path implementation is used.
+ * Otherwise: |x| ~<= pi/4 goes straight to the tangent kernel, Inf/NaN yield
+ * x-x, and everything else is reduced modulo pi/2.  The parity of the
+ * quadrant count n selects the tangent (n even) or its negative reciprocal
+ * (n odd).  If the magnitude of the reduced argument still exceeds pi/4, the
+ * kernel is evaluated at pi/2 - |y| instead and the result is inverted and
+ * signed accordingly.
+ */
+OVERLOADABLE float tan(float x)
+{
+
+    if (__ocl_math_fastpath_flag)
+      return __gen_ocl_internal_fastpath_tan(x);
+
+  /* copied from fdlibm */
+        const float pio2_hi = 0x1.92p-0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
+        const float pio4  =  7.8539812565e-01;
+        float y[2],z=0.0;
+        int n, ix;
+
+        GEN_OCL_GET_FLOAT_WORD(ix,x);
+
+    /* |x| ~< pi/4 */
+        ix &= 0x7fffffff;
+        if(ix <= 0x3f490fda) return __kernel_tanf(x,z,1);
+
+    /* tan(Inf or NaN) is NaN */
+        else if (ix>=0x7f800000) return x-x;            /* NaN */
+
+    /* argument reduction needed */
+      else {
+        n = __ieee754_rem_pio2f(x,y);
+
+        x = y[0];
+        float m = y[1];
+        int iy = 1-((n&1)<<1);   /* +1 for even n, -1 for odd n */
+        GEN_OCL_GET_FLOAT_WORD(ix,x);
+        float sign = 1.0f;
+        if(ix < 0) {             /* sign bit set: work on |y| and remember the sign */
+          x = -x; m = -m;
+          sign = -1.0f;
+        }
+
+        if(x > pio4) {/* reduce x to less than pi/4 through (pi/2-x) */
+          /* t is the tangent of pi/2-|y|, i.e. the reciprocal of tan(|y|) */
+          float t = __kernel_tanf(pio2_hi-x+pio2_mid+pio2_low, -m, 1);
+          if(iy == -1) return sign*(-t); else return sign*1/t;
+        } else
+            return __kernel_tanf(y[0],y[1],1-((n&1)<<1)); /*   1 -- n even
+                                                              -1 -- n odd */
+      }
+}
+
+/*
+ * Computes cos(pi * x).
+ *
+ * Inf and NaN return NAN.  The sign of x is dropped (cos is even).  Above
+ * 2^24 every float is an even integer, so the result is 1.  Otherwise x is
+ * reduced to m in [0,2) using the parity of floor(x), and the eighth
+ * floor(4*m) selects which kernel is evaluated on an argument whose
+ * magnitude is at most pi/4.
+ */
+OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
+  int ix;
+  if(isinf(x) || isnan(x)) { return NAN; }
+  if(x < 0.0f) { x = -x; }
+  if(x> 0x1.0p24) return 1.0f;
+  float m = __gen_ocl_internal_floor(x);
+  ix = (int)m;
+  m = x-m;                      /* fractional part in [0,1) */
+  if((ix&0x1) != 0) m+=1.0f;    /* odd integer part: shift into [1,2) */
+  ix = __gen_ocl_internal_floor(m*4.0f);   /* eighth of the period, 0..7 */
+
+  switch(ix) {
+   case 0:
+    return __kernel_cosf(m*M_PI_F, 0.0f);
+   case 1:
+   case 2:
+    return __kernel_sinf((0.5f-m)*M_PI_F, 0.0f, 0);
+   case 3:
+   case 4:
+    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
+   case 5:
+   case 6:
+    return __kernel_sinf((m-1.5f)*M_PI_F, 0.0f, 0);
+   default:
+    return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
+   }
+}
+
+/*
+ * Computes sin(pi * x).
+ *
+ * Inf and NaN return NAN.  The sign is taken from the sign bit of x, so that
+ * -0.0f counts as negative, and is re-applied to the result.  Integer
+ * arguments (including everything above 2^24) return a zero carrying the
+ * sign of x: sinpi(+-0) = +-0 and sinpi(+-n) = +-0.  Otherwise |x| is reduced
+ * to m in (0,2) using the parity of floor(|x|), and the eighth floor(4*m)
+ * selects which kernel is evaluated on an argument whose magnitude is at
+ * most pi/4.
+ */
+OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
+  float sign = 1.0f;
+  int ix;
+  /* NaN is rejected up front so it never reaches the float->int conversion */
+  if(isinf(x) || isnan(x)) return NAN;
+  /* Test the sign bit instead of `x < 0.0f`: the comparison is false for
+   * -0.0f and would lose the sign of a negative zero. */
+  GEN_OCL_GET_FLOAT_WORD(ix, x);
+  if(ix < 0) { x = -x; sign = -1.0f; }
+  if(x> 0x1.0p24) return sign*0.0f;
+  float m = __gen_ocl_internal_floor(x);
+  /* Exact integers: return a signed zero directly.  Going through the
+   * kernels would give odd integers a zero of the opposite sign. */
+  if(m == x) return sign*0.0f;
+  ix = (int)m;
+  m = x-m;                      /* fractional part in (0,1) */
+  if((ix&0x1) != 0) m+=1.0f;    /* odd integer part: shift into (1,2) */
+  ix = __gen_ocl_internal_floor(m*4.0f);   /* eighth of the period, 0..7 */
+
+  switch(ix) {
+   case 0:
+    return sign*__kernel_sinf(m*M_PI_F, 0.0f, 0);
+   case 1:
+   case 2:
+    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
+   case 3:
+   case 4:
+    return -sign*__kernel_sinf((m-1.0f)*M_PI_F, 0.0f, 0);
+   case 5:
+   case 6:
+    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
+   default:
+    return -sign*__kernel_sinf((2.0f-m)*M_PI_F, 0.0f, 0);
+   }
+
+}
+
+/*
+ * Logarithm of |Gamma(x)| for float, ported from the Sun fdlibm
+ * implementation; all logarithms go through native_log.
+ *
+ * Paths, selected on the bit pattern of |x| (ix):
+ *   - Inf/NaN            : x*x
+ *   - +-0                : (x+1)/0, i.e. +Inf
+ *   - |x| < 2^-70        : -log(|x|)
+ *   - x < 0              : +Inf for |x| >= 2^23 or when sinpi(x) == 0,
+ *                          otherwise nadj = log(pi/|x*sinpi(x)|) is recorded
+ *                          and the result becomes nadj - lgamma(-x)
+ *   - |x| == 1 or 2      : 0
+ *   - |x| < 2            : one of three polynomial/rational forms (a*, t*,
+ *                          u*/v* coefficients), plus -log(x) when x <= 0.9
+ *   - 2 <= |x| < 8       : rational s*/r* form plus log of a product
+ *   - 8 <= |x| < 2^58    : asymptotic series with w* coefficients
+ *   - otherwise          : x*(log(x)-1)
+ */
+OVERLOADABLE float lgamma(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+    const float
+        zero=  0.,
+        one =  1.0000000000e+00,
+        pi  =  3.1415927410e+00,
+        a0  =  7.7215664089e-02,
+        a1  =  3.2246702909e-01,
+        a2  =  6.7352302372e-02,
+        a3  =  2.0580807701e-02,
+        a4  =  7.3855509982e-03,
+        a5  =  2.8905137442e-03,
+        a6  =  1.1927076848e-03,
+        a7  =  5.1006977446e-04,
+        a8  =  2.2086278477e-04,
+        a9  =  1.0801156895e-04,
+        a10 =  2.5214456400e-05,
+        a11 =  4.4864096708e-05,
+        tc  =  1.4616321325e+00,
+        tf  = -1.2148628384e-01,
+        tt  =  6.6971006518e-09,
+        t0  =  4.8383611441e-01,
+        t1  = -1.4758771658e-01,
+        t2  =  6.4624942839e-02,
+        t3  = -3.2788541168e-02,
+        t4  =  1.7970675603e-02,
+        t5  = -1.0314224288e-02,
+        t6  =  6.1005386524e-03,
+        t7  = -3.6845202558e-03,
+        t8  =  2.2596477065e-03,
+        t9  = -1.4034647029e-03,
+        t10 =  8.8108185446e-04,
+        t11 = -5.3859531181e-04,
+        t12 =  3.1563205994e-04,
+        t13 = -3.1275415677e-04,
+        t14 =  3.3552918467e-04,
+        u0  = -7.7215664089e-02,
+        u1  =  6.3282704353e-01,
+        u2  =  1.4549225569e+00,
+        u3  =  9.7771751881e-01,
+        u4  =  2.2896373272e-01,
+        u5  =  1.3381091878e-02,
+        v1  =  2.4559779167e+00,
+        v2  =  2.1284897327e+00,
+        v3  =  7.6928514242e-01,
+        v4  =  1.0422264785e-01,
+        v5  =  3.2170924824e-03,
+        s0  = -7.7215664089e-02,
+        s1  =  2.1498242021e-01,
+        s2  =  3.2577878237e-01,
+        s3  =  1.4635047317e-01,
+        s4  =  2.6642270386e-02,
+        s5  =  1.8402845599e-03,
+        s6  =  3.1947532989e-05,
+        r1  =  1.3920053244e+00,
+        r2  =  7.2193557024e-01,
+        r3  =  1.7193385959e-01,
+        r4  =  1.8645919859e-02,
+        r5  =  7.7794247773e-04,
+        r6  =  7.3266842264e-06,
+        w0  =  4.1893854737e-01,
+        w1  =  8.3333335817e-02,
+        w2  = -2.7777778450e-03,
+        w3  =  7.9365057172e-04,
+        w4  = -5.9518753551e-04,
+        w5  =  8.3633989561e-04,
+        w6  = -1.6309292987e-03;
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;
+	int i, hx, ix;
+	nadj = 0;
+	/* NOTE(review): reads the bits of x through a pointer cast; other
+	 * functions in this file use GEN_OCL_GET_FLOAT_WORD or a union. */
+	hx = *(int *)&x;
+	ix = hx & 0x7fffffff;	/* bit pattern of |x| */
+	if (ix >= 0x7f800000)	/* Inf or NaN */
+		return x * x;
+	if (ix == 0)		/* +-0 */
+		return ((x + one) / zero);
+	if (ix < 0x1c800000) {	/* |x| < 2^-70 */
+		if (hx < 0) {
+			return -native_log(-x);
+		} else
+			return -native_log(x);
+	}
+	if (hx < 0) {
+		if (ix >= 0x4b000000)	/* |x| >= 2^23 */
+			return ((-x) / zero);
+		t = __gen_ocl_internal_sinpi(x);
+		if (t == zero)
+			return ((-x) / zero);
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));
+		x = -x;		/* continue with |x|; combined with nadj at the end */
+	}
+	if (ix == 0x3f800000 || ix == 0x40000000)	/* |x| is 1 or 2 */
+		r = 0;
+	else if (ix < 0x40000000) {	/* |x| < 2 */
+		if (ix <= 0x3f666666) {	/* |x| <= 0.9 */
+			r = -native_log(x);
+			if (ix >= 0x3f3b4a20) {
+				y = one - x;
+				i = 0;
+			} else if (ix >= 0x3e6d3308) {
+				y = x - (tc - one);
+				i = 1;
+			} else {
+				y = x;
+				i = 2;
+			}
+		} else {
+			r = zero;
+			if (ix >= 0x3fdda618) {
+				y = (float) 2.0 - x;
+				i = 0;
+			}
+			else if (ix >= 0x3F9da620) {
+				y = x - tc;
+				i = 1;
+			}
+			else {
+				y = x - one;
+				i = 2;
+			}
+		}
+		/* i selects the approximation form chosen above */
+		switch (i) {
+		case 0:
+			z = y * y;
+			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));
+			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));
+			p = y * p1 + p2;
+			r += (p - (float) 0.5 * y);
+			break;
+		case 1:
+			z = y * y;
+			w = z * y;
+			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));
+			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));
+			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));
+			p = z * p1 - (tt - w * (p2 + y * p3));
+			r += (tf + p);
+			break;
+		case 2:
+			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));
+			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));
+			r += (-(float) 0.5 * y + p1 / p2);
+		}
+	} else if (ix < 0x41000000) {	/* 2 <= |x| < 8 */
+		i = (int) x;
+		t = zero;	/* NOTE(review): this value of t is never read */
+		y = x - (float) i;
+		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));
+		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));
+		r = .5f * y + p / q;
+		z = one;
+		/* the cases fall through on purpose: z accumulates
+		 * (y+2)(y+3)...(y+i-1); for i == 2 nothing is added */
+		switch (i) {
+		case 7:
+			z *= (y + (float) 6.0);
+		case 6:
+			z *= (y + (float) 5.0);
+		case 5:
+			z *= (y + (float) 4.0);
+		case 4:
+			z *= (y + (float) 3.0);
+		case 3:
+			z *= (y + (float) 2.0);
+			r += native_log(z);
+			break;
+		}
+
+	} else if (ix < 0x5c800000) {	/* 8 <= |x| < 2^58 */
+		t = native_log(x);
+		z = one / x;
+		y = z * z;
+		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));
+		r = (x - .5f) * (t - one) + w;
+	} else
+		r = x * (native_log(x) - one);
+	if (hx < 0)	/* original argument was negative */
+		r = nadj - r;
+	return r;
+}
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+/*
+ * BODY is the shared function body of the three lgamma_r overloads defined
+ * right after it; the overloads differ only in the address space of signgamp
+ * (global, local, private).  The macro expects `x` and `signgamp` to be in
+ * scope and is undefined again after the overloads.
+ *
+ * The computation is the same as in lgamma(); in addition *signgamp is set
+ * to 1 on entry and changed to -1 when x is a tiny negative value
+ * (|x| < 2^-70) or when x is negative and sinpi(x) is negative.
+ */
+#define BODY \
+    const float  \
+        zero=  0.,  \
+        one =  1.0000000000e+00,  \
+        pi  =  3.1415927410e+00,  \
+        a0  =  7.7215664089e-02,  \
+        a1  =  3.2246702909e-01,  \
+        a2  =  6.7352302372e-02,  \
+        a3  =  2.0580807701e-02,  \
+        a4  =  7.3855509982e-03,  \
+        a5  =  2.8905137442e-03,  \
+        a6  =  1.1927076848e-03,  \
+        a7  =  5.1006977446e-04,  \
+        a8  =  2.2086278477e-04,  \
+        a9  =  1.0801156895e-04,  \
+        a10 =  2.5214456400e-05,  \
+        a11 =  4.4864096708e-05,  \
+        tc  =  1.4616321325e+00,  \
+        tf  = -1.2148628384e-01,  \
+        tt  =  6.6971006518e-09,  \
+        t0  =  4.8383611441e-01,  \
+        t1  = -1.4758771658e-01,  \
+        t2  =  6.4624942839e-02,  \
+        t3  = -3.2788541168e-02,  \
+        t4  =  1.7970675603e-02,  \
+        t5  = -1.0314224288e-02,  \
+        t6  =  6.1005386524e-03,  \
+        t7  = -3.6845202558e-03,  \
+        t8  =  2.2596477065e-03,  \
+        t9  = -1.4034647029e-03,  \
+        t10 =  8.8108185446e-04,  \
+        t11 = -5.3859531181e-04,  \
+        t12 =  3.1563205994e-04,  \
+        t13 = -3.1275415677e-04,  \
+        t14 =  3.3552918467e-04,  \
+        u0  = -7.7215664089e-02,  \
+        u1  =  6.3282704353e-01,  \
+        u2  =  1.4549225569e+00,  \
+        u3  =  9.7771751881e-01,  \
+        u4  =  2.2896373272e-01,  \
+        u5  =  1.3381091878e-02,  \
+        v1  =  2.4559779167e+00,  \
+        v2  =  2.1284897327e+00,  \
+        v3  =  7.6928514242e-01,  \
+        v4  =  1.0422264785e-01,  \
+        v5  =  3.2170924824e-03,  \
+        s0  = -7.7215664089e-02,  \
+        s1  =  2.1498242021e-01,  \
+        s2  =  3.2577878237e-01,  \
+        s3  =  1.4635047317e-01,  \
+        s4  =  2.6642270386e-02,  \
+        s5  =  1.8402845599e-03,  \
+        s6  =  3.1947532989e-05,  \
+        r1  =  1.3920053244e+00,  \
+        r2  =  7.2193557024e-01,  \
+        r3  =  1.7193385959e-01,  \
+        r4  =  1.8645919859e-02,  \
+        r5  =  7.7794247773e-04,  \
+        r6  =  7.3266842264e-06,  \
+        w0  =  4.1893854737e-01,  \
+        w1  =  8.3333335817e-02,  \
+        w2  = -2.7777778450e-03,  \
+        w3  =  7.9365057172e-04,  \
+        w4  = -5.9518753551e-04,  \
+        w5  =  8.3633989561e-04,  \
+        w6  = -1.6309292987e-03;  \
+	float t, y, z, nadj, p, p1, p2, p3, q, r, w;  \
+	int i, hx, ix;  \
+	nadj = 0;  \
+	hx = *(int *)&x;  \
+	*signgamp = 1;  /* default sign; overridden below for negative cases */  \
+	ix = hx & 0x7fffffff;  \
+	if (ix >= 0x7f800000)  \
+		return x * x;  \
+	if (ix == 0)  \
+		return ((x + one) / zero);  \
+	if (ix < 0x1c800000) {  \
+		if (hx < 0) {  \
+			*signgamp = -1;  \
+			return -native_log(-x);  \
+		} else  \
+			return -native_log(x);  \
+	}  \
+	if (hx < 0) {  \
+		if (ix >= 0x4b000000)  \
+			return ((-x) / zero);  \
+		t = __gen_ocl_internal_sinpi(x);  \
+		if (t == zero)  \
+			return ((-x) / zero);  \
+		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
+		if (t < zero)  \
+			*signgamp = -1;  \
+		x = -x;  \
+	}  \
+	if (ix == 0x3f800000 || ix == 0x40000000)  \
+		r = 0;  \
+	else if (ix < 0x40000000) {  \
+		if (ix <= 0x3f666666) {  \
+			r = -native_log(x);  \
+			if (ix >= 0x3f3b4a20) {  \
+				y = one - x;  \
+				i = 0;  \
+			} else if (ix >= 0x3e6d3308) {  \
+				y = x - (tc - one);  \
+				i = 1;  \
+			} else {  \
+				y = x;  \
+				i = 2;  \
+			}  \
+		} else {  \
+			r = zero;  \
+			if (ix >= 0x3fdda618) {  \
+				y = (float) 2.0 - x;  \
+				i = 0;  \
+			}  \
+			else if (ix >= 0x3F9da620) {  \
+				y = x - tc;  \
+				i = 1;  \
+			}  \
+			else {  \
+				y = x - one;  \
+				i = 2;  \
+			}  \
+		}  \
+		switch (i) {  \
+		case 0:  \
+			z = y * y;  \
+			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));  \
+			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));  \
+			p = y * p1 + p2;  \
+			r += (p - (float) 0.5 * y);  \
+			break;  \
+		case 1:  \
+			z = y * y;  \
+			w = z * y;  \
+			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));  \
+			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));  \
+			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));  \
+			p = z * p1 - (tt - w * (p2 + y * p3));  \
+			r += (tf + p);  \
+			break;  \
+		case 2:  \
+			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));  \
+			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));  \
+			r += (-(float) 0.5 * y + p1 / p2);  \
+		}  \
+	} else if (ix < 0x41000000) {  \
+		i = (int) x;  \
+		t = zero;  \
+		y = x - (float) i;  \
+		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));  \
+		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));  \
+		r = .5f * y + p / q;  \
+		z = one;  \
+		switch (i) {  /* cases fall through on purpose */  \
+		case 7:  \
+			z *= (y + (float) 6.0);  \
+		case 6:  \
+			z *= (y + (float) 5.0);  \
+		case 5:  \
+			z *= (y + (float) 4.0);  \
+		case 4:  \
+			z *= (y + (float) 3.0);  \
+		case 3:  \
+			z *= (y + (float) 2.0);  \
+			r += native_log(z);  \
+			break;  \
+		}  \
+		  \
+	} else if (ix < 0x5c800000) {  \
+		t = native_log(x);  \
+		z = one / x;  \
+		y = z * z;  \
+		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));  \
+		r = (x - .5f) * (t - one) + w;  \
+	} else  \
+		r = x * (native_log(x) - one);  \
+	if (hx < 0)  \
+		r = nadj - r;  \
+	return r;
+OVERLOADABLE float lgamma_r(float x, global int *signgamp) { BODY; }
+OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; }
+OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; }
+#undef BODY
+
+/*
+ * log1p(x) = log(1+x) for float (fdlibm algorithm).
+ *
+ * When __ocl_math_fastpath_flag is set the fast-path implementation is used.
+ * Special cases handled here: x == -1 returns -Inf, x < -1 returns NaN,
+ * very small |x| returns x or x - x*x/2, and +Inf/NaN return x+x.
+ * Otherwise 1+x is split as 2^k * (1+f); the result is k*ln2 (as
+ * ln2_hi + ln2_lo) plus a polynomial in s = f/(2+f), with a correction term
+ * c for the rounding error made when forming 1+x.
+ */
+OVERLOADABLE float log1p(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_log1p(x);
+/*
+ *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  const float
+  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
+  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
+  two25 =    3.355443200e+07, /* 0x4c000000 */
+  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
+  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
+  Lp3 = 2.8571429849e-01, /* 3E924925 */
+  Lp4 = 2.2222198546e-01, /* 3E638E29 */
+  Lp5 = 1.8183572590e-01, /* 3E3A3325 */
+  Lp6 = 1.5313838422e-01, /* 3E1CD04F */
+  Lp7 = 1.4798198640e-01; /* 3E178897 */
+  const float zero = 0.0;
+  float hfsq,f,c,s,z,R,u;
+  int k,hx,hu,ax;
+  union {float f; unsigned i;} un;
+  un.f = x;  hx = un.i;   /* raw bits of x, interpreted as a signed int */
+  ax = hx&0x7fffffff;     /* bits of |x| */
+
+  k = 1;
+  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
+      if(ax>=0x3f800000) {    /* x <= -1.0 */
+    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=-inf */
+    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
+      }
+      if(ax<0x31000000) {     /* |x| < 2**-29 */
+    if(two25+x>zero     /* raise inexact */
+              &&ax<0x24800000)    /* |x| < 2**-54 */
+        return x;
+    else
+        return x - x*x*(float)0.5;
+      }
+      if(hx>0||hx<=((int)0xbe95f61f)) {
+    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
+  }
+  if (hx >= 0x7f800000) return x+x;
+  if(k!=0) {
+      if(hx<0x5a000000) {
+    u  = (float)1.0+x;
+
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    /* correction term */
+          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
+    c /= u;
+      } else {
+    /* x is large enough that u = x is used in place of 1+x */
+    u  = x;
+    un.f = u; hu = un.i;
+          k  = (hu>>23)-127;
+    c  = 0;
+      }
+      hu &= 0x007fffff;     /* keep the mantissa bits only */
+      if(hu<0x3504f7) {
+          un.i = hu|0x3f800000; u = un.f;/* normalize u */
+      } else {
+          k += 1;
+          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
+          hu = (0x00800000-hu)>>2;
+      }
+      f = u-(float)1.0;
+  }
+  hfsq=(float)0.5*f*f;
+  if(hu==0) { /* |f| < 2**-20 */
+      if(f==zero) { if(k==0) return zero;
+      else {c += k*ln2_lo; return k*ln2_hi+c;} }
+      R = hfsq*((float)1.0-(float)0.66666666666666666*f);
+      if(k==0) return f-R; else
+             return k*ln2_hi-((R-(k*ln2_lo+c))-f);
+  }
+  s = f/((float)2.0+f);
+  z = s*s;
+  R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
+  if(k==0) return f-(hfsq-s*(hfsq+R)); else
+     return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
+
+}
+/* logb(x): the unbiased binary exponent of x, returned as a float.
+ * Inputs whose exponent field is zero (+/-0 and subnormals) give -INFINITY;
+ * inputs whose exponent field is all ones (Inf, NaN) give x*x. */
+OVERLOADABLE float logb(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_logb(x);
+
+  union {float f; unsigned i;} bits;
+  bits.f = x;
+  /* biased exponent field (bits 23..30) */
+  const int biased = ((bits.i & 0x7f800000) >> 23);
+
+  if (biased == 0)
+    return -INFINITY;           /* sub normal or +/-0 */
+  if (biased == 0xff)
+    return x*x;                 /* inf & nan */
+  return (float)(biased-127);   /* remove the exponent bias */
+}
+#define FP_ILOGB0 (-0x7FFFFFFF-1)
+#define FP_ILOGBNAN FP_ILOGB0
+/* ilogb(x): the unbiased binary exponent of x as an int.
+ * NaN -> FP_ILOGBNAN, +/-Inf -> 0x7FFFFFFF, +/-0 -> FP_ILOGB0.
+ * Subnormal inputs are normalised by hand so that their true exponent
+ * (below -126) is returned. */
+OVERLOADABLE int ilogb(float x) {
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_ilogb(x);
+
+  if (isnan(x))
+    return FP_ILOGBNAN;
+  if (isinf(x))
+    return 0x7FFFFFFF;
+
+  union { int i; float f; } bits;
+  bits.f = x;
+  bits.i &= 0x7fffffff;               /* drop the sign bit */
+  const int magnitude = bits.i;
+  if (magnitude == 0)
+    return FP_ILOGB0;
+  if (magnitude >= 0x800000)          /* normal number */
+    return (magnitude >> 23) - 127;
+
+  /* subnormal: shift the fraction up until it reaches the implicit-one
+   * position, lowering the exponent once per shift */
+  int exponent = -126;
+  int fraction;
+  for (fraction = magnitude & 0x7FFFFF; fraction < 0x800000; fraction <<= 1)
+    exponent--;
+  return exponent;
+}
+/* nan(code): returns the NAN constant; the `code` argument is not used. */
+OVERLOADABLE float nan(uint code) {
+  const float result = NAN;
+  return result;
+}
+/* tanpi(x): tan(pi*x), evaluated by scaling x and calling native_tan. */
+OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
+  const float scaled = x * M_PI_F;
+  return native_tan(scaled);
+}
+/* Cube root of a float, adapted from fdlibm's cbrtf.
+ *
+ * NaN and +-Inf return x+x; +-0 returns x unchanged.  Subnormal inputs are
+ * not scaled up as in fdlibm: they return a zero that carries the sign of x.
+ * Every other input gets a bit-level initial estimate that is refined by a
+ * rational correction and one Newton step, after which the sign of x is
+ * restored on the result.
+ */
+OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
+  /* copied from fdlibm */
+  const unsigned
+  B1 = 709958130; /* B1 = (84+2/3-0.03306235651)*2**23 */
+
+  const float
+  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
+  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
+  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
+  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
+  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
+
+  float r,s,t, w;
+  int hx;
+  uint sign;
+  uint high;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  sign=hx&0x80000000;     /* sign= sign(x) */
+  hx  ^=sign;             /* hx = bit pattern of |x| */
+  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
+  if(hx==0)
+      return(x);    /* cbrt(0) is itself */
+
+  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
+
+  if(hx<0x00800000) {
+    /* Subnormal input: return a zero with the sign of x.  This has to be a
+     * comparison: the earlier `sign = 0` assignment cleared sign and always
+     * selected -0.0f, even for positive inputs. */
+    return (sign == 0) ? 0.0f : -0.0f;
+  }
+
+    /* rough cbrt to 5 bits */
+  GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
+
+    /* new cbrt to 23 bits */
+  r=t*t/x;
+  s=C+r*t;
+  t*=G+F/(s+E+D/s);
+    /* one Newton iteration step (the fdlibm original states an error below
+     * 0.667 ulps for this step) */
+  s=t*t;    /* t*t is exact */
+  r=x/s;
+  w=t+t;
+  r=(r-t)/(w+r);  /* r-s is exact */
+  t=t+t*r;
+
+    /* restore the sign bit */
+  GEN_OCL_GET_FLOAT_WORD(high,t);
+  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
+  return(t);
+}
+
+/* sincos(x, cosval): returns sin(x) and stores cos(x) through cosval.
+ * One overload is generated per pointer address space (global, local,
+ * private); each one defers to the fast-path implementation when
+ * __ocl_math_fastpath_flag is set. */
+#define DEF_SINCOS(ADDR_SPACE) \
+OVERLOADABLE float sincos(float x, ADDR_SPACE float *cosval) { \
+  if (__ocl_math_fastpath_flag) \
+    return __gen_ocl_internal_fastpath_sincos(x, cosval); \
+  *cosval = cos(x); \
+  return sin(x); \
+}
+
+DEF_SINCOS(global)
+DEF_SINCOS(local)
+DEF_SINCOS(private)
+#undef DEF_SINCOS
+
+INLINE float __gen_ocl_asin_util(float x) {
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+  /* Core of asin: returns x + x*P(x*x)/Q(x*x), where P and Q are the
+   * polynomials built from the pS* and qS* coefficients below. */
+
+  /* numerator coefficients */
+  float
+  pS0 =  1.66666666666666657415e-01,
+  pS1 = -3.25565818622400915405e-01,
+  pS2 =  2.01212532134862925881e-01,
+  pS3 = -4.00555345006794114027e-02,
+  pS4 =  7.91534994289814532176e-04,
+  pS5 =  3.47933107596021167570e-05;
+  /* denominator coefficients */
+  float
+  qS1 = -2.40339491173441421878e+00,
+  qS2 =  2.02094576023350569471e+00,
+  qS3 = -6.88283971605453293030e-01,
+  qS4 =  7.70381505559019352791e-02;
+
+  float xx = x*x;
+  float numer = xx*(pS0+xx*(pS1+xx*(pS2+xx*(pS3+xx*(pS4+xx*pS5)))));
+  float denom = 1.0+xx*(qS1+xx*(qS2+xx*(qS3+xx*qS4)));
+  float ratio = numer / denom;
+  return x + x*ratio;
+}
+
+/* asin(x).  |x| == 1 gives +-pi/2, |x| > 1 (and NaN) gives NAN, very small
+ * |x| returns x itself.  For |x| > 0.5 the argument is folded through
+ * sqrt((1 -+ x)/2) before the rational core __gen_ocl_asin_util is applied. */
+OVERLOADABLE float __gen_ocl_internal_asin(float x) {
+  union { uint i; float f; } bits;
+  bits.f = x;
+  const uint ax = bits.i & 0x7fffffff;   /* bit pattern of |x| */
+
+  if (ax == 0x3f800000)
+    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
+  if (ax > 0x3f800000)    /* |x| > 1, or x is NaN */
+    return  NAN;
+
+  /* |x| < 2**-27: return x with inexact if x!=0 */
+  if (ax < 0x32000000 && HUGE_VALF + x > FLT_ONE)
+    return x;
+
+  if (x < -0.5)
+    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
+  if (x > 0.5)
+    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
+  return __gen_ocl_asin_util(x);
+}
+/* asinpi(x): asin(x) divided by pi. */
+OVERLOADABLE float __gen_ocl_internal_asinpi(float x) {
+  const float radians = __gen_ocl_internal_asin(x);
+  return radians / M_PI_F;
+}
+/* acos(x): for x > 0.5 evaluates 2*asin_util(sqrt((1-x)/2)) directly;
+ * every other input (including NaN) goes through pi/2 - asin(x). */
+OVERLOADABLE float __gen_ocl_internal_acos(float x) {
+  return (x > 0.5)
+    ? 2 * __gen_ocl_asin_util(native_sqrt((1-x)/2))
+    : M_PI_2_F - __gen_ocl_internal_asin(x);
+}
+/* acospi(x): acos(x) divided by pi. */
+OVERLOADABLE float __gen_ocl_internal_acospi(float x) {
+  const float radians = __gen_ocl_internal_acos(x);
+  return radians / M_PI_F;
+}
+/* Leading parts of atan at the four reduction breakpoints (0.5, 1.0, 1.5
+ * and infinity).  __gen_ocl_internal_atan indexes this table, together with
+ * atanlo, by the interval id chosen during argument reduction. */
+__constant float atanhi[4] = {
+  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
+  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
+  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
+  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
+};
+/* Trailing (low-order) parts matching atanhi entry for entry. */
+__constant float atanlo[4] = {
+  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
+  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
+  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
+  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
+};
+
+/* atan(x), adapted from fdlibm's atanf.
+ *
+ * NaN returns x+x; |x| >= 2^34 returns +-(atanhi[3]+atanlo[3]).  Otherwise
+ * |x| is classified into one of five intervals: id = -1 means no reduction,
+ * id = 0..3 means x was mapped to a small argument relative to the
+ * breakpoint whose atan value is stored in atanhi[id]/atanlo[id].  An
+ * 11-term polynomial in x*x (coefficients aT) is then evaluated and the
+ * table value and sign are added back. */
+OVERLOADABLE float __gen_ocl_internal_atan(float x) {
+  /* copied from fdlibm */
+  float aT[11];
+  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
+  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
+  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
+  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
+  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
+  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
+  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
+  aT[7] =  -5.8335702866e-02; /* 0xbd6ef16b */
+  aT[8] =   4.9768779427e-02; /* 0x3d4bda59 */
+  aT[9] =  -3.6531571299e-02; /* 0xbd15a221 */
+  aT[10] =   1.6285819933e-02; /* 0x3c8569d7 */
+  const float one = 1.0, huge = 1.0e30;
+
+  float w,s1,s2,z;
+  int ix,hx,id;
+
+  /* hx = raw bits of x (sign included), ix = bits of |x| */
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
+      if(ix>0x7f800000)
+    return x+x;   /* NaN */
+      if(hx>0) return  atanhi[3]+atanlo[3];
+      else     return -atanhi[3]-atanlo[3];
+  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
+      if (ix < 0x31000000) {  /* |x| < 2^-29 */
+    if(huge+x>one) return x;  /* raise inexact */
+      }
+      id = -1;  /* no reduction: the polynomial is applied to x directly */
+  } else {
+  /* reduce |x|; the sign is reapplied from hx at the end */
+  x = __gen_ocl_fabs(x);
+  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
+      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
+    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
+      } else {      /* 11/16<=|x|< 19/16 */
+    id = 1; x  = (x-one)/(x+one);
+      }
+  } else {
+      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
+    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
+      } else {      /* 2.4375 <= |x| < 2^34 */
+    id = 3; x  = -(float)1.0/x;
+      }
+  }}
+    /* end of argument reduction */
+  z = x*x;
+  w = z*z;
+    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
+  s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
+  s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
+  if (id<0) return x - x*(s1+s2);
+  else {
+      /* add the tabulated breakpoint value, then restore the input sign */
+      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
+      return (hx<0)? -z:z;
+  }
+
+}
+/* atanpi(x): atan(x) divided by pi. */
+OVERLOADABLE float __gen_ocl_internal_atanpi(float x) {
+  const float radians = __gen_ocl_internal_atan(x);
+  return radians / M_PI_F;
+}
+
+// XXX work-around PTX profile
+/* sqrt(x): forwards to native_sqrt. */
+OVERLOADABLE float sqrt(float x) {
+  return native_sqrt(x);
+}
+/* rsqrt(x): forwards to native_rsqrt. */
+OVERLOADABLE float rsqrt(float x) {
+  return native_rsqrt(x);
+}
+/* atan2(y, x), adapted from fdlibm's atan2f.
+ *
+ * Works on the raw bit patterns: hx/hy are the words of x/y, ix/iy the same
+ * words with the sign bit cleared.  m encodes the two signs as
+ * 2*sign(x)+sign(y) and selects the result branch.  Special cases (NaN,
+ * x == 1, zero and infinite operands) are answered before the general path,
+ * which evaluates atan(|y/x|) and then adjusts the result according to m. */
+OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) {
+  /* copied from fdlibm */
+  float z;
+  int k,m,hx,hy,ix,iy;
+  const float
+  tiny  = 1.0e-30,
+  zero  = 0.0,
+  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
+  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
+  pi      = 3.1415927410e+00, /* 0x40490fdb */
+  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  ix = hx&0x7fffffff;
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  iy = hy&0x7fffffff;
+
+  if((ix>0x7f800000)||
+     (iy>0x7f800000)) /* x or y is NaN */
+     return x+y;
+  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
+  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
+
+    /* when y = 0 */
+  if(iy==0) {
+      switch(m) {
+    case 0:
+    case 1: return y;   /* atan(+-0,+anything)=+-0 */
+    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
+    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
+      }
+  }
+    /* when x = 0 */
+  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
+
+  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
+  /* The integer mantissas are converted to float and multiplied by +1 or -1
+   * (1.0f minus 0 or 2, taken from the sign bit); ix and iy themselves are
+   * left unchanged. */
+  if(ix <= 0x7fffff && iy <= 0x7fffff) {
+    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2));
+    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
+  }
+
+    /* when x is INF */
+  if(ix==0x7f800000) {
+      if(iy==0x7f800000) {
+    switch(m) {
+        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
+        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
+        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
+        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
+    }
+      } else {
+    switch(m) {
+        case 0: return  zero  ; /* atan(+...,+INF) */
+        case 1: return -zero  ; /* atan(-...,+INF) */
+        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
+        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
+    }
+      }
+  }
+    /* when y is INF */
+  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
+
+    /* compute y/x */
+  /* k is the difference of the two biased exponents */
+  k = (iy-ix)>>23;
+  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
+  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
+  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
+  switch (m) {
+      case 0: return       z  ; /* atan(+,+) */
+      case 1: {
+              /* negate z by flipping its sign bit */
+              uint zh;
+          GEN_OCL_GET_FLOAT_WORD(zh,z);
+          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000);
+        }
+        return       z  ; /* atan(-,+) */
+      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
+      default: /* case 3 */
+            return  (z-pi_lo)-pi;/* atan(-,-) */
+  }
+}
+
+/* atan2pi(y, x): atan2(y, x) / pi.
+ * Zero and infinite operands are answered from a table of exact results
+ * (0, +-0.25, +-0.5, +-0.75, +-1) by inspecting the raw bit patterns; any
+ * input that matches none of the cases falls through to
+ * __gen_ocl_internal_atan2(y, x) / M_PI_F. */
+OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) {
+  const uint x_bits = as_uint(x);
+  const uint y_bits = as_uint(y);
+  const uint plus_zero = 0;
+  const uint minus_zero = 0x80000000u;
+  const uint plus_inf = 0x7f800000;
+  const uint minus_inf = 0xff800000u;
+
+  /* y is +0 */
+  if (y_bits == plus_zero) {
+    if (x_bits == plus_zero || x > 0)
+      return 0;
+    if (x_bits == minus_zero || x < 0)
+      return 1;
+  }
+  /* y is -0 */
+  if (y_bits == minus_zero) {
+    if (x_bits == plus_zero || x > 0)
+      return -0.f;
+    if (x_bits == minus_zero || x < 0)
+      return -1;
+  }
+  /* x is +-0 and y is non-zero */
+  if ((x_bits & 0x7fffffff) == 0) {
+    if (y < 0)
+      return -.5f;
+    if (y > 0)
+      return .5f;
+  }
+  /* x is +inf and y is finite */
+  if (x_bits == plus_inf) {
+    if (y > 0 && y_bits != plus_inf)
+      return 0;
+    if (y < 0 && y_bits != minus_inf)
+      return -0.f;
+  }
+  /* x is -inf and y is finite */
+  if (x_bits == minus_inf) {
+    if (y > 0 && y_bits != plus_inf)
+      return 1;
+    if (y < 0 && y_bits != minus_inf)
+      return -1;
+  }
+  /* y is +inf; the last test is false only when x is NaN */
+  if (y_bits == plus_inf) {
+    if (x_bits == plus_inf)
+      return 0.25f;
+    if (x_bits == minus_inf)
+      return 0.75f;
+    if (x >= 0 || x <= 0)
+      return 0.5f;
+  }
+  /* y is -inf; mirror image of the case above */
+  if (y_bits == minus_inf) {
+    if (x_bits == plus_inf)
+      return -0.25f;
+    if (x_bits == minus_inf)
+      return -0.75f;
+    if (x >= 0 || x <= 0)
+      return -0.5f;
+  }
+
+  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
+}
+/* fabs(x): forwards to __gen_ocl_fabs. */
+OVERLOADABLE float __gen_ocl_internal_fabs(float x) {
+  return __gen_ocl_fabs(x);
+}
+/* trunc(x): forwards to __gen_ocl_rndz. */
+OVERLOADABLE float __gen_ocl_internal_trunc(float x) {
+  return __gen_ocl_rndz(x);
+}
+/* round(x): takes __gen_ocl_rndz(x) and, when the discarded part has a
+ * magnitude of at least 0.5, steps one unit further in the direction of
+ * x's sign. */
+OVERLOADABLE float __gen_ocl_internal_round(float x) {
+  const float truncated = __gen_ocl_rndz(x);
+  const float dropped = __gen_ocl_fabs(x - truncated);
+  return (dropped >= 0.5f)
+    ? truncated + __gen_ocl_internal_copysign(1.f, x)
+    : truncated;
+}
+/* ceil(x): forwards to __gen_ocl_rndu. */
+OVERLOADABLE float __gen_ocl_internal_ceil(float x) {
+  return __gen_ocl_rndu(x);
+}
+/* powr(x, y): forwards to __gen_ocl_pow. */
+OVERLOADABLE float powr(float x, float y) {
+  return __gen_ocl_pow(x,y);
+}
+/* rint(x): forwards to __gen_ocl_rnde. */
+OVERLOADABLE float __gen_ocl_internal_rint(float x) {
+  const float rounded = __gen_ocl_rnde(x);
+  return rounded;
+}
+
+/* exp(x) built around native_exp.
+ *
+ * Inputs inside (-0x1.6p1, 0x1.6p1) and inputs with |x| >= 88.72... are
+ * handed straight to native_exp.  Anything else is reduced to
+ * x = k*ln2 + r, native_exp is applied to r, and k is added to the exponent
+ * field of the result.
+ *
+ * o_threshold and u_threshold are declared but not referenced below. */
+OVERLOADABLE float __gen_ocl_internal_exp(float x) {
+  //use native instruction when it has enough precision
+  if (x > -0x1.6p1 && x < 0x1.6p1)
+  {
+    return native_exp(x);
+  }
+
+  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
+  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
+  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
+  ivln2	 =	1.4426950216e+00; /* 0x3fb8aa3b =1/ln2 */
+  float y,hi=0.0,lo=0.0,t;
+  int k=0,xsb;
+  unsigned hx;
+  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
+  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
+  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
+  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
+  float half_0 = 0.5;
+  float half_1 =	-0.5;
+
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = (hx>>31)&1;		/* sign bit of x */
+  hx &= 0x7fffffff;		/* high word of |x| */
+
+  /* filter out non-finite argument */
+  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
+    // native_exp already handled this
+    return native_exp(x);
+  }
+
+  /* argument reduction */
+  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
+      /* k is +1 or -1: subtract (or add) a single ln2, split into hi/lo */
+      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0);
+      lo= xsb == 1? ln2LO_1 : ln2LO_0;
+      k = 1-xsb-xsb;
+    } else {
+      /* k = x/ln2 with +-0.5 added before the float-to-int conversion */
+      float tmp = xsb == 1 ? half_1 : half_0;
+      k  = ivln2*x+tmp;
+      t  = k;
+      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
+      lo = t*ln2LO_0;
+    }
+    x  = hi - lo;
+  }
+
+  y = native_exp(x);
+  if(k >= -125) {
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
+    return y;
+  } else {
+    /* very negative k: add k+100 to the exponent, then multiply by 2**-100 */
+    unsigned hy;
+    GEN_OCL_GET_FLOAT_WORD(hy,y);
+    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
+    return y*twom100;
+  }
+}
+
+/* tgamma(x): exp(lgamma_r(x, &sign)) multiplied by the sign value that
+ * lgamma_r stores. */
+INLINE_OVERLOADABLE float tgamma(float x) {
+  int sign;
+  const float log_gamma = lgamma_r(x,&sign);
+  return __gen_ocl_internal_exp(log_gamma)*sign;
+}
+
+/* erf,erfc from glibc s_erff.c -- float version of s_erf.c.
+ * Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com.
+ */
+
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* erf(x), single precision (see the glibc s_erff.c attribution above).
+ *
+ * The input is classified by ix, the bit pattern of |x|:
+ *   - Inf/NaN:            (1 - 2*signbit) + 1/x
+ *   - |x| < 0.84375:      x + x*(pp polynomial / qq polynomial) in x*x
+ *   - |x| < 1.25:         +-(erx + P/Q) with s = |x| - 1
+ *   - |x| >= 6:           +-(1 - tiny)
+ *   - otherwise:          1 - exp(...)/|x| built from the ra/sa or rb/sb
+ *                         polynomials in 1/x^2, with the sign of x applied.
+ *
+ * half_val and two are declared but not referenced in this function. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) {
+/*...*/
+const float
+tiny = 1.0e-30,
+half_val=  5.0000000000e-01, /* 0x3F000000 */
+one =  1.0000000000e+00, /* 0x3F800000 */
+two =  2.0000000000e+00, /* 0x40000000 */
+	/* c = (subfloat)0.84506291151 */
+erx =  8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+efx =  1.2837916613e-01, /* 0x3e0375d4 */
+efx8=  1.0270333290e+00, /* 0x3f8375d4 */
+pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
+pp1  = -3.2504209876e-01, /* 0xbea66beb */
+pp2  = -2.8481749818e-02, /* 0xbce9528f */
+pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
+qq2  =  6.5022252500e-02, /* 0x3d852a63 */
+qq3  =  5.0813062117e-03, /* 0x3ba68116 */
+qq4  =  1.3249473704e-04, /* 0x390aee49 */
+qq5  = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1  =  4.1485610604e-01, /* 0x3ed46805 */
+pa2  = -3.7220788002e-01, /* 0xbebe9208 */
+pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
+pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
+pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
+qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
+qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
+qa4  =  1.2617121637e-01, /* 0x3e013307 */
+qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
+qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
+ /*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
+ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2  = -1.0558626175e+01, /* 0xc128f022 */
+ra3  = -6.2375331879e+01, /* 0xc2798057 */
+ra4  = -1.6239666748e+02, /* 0xc322658c */
+ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6  = -8.1287437439e+01, /* 0xc2a2932b */
+ra7  = -9.8143291473e+00, /* 0xc11d077e */
+sa1  =  1.9651271820e+01, /* 0x419d35ce */
+sa2  =  1.3765776062e+02, /* 0x4309a863 */
+sa3  =  4.3456588745e+02, /* 0x43d9486f */
+sa4  =  6.4538726807e+02, /* 0x442158c9 */
+sa5  =  4.2900814819e+02, /* 0x43d6810b */
+sa6  =  1.0863500214e+02, /* 0x42d9451f */
+sa7  =  6.5702495575e+00, /* 0x40d23f7c */
+sa8  = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+rb0  = -9.8649431020e-03, /* 0xbc21a092 */
+rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2  = -1.7757955551e+01, /* 0xc18e104b */
+rb3  = -1.6063638306e+02, /* 0xc320a2ea */
+rb4  = -6.3756646729e+02, /* 0xc41f6441 */
+rb5  = -1.0250950928e+03, /* 0xc480230b */
+rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1  =  3.0338060379e+01, /* 0x41f2b459 */
+sb2  =  3.2579251099e+02, /* 0x43a2e571 */
+sb3  =  1.5367296143e+03, /* 0x44c01759 */
+sb4  =  3.1998581543e+03, /* 0x4547fdbb */
+sb5  =  2.5530502930e+03, /* 0x451f90ce */
+sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
+sb7  = -2.2440952301e+01; /* 0xc1b38712 */
+
+	int hx,ix,i;
+	float R,S,P,Q,s,y,z,r;
+	/* hx = raw bits of x, ix = bits of |x| */
+	GEN_OCL_GET_FLOAT_WORD(hx,x);
+	ix = hx&0x7fffffff;
+	if(ix>=0x7f800000) {		/* erf(nan)=nan */
+	    i = ((unsigned int)hx>>31)<<1;
+	    return (float)(1-i)+one/x;	/* erf(+-inf)=+-1 */
+	}
+
+	if(ix < 0x3f580000) {		/* |x|<0.84375 */
+	    if(ix < 0x31800000) { 	/* |x|<2**-28 */
+	        if (ix < 0x04000000)
+		    /*avoid underflow */
+		    return (float)0.125*((float)8.0*x+efx8*x);
+		return x + efx*x;
+	    }
+	    z = x*x;
+	    r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
+	    s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
+	    y = r/s;
+	    return x + x*y;
+	}
+	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
+	    s = __gen_ocl_internal_fabs(x)-one;
+	    P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
+	    Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
+	    if(hx>=0) return erx + P/Q; else return -erx - P/Q;
+	}
+	if (ix >= 0x40c00000) {		/* inf>|x|>=6 */
+	    if(hx>=0) return one-tiny; else return tiny-one;
+	}
+	/* from here on x holds |x|; the sign is reapplied from hx at the end */
+	x = __gen_ocl_internal_fabs(x);
+    s = one/(x*x);
+	if(ix< 0x4036DB6E) {	/* |x| < 1/0.35 */
+	    R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
+				ra5+s*(ra6+s*ra7))))));
+	    S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
+				sa5+s*(sa6+s*(sa7+s*sa8)))))));
+	} else {	/* |x| >= 1/0.35 */
+	    R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
+				rb5+s*rb6)))));
+	    S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
+				sb5+s*(sb6+s*sb7))))));
+	}
+	/* z = |x| with its low 12 mantissa bits cleared; the exponent of
+	 * exp(-x*x) is then split into -z*z and (z-x)*(z+x) */
+	GEN_OCL_GET_FLOAT_WORD(ix,x);
+	GEN_OCL_SET_FLOAT_WORD(z,ix&0xfffff000);
+	r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+	if(hx>=0) return one-r/x; else return  r/x-one;
+}
+/* erfc(x) = 1 - erf(x), single precision (see the glibc s_erff.c
+ * attribution above).
+ *
+ * The input is classified by ix, the bit pattern of |x|:
+ *   - Inf/NaN:            2*signbit + 1/x
+ *   - |x| < 0.84375:      1 - (x + x*y), or 0.5 - (x*y + (x - 0.5)) once
+ *                         x >= 1/4, with y = pp polynomial / qq polynomial
+ *   - |x| < 1.25:         (1 - erx) - P/Q for x >= 0, 1 + (erx + P/Q) else
+ *   - |x| < 28:           exp(...)/|x| built from the ra/sa or rb/sb
+ *                         polynomials (2 minus that for non-positive hx);
+ *                         x < -6 returns 2 - tiny directly
+ *   - otherwise:          tiny*tiny for positive x, 2 - tiny for negative x.
+ *
+ * efx and efx8 are declared but not referenced in this function. */
+INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) {
+/*...*/
+const float
+tiny = 1.0e-30,
+half_val=  5.0000000000e-01, /* 0x3F000000 */
+one =  1.0000000000e+00, /* 0x3F800000 */
+two =  2.0000000000e+00, /* 0x40000000 */
+	/* c = (subfloat)0.84506291151 */
+erx =  8.4506291151e-01, /* 0x3f58560b */
+/*
+ * Coefficients for approximation to  erf on [0,0.84375]
+ */
+efx =  1.2837916613e-01, /* 0x3e0375d4 */
+efx8=  1.0270333290e+00, /* 0x3f8375d4 */
+pp0  =  1.2837916613e-01, /* 0x3e0375d4 */
+pp1  = -3.2504209876e-01, /* 0xbea66beb */
+pp2  = -2.8481749818e-02, /* 0xbce9528f */
+pp3  = -5.7702702470e-03, /* 0xbbbd1489 */
+pp4  = -2.3763017452e-05, /* 0xb7c756b1 */
+qq1  =  3.9791721106e-01, /* 0x3ecbbbce */
+qq2  =  6.5022252500e-02, /* 0x3d852a63 */
+qq3  =  5.0813062117e-03, /* 0x3ba68116 */
+qq4  =  1.3249473704e-04, /* 0x390aee49 */
+qq5  = -3.9602282413e-06, /* 0xb684e21a */
+/*
+ * Coefficients for approximation to  erf  in [0.84375,1.25]
+ */
+pa0  = -2.3621185683e-03, /* 0xbb1acdc6 */
+pa1  =  4.1485610604e-01, /* 0x3ed46805 */
+pa2  = -3.7220788002e-01, /* 0xbebe9208 */
+pa3  =  3.1834661961e-01, /* 0x3ea2fe54 */
+pa4  = -1.1089469492e-01, /* 0xbde31cc2 */
+pa5  =  3.5478305072e-02, /* 0x3d1151b3 */
+pa6  = -2.1663755178e-03, /* 0xbb0df9c0 */
+qa1  =  1.0642088205e-01, /* 0x3dd9f331 */
+qa2  =  5.4039794207e-01, /* 0x3f0a5785 */
+qa3  =  7.1828655899e-02, /* 0x3d931ae7 */
+qa4  =  1.2617121637e-01, /* 0x3e013307 */
+qa5  =  1.3637083583e-02, /* 0x3c5f6e13 */
+qa6  =  1.1984500103e-02, /* 0x3c445aa3 */
+ /*
+ * Coefficients for approximation to  erfc in [1.25,1/0.35]
+ */ra0  = -9.8649440333e-03, /* 0xbc21a093 */
+ra1  = -6.9385856390e-01, /* 0xbf31a0b7 */
+ra2  = -1.0558626175e+01, /* 0xc128f022 */
+ra3  = -6.2375331879e+01, /* 0xc2798057 */
+ra4  = -1.6239666748e+02, /* 0xc322658c */
+ra5  = -1.8460508728e+02, /* 0xc3389ae7 */
+ra6  = -8.1287437439e+01, /* 0xc2a2932b */
+ra7  = -9.8143291473e+00, /* 0xc11d077e */
+sa1  =  1.9651271820e+01, /* 0x419d35ce */
+sa2  =  1.3765776062e+02, /* 0x4309a863 */
+sa3  =  4.3456588745e+02, /* 0x43d9486f */
+sa4  =  6.4538726807e+02, /* 0x442158c9 */
+sa5  =  4.2900814819e+02, /* 0x43d6810b */
+sa6  =  1.0863500214e+02, /* 0x42d9451f */
+sa7  =  6.5702495575e+00, /* 0x40d23f7c */
+sa8  = -6.0424413532e-02, /* 0xbd777f97 */
+/*
+ * Coefficients for approximation to  erfc in [1/.35,28]
+ */
+rb0  = -9.8649431020e-03, /* 0xbc21a092 */
+rb1  = -7.9928326607e-01, /* 0xbf4c9dd4 */
+rb2  = -1.7757955551e+01, /* 0xc18e104b */
+rb3  = -1.6063638306e+02, /* 0xc320a2ea */
+rb4  = -6.3756646729e+02, /* 0xc41f6441 */
+rb5  = -1.0250950928e+03, /* 0xc480230b */
+rb6  = -4.8351919556e+02, /* 0xc3f1c275 */
+sb1  =  3.0338060379e+01, /* 0x41f2b459 */
+sb2  =  3.2579251099e+02, /* 0x43a2e571 */
+sb3  =  1.5367296143e+03, /* 0x44c01759 */
+sb4  =  3.1998581543e+03, /* 0x4547fdbb */
+sb5  =  2.5530502930e+03, /* 0x451f90ce */
+sb6  =  4.7452853394e+02, /* 0x43ed43a7 */
+sb7  = -2.2440952301e+01; /* 0xc1b38712 */
+	int hx,ix;
+	float R,S,P,Q,s,y,z,r;
+	/* hx = raw bits of x, ix = bits of |x| */
+	GEN_OCL_GET_FLOAT_WORD(hx,x);
+	ix = hx&0x7fffffff;
+	if(ix>=0x7f800000) {			/* erfc(nan)=nan */
+						/* erfc(+-inf)=0,2 */
+	    return (float)(((unsigned int)hx>>31)<<1)+one/x;
+	}
+
+	if(ix < 0x3f580000) {		/* |x|<0.84375 */
+	    if(ix < 0x23800000)  	/* |x|<2**-56 */
+		return one-x;
+	    z = x*x;
+	    r = pp0+z*(pp1+z*(pp2+z*(pp3+z*pp4)));
+	    s = one+z*(qq1+z*(qq2+z*(qq3+z*(qq4+z*qq5))));
+	    y = r/s;
+	    /* the signed word hx is compared here, so every negative x
+	     * takes this first branch */
+	    if(hx < 0x3e800000) {  	/* x<1/4 */
+		return one-(x+x*y);
+	    } else {
+		r = x*y;
+		r += (x-half_val);
+	        return half_val - r ;
+	    }
+	}
+	if(ix < 0x3fa00000) {		/* 0.84375 <= |x| < 1.25 */
+	    s = __gen_ocl_internal_fabs(x)-one;
+	    P = pa0+s*(pa1+s*(pa2+s*(pa3+s*(pa4+s*(pa5+s*pa6)))));
+	    Q = one+s*(qa1+s*(qa2+s*(qa3+s*(qa4+s*(qa5+s*qa6)))));
+	    if(hx>=0) {
+	        z  = one-erx; return z - P/Q;
+	    } else {
+		z = erx+P/Q; return one+z;
+	    }
+	}
+	if (ix < 0x41e00000) {		/* |x|<28 */
+	    /* from here on x holds |x|; the sign is taken from hx */
+	    x = __gen_ocl_internal_fabs(x);
+        s = one/(x*x);
+	    if(ix< 0x4036DB6D) {	/* |x| < 1/.35 ~ 2.857143*/
+	        R=ra0+s*(ra1+s*(ra2+s*(ra3+s*(ra4+s*(
+				ra5+s*(ra6+s*ra7))))));
+	        S=one+s*(sa1+s*(sa2+s*(sa3+s*(sa4+s*(
+				sa5+s*(sa6+s*(sa7+s*sa8)))))));
+	    } else {			/* |x| >= 1/.35 ~ 2.857143 */
+		if(hx<0&&ix>=0x40c00000) return two-tiny;/* x < -6 */
+	        R=rb0+s*(rb1+s*(rb2+s*(rb3+s*(rb4+s*(
+				rb5+s*rb6)))));
+	        S=one+s*(sb1+s*(sb2+s*(sb3+s*(sb4+s*(
+				sb5+s*(sb6+s*sb7))))));
+	    }
+	    /* z = |x| with its low 13 mantissa bits cleared; the exponent of
+	     * exp(-x*x) is then split into -z*z and (z-x)*(z+x) */
+	    GEN_OCL_GET_FLOAT_WORD(ix,x);
+	    GEN_OCL_SET_FLOAT_WORD(z,ix&0xffffe000);
+	    r  =  __gen_ocl_internal_exp(-z*z-(float)0.5625)*
+			__gen_ocl_internal_exp((z-x)*(z+x)+R/S);
+	    if(hx>0) {
+		float ret = r/x;
+		return ret;
+	    } else
+		return two-r/x;
+	} else {
+	    if(hx>0) {
+		return tiny*tiny;
+	    } else
+		return two-tiny;
+	}
+}
+
+/* fmod(x, y): remainder of x/y carrying the sign of x, computed exactly
+ * with integer arithmetic on the float bit patterns.
+ *
+ * y == 0, non-finite x, or NaN y return (x*y)/(x*y).  |x| < |y| returns x,
+ * and |x| == |y| returns a zero with the sign of x.  Otherwise both
+ * mantissas are aligned and a shift-and-subtract loop runs ix - iy times
+ * before the result is renormalised and the sign of x reapplied. */
+OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) {
+  //return x-y*__gen_ocl_rndz(x/y);
+  float one = 1.0;
+  float Zero[2];
+  int n,hx,hy,hz,ix,iy,sx,i;
+  /* Zero[] is indexed by the sign bit of x to return a correctly signed zero */
+  Zero[0] = 0.0;
+  Zero[1] = -0.0;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  sx = hx&0x80000000;		/* sign of x */
+  hx ^=sx;		/* |x| */
+  hy &= 0x7fffffff;	/* |y| */
+  /* purge off exception values */
+  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
+  (hy>0x7f800000))			/* or y is NaN */
+    return (x*y)/(x*y);
+  if(hx<hy) return x;			/* |x|<|y| return x */
+  if(hx==hy)
+    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
+
+  /* determine ix = ilogb(x) */
+  if(hx<0x00800000) {	/* subnormal x */
+    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
+  } else ix = (hx>>23)-127;
+
+  /* determine iy = ilogb(y) */
+  /* NOTE(review): this loop tests i>=0 while the loop for x tests i>0;
+   * hx and hy are both non-zero at this point. */
+  if(hy<0x00800000) {	/* subnormal y */
+    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
+  } else iy = (hy>>23)-127;
+
+  /* set up {hx,lx}, {hy,ly} and align y to x */
+  if(ix >= -126)
+    hx = 0x00800000|(0x007fffff&hx);
+  else {		/* subnormal x, shift x to normal */
+    n = -126-ix;
+    hx = hx<<n;
+  }
+  if(iy >= -126)
+    hy = 0x00800000|(0x007fffff&hy);
+  else {		/* subnormal y, shift y to normal */
+    n = -126-iy;
+    hy = hy<<n;
+  }
+  /* fix point fmod */
+  /* one conditional subtract plus one doubling per unit of exponent gap */
+  n = ix - iy;
+  while(n--) {
+    hz=hx-hy;
+    if(hz<0){hx = hx+hx;}
+    else {
+      if(hz==0)		/* return sign(x)*0 */
+        return Zero[(unsigned)sx>>31];
+      hx = hz+hz;
+    }
+  }
+  hz=hx-hy;
+  if(hz>=0) {hx=hz;}
+
+    /* convert back to floating value and restore the sign */
+  if(hx==0)			/* return sign(x)*0 */
+    return Zero[(unsigned)sx>>31];
+  while(hx<0x00800000) {		/* normalize x */
+    hx = hx+hx;
+    iy -= 1;
+  }
+  if(iy>= -126) {		/* normalize output */
+    hx = ((hx-0x00800000)|((iy+127)<<23));
+	GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+   } else {		/* subnormal output */
+     n = -126 - iy;
+     hx >>= n;
+     GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
+     x *= one;		/* create necessary signal */
+  }
+  return x;		/* exact output */
+}
+
+/* expm1(x) = exp(x) - 1, following the fdlibm expm1f scheme.
+ *
+ * Huge and non-finite arguments are filtered first (NaN -> x+x,
+ * +inf -> x, -inf -> -1, x above o_threshold -> huge*huge,
+ * x below -27*ln2 -> tiny-1).  Otherwise x is reduced to k*ln2 + r with a
+ * correction term c, a rational approximation is evaluated on r, and the
+ * result is rebuilt with a formula chosen by k.
+ *
+ * NOTE(review): y is assigned |x| early on, but that value is overwritten
+ * on every path that reaches the final return. */
+OVERLOADABLE float __gen_ocl_internal_expm1(float x) {
+  //return __gen_ocl_pow(M_E_F, x) - 1;
+  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
+  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
+  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
+  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
+  Q3 = -7.9365076090e-05, /* 0xb8a670cd */
+  Q4 = 4.0082177293e-06, /* 0x36867e54 */
+  Q5 = -2.0109921195e-07, /* 0xb457edbb */
+  huge = 1.0e30,
+  tiny = 1.0e-30,
+  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  one	=  1.0,
+  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
+  float y,hi,lo,c,t,e,hxs,hfx,r1;
+  int k,xsb;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  xsb = hx&0x80000000;
+  /* sign bit of x */
+  //if(xsb==0)
+  //y=x;
+  //else
+  //y= -x; /* y = |x| */
+  y = __gen_ocl_internal_fabs(x);
+  hx &= 0x7fffffff;		/* high word of |x| */
+  /* filter out huge and non-finite argument */
+  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
+    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
+      if(hx>0x7f800000)
+        return x+x; 	 /* NaN */
+      if(hx==0x7f800000)
+        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
+      if(x > o_threshold)
+        return huge*huge; /* overflow */
+    }
+    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
+      if(x+tiny<(float)0.0)	/* raise inexact */
+        return tiny-one;	/* return -1 */
+    }
+  }
+  /* argument reduction */
+  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
+    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
+      if(xsb==0){
+        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
+      }	else {
+        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
+      }
+    } else {
+      /* k = x/ln2 with +-0.5 added before the float-to-int conversion */
+      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
+      t  = k;
+      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
+      lo = t*ln2_lo;
+    }
+    x  = hi - lo;
+    /* c = rounding error of hi - lo, folded back in after the polynomial */
+    c  = (hi-x)-lo;
+  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
+    //t = huge+x; /* return x with inexact flags when x!=0 */
+    //return x - (t-(huge+x));
+    return x;
+  } else k = 0;
+  /* x is now in primary range */
+  hfx = (float)0.5*x;
+  hxs = x*hfx;
+  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
+  t = (float)3.0-r1*hfx;
+  e = hxs*((r1-t)/((float)6.0 - x*t));
+  if(k==0)
+    return x - (x*e-hxs);		/* c is 0 */
+  else{
+    e = (x*(e-c)-c);
+    e -= hxs;
+    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
+    if(k==1){
+      if(x < (float)-0.25)
+        return -(float)2.0*(e-(x+(float)0.5));
+      else
+        return  (one+(float)2.0*(x-e));
+    }
+    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
+      int i;
+      y = one-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+      return y-one;
+    }
+    t = one;
+    if(k<23) {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
+      y = t-(e-x);
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    } else {
+      int i;
+      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
+      y = x-(e+t);
+      y += one;
+      GEN_OCL_GET_FLOAT_WORD(i,y);
+      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
+    }
+  }
+  return y;
+}
+
+/* acosh(x), following the fdlibm case split on the bit pattern of x:
+ *   x < 1          -> NaN via (x-x)/(x-x)
+ *   Inf or NaN     -> x+x
+ *   x >= 2**28     -> log(x) + ln2
+ *   x == 1         -> 0
+ *   2 < x < 2**28  -> log(2x - 1/(x + sqrt(x*x - 1)))
+ *   1 < x <= 2     -> log1p(t + sqrt(2t + t*t)) with t = x - 1 */
+OVERLOADABLE float __gen_ocl_internal_acosh(float x) {
+  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
+  const float one = 1.0;
+  const float ln2 = 6.9314718246e-01;/* 0x3f317218 */
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+
+  /* hx is signed, so every negative x lands here as well */
+  if (hx < 0x3f800000)
+    return (x-x)/(x-x);
+  if (hx >= 0x7f800000)          /* x is inf or NaN */
+    return x+x;
+  if (hx >= 0x4d800000)          /* x >= 2**28: acosh(huge)=log(2x) */
+    return __gen_ocl_internal_log(x)+ln2;
+  if (hx == 0x3f800000)
+    return 0.0;                  /* acosh(1) = 0 */
+
+  if (hx > 0x40000000) {         /* 2**28 > x > 2 */
+    const float sq = x*x;
+    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(sq-one)));
+  }
+
+  /* 1 < x <= 2 */
+  const float d = x-one;
+  return log1p(d+__gen_ocl_sqrt((float)2.0*d+d*d));
+}
+
+/* asinh(x), following the fdlibm case split on |x|:
+ *   |x| < 2**-14        -> x
+ *   Inf or NaN          -> x+x
+ *   |x| > 2**14         -> log(|x|) + ln2
+ *   2 < |x| <= 2**14    -> log(2|x| + 1/(sqrt(x*x+1) + |x|))
+ *   otherwise           -> log1p(|x| + x*x/(1 + sqrt(1 + x*x)))
+ * The sign of x is copied onto the result at the end. */
+OVERLOADABLE float __gen_ocl_internal_asinh(float x){
+  //return native_log(x + native_sqrt(x * x + 1));
+  const float one =  1.0000000000e+00; /* 0x3F800000 */
+  const float ln2 =  6.9314718246e-01; /* 0x3f317218 */
+  const float huge=  1.0000000000e+30;
+  int hx;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  const int ix = hx&0x7fffffff;        /* bit pattern of |x| */
+
+  /* |x|<2**-14: return x inexact except 0 */
+  if (ix < 0x38000000 && huge+x>one)
+    return x;
+  if (ix >= 0x7f800000)                /* x is inf or NaN */
+    return x+x;
+
+  const float xa = __gen_ocl_internal_fabs(x);
+  float w;
+  if (ix > 0x47000000) {               /* |x| > 2**14 */
+    w = __gen_ocl_internal_log(xa)+ln2;
+  } else if (ix > 0x40000000) {        /* 2**14 > |x| > 2.0 */
+    w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
+  } else {                             /* 2.0 > |x| > 2**-14 */
+    const float sq = xa*xa;
+    w =log1p(xa+sq/(one+__gen_ocl_sqrt(one+sq)));
+  }
+  return __gen_ocl_internal_copysign(w, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_sinh(float x){ /* hyperbolic sine built from expm1/exp, split by the magnitude of x */
+  // closed form kept for reference: (1 - native_exp(-2 * x)) / (2 * native_exp(-x))
+  float one = 1.0,
+  shuge = 1.0e37;
+  float t,w,h;
+  int ix,jx;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff; /* sign bit cleared */
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x+x;
+  h = 0.5;
+  if (jx<0) h = -h; /* h = sign(x)*0.5 */
+  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1)) with E = expm1(|x|) */
+  if (ix < 0x41b00000) {		/* |x|<22 */
+    if (ix<0x31800000)	/* |x|<2**-28 */
+      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
+    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
+    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one)); /* |x| < 1 */
+      return h*(t+t/(t+one)); /* 1 <= |x| < 22; despite the indentation this is NOT controlled by the if above */
+  }
+  /* |x| in [22, ~log(FLT_MAX)] return 0.5*exp(|x|) */
+  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
+  /* |x| in [~log(FLT_MAX), overflow threshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x)); /* exp(|x|/2); multiplied in twice below */
+    t = h*w;
+    return t*w;
+  }
+  /* |x| > overflow threshold, sinh(x) overflows */
+  return x*shuge;
+}
+
+OVERLOADABLE float __gen_ocl_internal_tanh(float x) { /* hyperbolic tangent built from expm1, split by the magnitude of x */
+  // closed form kept for reference: y = native_exp(-2 * x);
+  //                                 tanh(x) = (1 - y) / (1 + y)
+  float one=1.0, two=2.0, tiny = 1.0e-30;
+  float t,z;
+  int jx,ix;
+  GEN_OCL_GET_FLOAT_WORD(jx,x);
+  ix = jx&0x7fffffff; /* sign bit cleared */
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) {
+    if (jx>=0)
+      return one/x+one; /* +inf gives 0+1 = +1; a NaN stays NaN */
+    else
+      return one/x-one; /* -inf gives -0-1 = -1; a NaN stays NaN */
+  }
+
+  if (ix < 0x41b00000) { /* |x|<22 */
+    if (ix == 0)
+      return x;		/* x == +-0 */
+    if (ix<0x24000000) 	/* |x|<2**-55 */
+      return x*(one+x);    	/* tanh(small) = small */
+    if (ix>=0x3f800000) {	/* |x|>=1  */
+      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
+      z = one - two/(t+two); /* 1 - 2/(expm1(2|x|) + 2) */
+    } else {
+      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
+      z= -t/(t+two); /* -expm1(-2|x|) / (expm1(-2|x|) + 2) */
+    }
+  } else { /* |x| > 22, return +-1 */
+    z = one - tiny;		/* raised inexact flag */
+  }
+  return (jx>=0)? z: -z; /* z was computed for |x|; negate it for negative x */
+}
+
+OVERLOADABLE float __gen_ocl_internal_cosh(float x) { /* hyperbolic cosine built from expm1/exp, split by the magnitude of x */
+  // closed form kept for reference: (1 + native_exp(-2 * x)) / (2 * native_exp(-x))
+  float halF = 0.5,
+  huge = 1.0e+30,
+  tiny = 1.0e-30, /* not referenced anywhere in this function */
+  one = 1.0;
+  float t,w;
+  int ix;
+  GEN_OCL_GET_FLOAT_WORD(ix,x);
+  ix &= 0x7fffffff; /* sign bit cleared: only |x| is used from here on */
+  /* |x| in [0,22] */
+  if (ix < 0x41b00000) {
+    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
+    if(ix<0x3eb17218) {
+      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
+      w = one+t;
+      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
+      return one+(t*t)/(w+w);
+    }
+    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|))/2; */
+    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
+    return halF*t+halF/t;
+  }
+  /* |x| in [22, ~log(FLT_MAX)] return half*exp(|x|) */
+  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
+  /* |x| in [~log(FLT_MAX), overflow threshold] */
+  if (ix<=0x42b2d4fc) {
+    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x)); /* exp(|x|/2); multiplied in twice below */
+    t = halF*w;
+    return t*w;
+  }
+  /* x is INF or NaN */
+  if(ix>=0x7f800000) return x*x;
+  /* |x| > overflow threshold, cosh(x) overflows */
+  return huge*huge;
+}
+
+OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){ /* remainder of x with respect to p; works on magnitudes and re-applies the sign of x */
+  //return x-y*__gen_ocl_rnde(x/y);
+  float zero = 0.0;
+  int hx,hp;
+  unsigned sx;
+  float p_half;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hp,p);
+  sx = hx&0x80000000; /* sign bit of x, XOR-ed back into the result at the end */
+  hp &= 0x7fffffff;
+  hx &= 0x7fffffff;
+  /* purge off exception values */
+  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
+  if((hx>=0x7f800000)||               /* x not finite */
+    ((hp>0x7f800000)))	               /* p is NaN */
+    return (x*p)/(x*p);
+  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
+  if ((hx-hp)==0) return zero*x; /* |x| == |p| (patterns taken before the fmod): zero carrying the sign of x */
+  x = __gen_ocl_fabs(x);
+  p = __gen_ocl_fabs(p);
+  if (hp<0x01000000) { /* |p| < 2**-125: compare against 2x instead of forming p/2 */
+    if(x+x>p) {
+      x-=p;
+      if(x+x>=p) x -= p;
+    }
+  } else {
+    p_half = (float)0.5*p;
+    if(x>p_half) {
+      x-=p;
+      if(x>=p_half) x -= p;
+    }
+  }
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_SET_FLOAT_WORD(x,hx^sx); /* flip the sign of the reduced value when the original x was negative */
+  return x;
+}
+
+OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
+  /* Zero and non-finite inputs are handed back untouched; everything else goes through __gen_ocl_scalbnf. */
+  const int passthrough = !__ocl_finitef(x) || (x == (float)0.0);
+  return passthrough ? x : __gen_ocl_scalbnf(x, n);
+}
+
+OVERLOADABLE float __gen_ocl_internal_atanh(float x) { /* inverse hyperbolic tangent via log1p on |x|, sign of x restored at the end */
+  // closed form for reference: 0.5f * native_log((1 + x) / (1 - x))
+  float xa = __gen_ocl_fabs (x);
+  float t;
+  if (isless (xa, 0.5f)){
+    if (xa < 0x1.0p-28f) return x; /* tiny |x|: x itself is returned */
+    t = xa + xa;
+    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
+  } else if (isless (xa, 1.0f)){
+    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
+  } else{
+    if (isgreater (xa, 1.0f)) return (x - x) / (x - x); /* |x| > 1: outside the domain, the quotient is NaN */
+    return x / 0.0f; /* reached for |x| == 1 (gives +-inf) and for NaN x, which fails every comparison above */
+  }
+  return __gen_ocl_internal_copysign(t, x);
+}
+
+OVERLOADABLE float __gen_ocl_internal_exp10(float x){ /* 10**x computed as 2**n * 10**g, g being the reduced argument */
+  float px, qx,ans;
+  short n;
+  int i;
+  float*p;
+  float MAXL10 = 38.230809449325611792; /* = 127*log10(2) */
+  float LOG210 = 3.32192809488736234787e0; /* log2(10) */
+  float LG102A = 3.00781250000000000000E-1; /* log10(2), leading part */
+  float LG102B = 2.48745663981195213739E-4; /* log10(2), remaining part */
+  float P[6]; /* polynomial coefficients, highest degree first */
+  P[0] = 2.063216740311022E-001;
+  P[1] = 5.420251702225484E-001;
+  P[2] = 1.171292686296281E+000;
+  P[3] = 2.034649854009453E+000;
+  P[4] = 2.650948748208892E+000;
+  P[5] = 2.302585167056758E+000;
+  /* Underflow is tested first so that -INF yields 0 instead of reaching the +INF return below. */
+  if( x < -MAXL10 )return 0.0;
+  /* +INF, or finite x past log10(FLT_MAX) ~= 38.53: the result overflows; this also keeps huge x out of the short n. */
+  if( isinf(x) || x > 38.6f ) return INFINITY;
+  /* The following is necessary because range reduction blows up: */
+  if( x == 0 )return 1.0;
+
+  /* Express 10**x = 10**g 2**n
+    *	 = 10**g 10**( n log10(2) )
+    *	 = 10**( g + n log10(2) )
+    */
+  px = x * LOG210;
+  qx = __gen_ocl_internal_floor( px + 0.5 );
+  n = qx;
+  x -= qx * LG102A;
+  x -= qx * LG102B;
+
+  /* polynomial approximation for the exponential
+    * of the reduced argument (Horner's rule over P):
+    * 10**x  =  1 + x * P(x)
+    */
+  p = P;
+  ans = *p++;
+  i = 5;
+  do{
+    ans = ans * x  +  *p++;
+  }
+  while( --i );
+  px = 1.0 + x * ans;
+
+  /* multiply by power of 2 */
+  x = __gen_ocl_internal_ldexp( px, n );
+  return x;
+}
+
+OVERLOADABLE float cospi(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_cospi(x)
+    : __gen_ocl_internal_cospi(x);
+}
+
+OVERLOADABLE float cosh(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_cosh(x)
+    : __gen_ocl_internal_cosh(x);
+}
+
+OVERLOADABLE float acos(float x) { /* public acos overload for float */
+  return __gen_ocl_internal_acos(x); /* plain forward; the fast-path flag is not consulted here */
+}
+
+OVERLOADABLE float acospi(float x) { /* public acospi overload for float */
+  return __gen_ocl_internal_acospi(x); /* plain forward; the fast-path flag is not consulted here */
+}
+
+OVERLOADABLE float acosh(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_acosh(x)
+    : __gen_ocl_internal_acosh(x);
+}
+
+OVERLOADABLE float sinpi(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_sinpi(x)
+    : __gen_ocl_internal_sinpi(x);
+}
+
+OVERLOADABLE float sinh(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_sinh(x)
+    : __gen_ocl_internal_sinh(x);
+}
+
+OVERLOADABLE float asin(float x) { /* public asin overload for float */
+  return __gen_ocl_internal_asin(x); /* plain forward; the fast-path flag is not consulted here */
+}
+
+OVERLOADABLE float asinpi(float x) { /* public asinpi overload for float */
+  return __gen_ocl_internal_asinpi(x); /* plain forward; the fast-path flag is not consulted here */
+}
+
+OVERLOADABLE float asinh(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_asinh(x)
+    : __gen_ocl_internal_asinh(x);
+}
+
+OVERLOADABLE float tanpi(float x) { /* public tanpi overload for float */
+  return __gen_ocl_internal_tanpi(x); /* plain forward; the fast-path flag is not consulted here */
+}
+
+OVERLOADABLE float tanh(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_tanh(x)
+    : __gen_ocl_internal_tanh(x);
+}
+
+OVERLOADABLE float atan(float x) { /* public atan overload for float */
+  return __gen_ocl_internal_atan(x); /* plain forward */
+}
+
+OVERLOADABLE float atan2(float y, float x) { /* public atan2 overload; note the (y, x) argument order */
+  return __gen_ocl_internal_atan2(y, x); /* plain forward, argument order preserved */
+}
+
+OVERLOADABLE float atan2pi(float y, float x) { /* public atan2pi overload; note the (y, x) argument order */
+  return __gen_ocl_internal_atan2pi(y, x); /* plain forward, argument order preserved */
+}
+
+OVERLOADABLE float atanpi(float x) { /* public atanpi overload for float */
+  return __gen_ocl_internal_atanpi(x); /* plain forward */
+}
+
+OVERLOADABLE float atanh(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_atanh(x)
+    : __gen_ocl_internal_atanh(x);
+}
+
+OVERLOADABLE float cbrt(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_cbrt(x)
+    : __gen_ocl_internal_cbrt(x);
+}
+
+OVERLOADABLE float rint(float x) { /* public rint overload for float */
+  return __gen_ocl_internal_rint(x); /* plain forward */
+}
+
+OVERLOADABLE float copysign(float x, float y) { /* public copysign overload for float */
+  return __gen_ocl_internal_copysign(x, y); /* plain forward, argument order preserved */
+}
+
+OVERLOADABLE float erf(float x) { /* public erf overload for float */
+  return __gen_ocl_internal_erf(x); /* plain forward */
+}
+
+OVERLOADABLE float erfc(float x) { /* public erfc overload for float */
+  return __gen_ocl_internal_erfc(x); /* plain forward */
+}
+
+OVERLOADABLE float fmod (float x, float y) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_fmod(x, y)
+    : __gen_ocl_internal_fmod(x, y);
+}
+
+OVERLOADABLE float remainder(float x, float p) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_remainder(x, p)
+    : __gen_ocl_internal_remainder(x, p);
+}
+
+OVERLOADABLE float ldexp(float x, int n) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_ldexp(x, n)
+    : __gen_ocl_internal_ldexp(x, n);
+}
+
+PURE CONST float __gen_ocl_mad(float a, float b, float c);
+PURE CONST float __gen_ocl_fmax(float a, float b);
+PURE CONST float __gen_ocl_fmin(float a, float b);
+
+OVERLOADABLE float mad(float a, float b, float c) { /* public mad overload for float */
+  return __gen_ocl_mad(a, b, c); /* forwards to the __gen_ocl_mad primitive declared just above */
+}
+
+
+#define BODY /* shared body of the three frexp overloads below (one per address space of exp) */ \
+  if (isnan(x) || isinf(x)) { /* NaN or infinity: returned as-is, exponent reported as 0 */ \
+    *exp = 0; \
+    return x; \
+  } \
+  uint u = as_uint(x); \
+  uint a = u & 0x7FFFFFFFu; /* bit pattern of |x| */ \
+  if (a == 0) { /* +-0: returned as-is, exponent 0 */ \
+    *exp = 0; \
+    return x; \
+  } \
+  if (a >= 0x800000) { /* exponent field is non-zero: normal number */ \
+    *exp = (a >> 23) - 126; \
+    return as_float((u & (0x807FFFFFu)) | 0x3F000000); /* keep sign and mantissa, install the exponent field of 0.5 */ \
+  } \
+  int e = -126; /* subnormal: shift the mantissa up until its top set bit reaches bit 22 */ \
+  while (a < 0x400000) { \
+    e --; \
+    a <<= 1; \
+  } \
+  a <<= 1; /* one more shift puts that bit at position 23, where the mask below discards it */ \
+  *exp = e; \
+  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
+OVERLOADABLE float frexp(float x, global int *exp) { BODY; }
+OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
+OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
+#undef BODY
+
+OVERLOADABLE float nextafter(float x, float y) { /* neighbour of x in the direction of y, obtained by stepping the bit pattern by one */
+  int hx, hy, ix, iy;
+  hx = as_int(x);
+  hy = as_int(y);
+  ix = hx & 0x7fffffff; /* patterns of |x| and |y| */
+  iy = hy & 0x7fffffff;
+  if(ix>0x7f800000 || iy>0x7f800000) /* either operand is NaN */
+    return x+y;
+  if(hx == hy) /* identical bit patterns: nothing to step */
+    return y;
+  if(ix == 0) { /* x is +-0 */
+    if(iy == 0) /* y is the zero of the other sign */
+      return y;
+    else
+      return as_float((hy&0x80000000) | 1); /* pattern with only the lowest mantissa bit set, carrying y's sign */
+  }
+  if(hx >= 0) { /* x is positive */
+    if(hx > hy) { /* y lies below x: step the pattern down */
+      hx -= 1;
+    } else { /* y lies above x: step the pattern up */
+      hx += 1;
+    }
+  } else { /* x is negative: among negative patterns a larger int means a more negative float */
+    if(hy >= 0 || hx > hy){ /* y lies above x: shrink the magnitude */
+      hx -= 1;
+    } else { /* y lies below x: grow the magnitude */
+      hx += 1;
+    }
+  }
+  return as_float(hx);
+}
+
+#define BODY /* shared body of the three modf overloads below (one per address space of i) */ \
+  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; /* ix: bit pattern of |x| */ \
+  if (ix > 0x7F800000) { /* NaN: both parts are NaN */ \
+    *i = nan(0u); \
+    return nan(0u); \
+  } \
+  if (ix == 0x7F800000) { /* +-inf: integral part is x, fraction is a zero carrying x's sign */ \
+    *i = x; \
+    return as_float(hx & 0x80000000u); \
+  } \
+  *i = __gen_ocl_rndz(x); /* integral part as produced by __gen_ocl_rndz */ \
+  return __gen_ocl_internal_copysign(x - *i, x); /* a zero fraction must keep x's sign: modf(-2.0f) is -0.0f, not +0.0f */
+OVERLOADABLE float modf(float x, global float *i) { BODY; }
+OVERLOADABLE float modf(float x, local float *i) { BODY; }
+OVERLOADABLE float modf(float x, private float *i) { BODY; }
+#undef BODY
+
+OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) { return max(a,b); } /* fmax is implemented directly on max() */
+OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) { return min(a,b); } /* fmin is implemented directly on min() */
+OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
+  const float mag_x = __gen_ocl_fabs(x), mag_y = __gen_ocl_fabs(y);
+  if (mag_x > mag_y) return x; else if (mag_y > mag_x) return y; /* the strictly larger magnitude wins */
+  return max(x, y); /* neither magnitude compares greater: defer to max() */
+}
+OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
+  const float mag_x = __gen_ocl_fabs(x), mag_y = __gen_ocl_fabs(y);
+  if (mag_x < mag_y) return x; else if (mag_y < mag_x) return y; /* the strictly smaller magnitude wins */
+  return min(x, y); /* neither magnitude compares smaller: defer to min() */
+}
+OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
+  /* A NaN operand is handed back unchanged, x taking precedence over y. */
+  if (isnan(x) || isnan(y))
+    return isnan(x) ? x : y;
+  /* Otherwise the positive difference: x - y when x exceeds y, else +0. */
+  return (x > y) ? (x - y) : +0.f;
+}
+
+OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) { /* x**y: special cases first, then log2(|x|) as t1+t2, times y, then 2**(that product) */
+  float z,ax,z_h,z_l,p_h,p_l;
+  float y1,t1,t2,r,s,sn,t,u,v,w;
+  int i,j,k,yisint,n;
+  int hx,hy,ix,iy,is;
+  float bp[2],dp_h[2],dp_l[2],
+  zero    =  0.0,
+  one	=  1.0,
+  two	=  2.0,
+  two24	=  16777216.0,	/* 0x4b800000 */
+  huge	=  1.0e30,
+  tiny    =  1.0e-30,
+  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
+  L1  =  6.0000002384e-01, /* 0x3f19999a */
+  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
+  L3  =  3.3333334327e-01, /* 0x3eaaaaab */
+  L4  =  2.7272811532e-01, /* 0x3e8ba305 */
+  L5  =  2.3066075146e-01, /* 0x3e6c3255 */
+  L6  =  2.0697501302e-01, /* 0x3e53f142 */
+  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
+  P2   = -2.7777778450e-03, /* 0xbb360b61 */
+  P3   =  6.6137559770e-05, /* 0x388ab355 */
+  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
+  P5   =  4.1381369442e-08, /* 0x3331bb4c */
+  lg2  =  6.9314718246e-01, /* 0x3f317218 */
+  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
+  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
+  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
+  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
+  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
+  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
+  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
+  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
+  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
+  bp[0] = 1.0,bp[1] = 1.5, /* the three table lines below are one comma expression, terminated on the third */
+  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
+  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
+  GEN_OCL_GET_FLOAT_WORD(hx,x);
+  GEN_OCL_GET_FLOAT_WORD(hy,y);
+  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
+  if (ix < 0x00800000) {	   /* x < 2**-126  */
+    ix = 0;/* Gen does not support subnormal number now */
+  }
+  if (iy < 0x00800000) {	  /* y < 2**-126  */
+    iy = 0;/* Gen does not support subnormal number now */
+  }
+   /* y==zero: x**0 = 1 */
+  if(iy==0) return one;
+  if(hx==0x3f800000) return one; /* x is exactly +1; tested before the NaN check, so this also covers NaN y */
+  /* +-NaN return x+y */
+  if(ix > 0x7f800000 || iy > 0x7f800000)
+    return (x+0.0f)+y+(0.0f);
+  /* determine if y is an odd int when x < 0
+     * yisint = 0	... y is not an integer
+     * yisint = 1	... y is an odd int
+     * yisint = 2	... y is an even int
+     */
+  yisint  = 0;
+  if(hx<0) {
+    if(iy>=0x4b800000) yisint = 2; /* even integer y */
+    else if(iy>=0x3f800000) {
+      k = (iy>>23)-0x7f;	   /* exponent */
+      j = iy>>(23-k);
+      if((j<<(23-k))==iy) yisint = 2-(j&1); /* no fractional mantissa bits: y is an integer, parity from its lowest bit */
+    }
+  }
+  /* special value of y */
+  if (iy==0x7f800000) {	/* y is +-inf */
+    if (ix==0x3f800000)
+      //return  y - y;	/* inf**+-1 is NaN */
+      return one;
+    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
+      return (hy>=0)? y: zero;
+    else			/* (|x|<1)**-,+inf = inf,0 */
+      return (hy<0)?-y: zero;
+  }
+  if(iy==0x3f800000) {	/* y is  +-1 */
+    if(hy<0) return one/x; else return x;
+  }
+  if(hy==0x40000000) return x*x; /* y is  2 */
+  if(hy==0x3f000000) {	/* y is  0.5 */
+    if(hx>=0)return __gen_ocl_sqrt(x);
+  }
+
+  ax   = __gen_ocl_fabs(x);
+    /* special value of x */
+  if(ix==0x7f800000||ix==0||ix==0x3f800000){
+    z = ax;			/*x is +-0,+-inf,+-1*/
+    if(hy<0) z = one/z;	/* z = (1/|x|) */
+    if(hx<0) {
+      if(((ix-0x3f800000)|yisint)==0) {
+        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
+      } else if(yisint==1)
+        z = -z;		/* (x<0)**odd = -(|x|**odd) */
+    }
+    return z;
+  }
+  n = ((uint)hx>>31)-1; /* n = 0 when the sign bit of x is set, -1 otherwise */
+
+  /* (x<0)**(non-int) is NaN */
+  if((n|yisint)==0) return (x-x)/(x-x);
+
+  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
+  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
+
+  /* |y| is huge */
+  if(iy>0x4d000000) { /* if |y| > 2**27 */
+    /* over/underflow if x is not close to one */
+    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
+    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
+    /* now |1-x| is tiny <= 2**-20, suffice to compute
+          log(x) by x-x^2/2+x^3/3-x^4/4 */
+    t = ax-1;		/* t has 20 trailing zeros */
+    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
+    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
+    v = t*ivln2_l-w*ivln2;
+    t1 = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000); /* clear the low 12 bits of t1; t2 below picks up what was dropped */
+    t2 = v-(t1-u);
+  } else {
+    float s2,s_h,s_l,t_h,t_l;
+    n = 0;
+	/* take care subnormal number */
+    //if(ix<0x00800000)
+      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
+    n  += ((ix)>>23)-0x7f;
+    j  = ix&0x007fffff;
+	/* determine interval */
+    ix = j|0x3f800000;		/* normalize ix */
+    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
+    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
+    else {k=0;n+=1;ix -= 0x00800000;}
+    GEN_OCL_SET_FLOAT_WORD(ax,ix);
+
+	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
+    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
+    v = one/(ax+bp[k]);
+    s = u*v;
+    s_h = s;
+    GEN_OCL_GET_FLOAT_WORD(is,s_h);
+    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
+    /* t_h=ax+bp[k] High */
+    is = ((ix>>1)&0xfffff000)|0x20000000;
+    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
+    t_l = ax - (t_h-bp[k]);
+    s_l = v*((u-s_h*t_h)-s_h*t_l);
+    /* compute log(ax) */
+    s2 = s*s;
+    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
+    r += s_l*(s_h+s);
+    s2  = s_h*s_h;
+    t_h = 3.0f+s2+r;
+    GEN_OCL_GET_FLOAT_WORD(is,t_h);
+    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
+    t_l = r-((t_h-3.0f)-s2);
+    /* u+v = s*(1+...) */
+    u = s_h*t_h;
+    v = s_l*t_h+t_l*s;
+    /* 2/(3log2)*(s+...) */
+    p_h = u+v;
+    GEN_OCL_GET_FLOAT_WORD(is,p_h);
+    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
+    p_l = v-(p_h-u);
+    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
+    z_l = cp_l*p_h+p_l*cp+dp_l[k];
+    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
+    t = (float)n;
+    t1 = (((z_h+z_l)+dp_h[k])+t);
+    GEN_OCL_GET_FLOAT_WORD(is,t1);
+    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
+    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
+  }
+
+  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
+  GEN_OCL_GET_FLOAT_WORD(is,y);
+  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
+  p_l = (y-y1)*t1+y*t2;
+  p_h = y1*t1;
+  z = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  if (j>0x43000000)				/* if z > 128 */
+    return sn*huge*huge;			/* overflow */
+  else if (j==0x43000000) {			/* if z == 128 */
+    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
+  }
+  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
+    return sn*tiny*tiny;			/* underflow */
+  else if (j==0xc3160000){			/* z == -150 */
+    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
+  }
+
+  /*
+    * compute 2**(p_h+p_l)
+    */
+  i = j&0x7fffffff;
+  k = (i>>23)-0x7f;
+  n = 0;
+  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
+    n = j+(0x00800000>>(k+1));
+    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
+    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
+    n = ((n&0x007fffff)|0x00800000)>>(23-k);
+    if(j<0) n = -n;
+    p_h -= t;
+  }
+  t = p_l+p_h;
+  GEN_OCL_GET_FLOAT_WORD(is,t);
+  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
+  u = t*lg2_h;
+  v = (p_l-(t-p_h))*lg2+t*lg2_l;
+  z = u+v;
+  w = v-(z-u);
+  t  = z*z;
+  t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
+  r  = (z*t1)/(t1-two)-(w+z*w);
+  z  = one-(r-z);
+  GEN_OCL_GET_FLOAT_WORD(j,z);
+  j += (n<<23); /* add n to the exponent field of z */
+  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
+  else GEN_OCL_SET_FLOAT_WORD(z,j);
+  return sn*z; /* apply the result sign chosen above (-1 only for negative x with odd integer y) */
+}
+
+
+OVERLOADABLE float hypot(float x, float y) {
+  /* sqrt(x*x + y*y), evaluated on operands pre-scaled by a common power of two. */
+  int shift;
+  float larger, smaller, larger_n, smaller_n, root_n;
+
+  if (__ocl_math_fastpath_flag)
+    return __gen_ocl_internal_fastpath_hypot(x, y);
+
+  /* Non-finite inputs: an infinity in either operand wins, otherwise the NaN propagates. */
+  if (!(isfinite (x) && isfinite (y))) {
+    if (isinf (x) || isinf (y))
+      return INFINITY;
+    return x + y;
+  }
+
+  /* Order the magnitudes so that 0 <= smaller <= larger. */
+  larger = max(__gen_ocl_fabs (x), __gen_ocl_fabs (y));
+  smaller = min(__gen_ocl_fabs (x), __gen_ocl_fabs (y));
+
+  /* larger = larger_n * 2^shift; smaller is scaled by the same power of two. */
+  larger_n = frexp (larger, &shift);
+  smaller_n = ldexp (smaller, - shift);
+
+  /* Through the normalization, no unneeded overflow or underflow will occur here. */
+  root_n = __gen_ocl_sqrt (larger_n * larger_n + smaller_n * smaller_n);
+  return ldexp (root_n, shift);
+}
+
+#define BODY /* shared body of the three fract overloads below (one per address space of p) */ \
+  if (isnan(x)) { /* NaN: stored through p and returned */ \
+    *p = x; \
+    return x; \
+  } \
+  *p = __gen_ocl_internal_floor(x); \
+  if (isinf(x)) { /* infinite x: the returned fraction is a zero chosen by the sign of x */ \
+    return x > 0 ? +0. : -0.; \
+  } \
+  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F); /* capped at 0x1.FFFFFep-1, the largest float below 1 */
+OVERLOADABLE float fract(float x, global float *p) { BODY; }
+OVERLOADABLE float fract(float x, local float *p) { BODY; }
+OVERLOADABLE float fract(float x, private float *p) { BODY; }
+#undef BODY
+
+#define BODY /* shared body of the three remquo overloads below (one per address space of quo) */ \
+  float Zero[2]; \
+  int n,hx,hy,hz,ix,iy,sx,i,sy; /* note: i is not referenced anywhere in this body */ \
+  uint q,sxy; \
+  Zero[0] = 0.0;Zero[1] = -0.0; \
+  GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
+  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
+  hx ^=sx; hy &= 0x7fffffff; /* from here on hx/hy hold the patterns of |x| and |y| */ \
+  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; /* patterns below the smallest normal are treated as zero */ \
+  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ /* y zero, x inf/NaN, or y NaN */ \
+    *quo = 0;return NAN; \
+  } \
+  if( hy == 0x7F800000 || hx == 0 ) { /* y infinite or x zero: x is returned unchanged */ \
+    *quo = 0;return x; \
+  } \
+  if( hx == hy ) { /* |x| == |y|: zero remainder signed like x, quotient +1 or -1 */ \
+    *quo = (x == y) ? 1 : -1; \
+    return sx ? -0.0 : 0.0; \
+  } \
+  if(hx<hy) { /* |x| < |y|: no subtraction steps, only the rounding at fixup */ \
+    q = 0; \
+    goto fixup; \
+  } else if(hx==hy) { /* NOTE(review): looks unreachable, hx == hy already returned just above */ \
+    *quo = (sxy ? -1 : 1); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  ix = (hx>>23)-127; /* unbiased exponents */ \
+  iy = (hy>>23)-127; \
+  hx = 0x00800000|(0x007fffff&hx); /* mantissas with the implicit leading one made explicit */ \
+  hy = 0x00800000|(0x007fffff&hy); \
+  n = ix - iy; \
+  q = 0; \
+  while(n--) { /* shift-and-subtract, one quotient bit per iteration */ \
+    hz=hx-hy; \
+    if(hz<0) hx = hx << 1; \
+    else {hx = hz << 1; q++;} \
+    q <<= 1; \
+  } \
+  hz=hx-hy; \
+  if(hz>=0) {hx=hz;q++;} \
+  if(hx==0) { /* exact division: zero remainder signed like x */ \
+    q &= 0x0000007f; \
+    *quo = (sxy ? -q : q); \
+    return Zero[(uint)sx>>31]; \
+  } \
+  while(hx<0x00800000) { /* renormalise the remainder mantissa */ \
+    hx <<= 1;iy -= 1; \
+  } \
+  if(iy>= -126) { \
+    hx = ((hx-0x00800000)|((iy+127)<<23)); /* reassemble exponent and mantissa fields */ \
+  } else {\
+    n = -126 - iy; \
+    hx >>= n; \
+  } \
+fixup: \
+  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x now holds the non-negative truncated remainder */ \
+  if(hx<0x00800000){ /* remainder pattern below the smallest normal: decide rounding on the patterns, return 0 */ \
+    GEN_OCL_GET_FLOAT_WORD(hy,y); \
+    hy &= 0x7fffffff; \
+    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
+    x = 0; \
+  }else{ \
+    y = __gen_ocl_fabs(y); \
+    if (y < 0x1p-125f) { /* small |y|: compare 2x with y instead of x with y/2 */ \
+      if (x+x>y || (x+x==y && (q & 1))) { \
+        q++;x-=y; \
+      } \
+    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { /* past the halfway point, or exactly halfway with an odd quotient */ \
+      q++;x-=y; \
+    } \
+    GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); /* flip the sign when the original x was negative */ \
+  } \
+  int sign = sx==sy?0:1; \
+  q &= 0x0000007f; /* only the low seven quotient bits are reported */ \
+  *quo = (sign ? -q : q); \
+  return x;
+
+OVERLOADABLE float remquo(float x, float y, global int *quo) {
+	BODY;
+}
+OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
+OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
+#undef BODY
+
+OVERLOADABLE float pown(float x, int n) {
+  /* 0**0 is defined as 1. */
+  if (x == 0.f && n == 0) return 1.f;
+  /* Negative base with an odd exponent: raise -x and negate; every other case goes to powr as-is. */
+  const int negate = (x < 0.f) && (n&1);
+  return negate ? -powr(-x, n) : powr(x, n);
+}
+
+OVERLOADABLE float pow(float x, float y) {
+  /* 0**0 is 1; any base that compares >= 0 is delegated to powr. */
+  if (x == 0.f && y == 0.f) return 1.f;
+  if (x >= 0.f) return powr(x, y);
+  /* The remaining bases are only handled when y is a whole number,
+   * i.e. when it survives a round trip through int unchanged. */
+  const int whole = y;
+  if ((float)whole != y)
+    return NAN;
+  return pown(x, whole);
+}
+
+OVERLOADABLE float rootn(float x, int n) {
+  /* n-th root of x, computed as pow(|x|, 1/n) with the sign
+   * put back for a negative x and odd n. */
+  const int n_is_odd = n & 1;
+  float magnitude, root;
+  /* n == 0 has no defined root. */
+  if( n == 0 )
+    return NAN;
+  /* rootn ( x, n )  returns a NaN for x < 0 and n is even. */
+  if( x < 0 && !n_is_odd )
+    return NAN;
+  if( x == 0.0 ){
+    if( n > 0 ) {
+      /* rootn ( +-0,  n ) is +-0 for odd n > 0 and +0 for even n > 0. */
+      if( n_is_odd )
+        return x;
+      return 0.0f;
+    } else {
+      /* rootn ( +-0,  n ) is +-inf for odd n < 0 and +inf for even n < 0. */
+      if( n_is_odd )
+        return __gen_ocl_internal_copysign(INFINITY, x);
+      return INFINITY;
+    }
+  }
+  /* Work on |x|; which pow variant runs depends on the fast-path flag. */
+  magnitude = __gen_ocl_fabs(x);
+  if (__ocl_math_fastpath_flag)
+    root = __gen_ocl_pow(magnitude, 1.f/n);
+  else
+    root = __gen_ocl_internal_pow(magnitude,1.f/n);
+  /* A negative x reaches this point only together with an odd n. */
+  if(x <0.0f && n_is_odd)
+    root = -root;
+  return root;
+}
+
+OVERLOADABLE float fabs(float x) { /* public fabs overload for float */
+  return __gen_ocl_internal_fabs(x); /* plain forward */
+}
+
+OVERLOADABLE float trunc(float x) { /* public trunc overload for float */
+  return  __gen_ocl_internal_trunc(x); /* plain forward */
+}
+
+OVERLOADABLE float round(float x) { /* public round overload for float */
+  return __gen_ocl_internal_round(x); /* plain forward */
+}
+
+OVERLOADABLE float floor(float x) { /* public floor overload for float */
+  return __gen_ocl_internal_floor(x); /* plain forward */
+}
+
+OVERLOADABLE float ceil(float x) { /* public ceil overload for float */
+  return __gen_ocl_internal_ceil(x); /* plain forward */
+}
+
+OVERLOADABLE float log(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_log(x)
+    : __gen_ocl_internal_log(x);
+}
+
+OVERLOADABLE float log2(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_log2(x)
+    : __gen_ocl_internal_log2(x);
+}
+
+OVERLOADABLE float log10(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_log10(x)
+    : __gen_ocl_internal_log10(x);
+}
+
+OVERLOADABLE float exp(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_exp(x)
+    : __gen_ocl_internal_exp(x);
+}
+
+OVERLOADABLE float exp2(float x) { /* public exp2 overload for float */
+  return native_exp2(x); /* always uses native_exp2, whatever the fast-path flag says */
+}
+
+OVERLOADABLE float exp10(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_exp10(x)
+    : __gen_ocl_internal_exp10(x);
+}
+
+OVERLOADABLE float expm1(float x) {
+  /* Fast-path kernel when __ocl_math_fastpath_flag is set, regular implementation otherwise. */
+  return __ocl_math_fastpath_flag
+    ? __gen_ocl_internal_fastpath_expm1(x)
+    : __gen_ocl_internal_expm1(x);
+}
+
+OVERLOADABLE float fmin(float a, float b) { /* public fmin overload for float */
+  return __gen_ocl_internal_fmin(a, b); /* plain forward */
+}
+
+OVERLOADABLE float fmax(float a, float b) { /* public fmax overload for float */
+  return __gen_ocl_internal_fmax(a, b); /* plain forward */
+}
+
+OVERLOADABLE float fma(float a, float b, float c) { /* public fma overload for float */
+  return mad(a, b, c); /* fma is implemented by calling mad */
+}
+
+OVERLOADABLE float fdim(float x, float y) { /* public fdim overload for float */
+  return __gen_ocl_internal_fdim(x, y); /* plain forward */
+}
+
+OVERLOADABLE float maxmag(float x, float y) { /* public maxmag overload for float */
+  return __gen_ocl_internal_maxmag(x, y); /* plain forward */
+}
+
+OVERLOADABLE float minmag(float x, float y) { /* public minmag overload for float */
+  return __gen_ocl_internal_minmag(x, y); /* plain forward */
+}
diff --git a/backend/src/libocl/tmpl/ocl_math.tmpl.h b/backend/src/libocl/tmpl/ocl_math.tmpl.h
new file mode 100644
index 0000000..69ee3f3
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_math.tmpl.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_MATH_H__
+#define __OCL_MATH_H__
+
+#include "ocl_types.h"
+
+OVERLOADABLE float cospi(float x);
+OVERLOADABLE float cosh(float x);
+OVERLOADABLE float acos(float x);
+OVERLOADABLE float acospi(float x);
+OVERLOADABLE float acosh(float x);
+OVERLOADABLE float sinpi(float x);
+OVERLOADABLE float sinh(float x);
+OVERLOADABLE float asin(float x);
+OVERLOADABLE float asinpi(float x);
+OVERLOADABLE float asinh(float x);
+OVERLOADABLE float tanpi(float x);
+OVERLOADABLE float tanh(float x);
+OVERLOADABLE float atan(float x);
+OVERLOADABLE float atan2(float y, float x);
+OVERLOADABLE float atan2pi(float y, float x);
+OVERLOADABLE float atanpi(float x);
+OVERLOADABLE float atanh(float x);
+OVERLOADABLE float cbrt(float x);
+OVERLOADABLE float rint(float x);
+OVERLOADABLE float copysign(float x, float y);
+OVERLOADABLE float erf(float x);
+OVERLOADABLE float erfc(float x);
+OVERLOADABLE float fmod (float x, float y);
+OVERLOADABLE float remainder(float x, float p);
+OVERLOADABLE float ldexp(float x, int n);
+OVERLOADABLE float powr(float x, float y);
+OVERLOADABLE float pow(float x, float y);
+//pow is declared above; ocl_math.tmpl.cl implements it on top of powr and pown
+OVERLOADABLE float fabs(float x);
+OVERLOADABLE float trunc(float x);
+OVERLOADABLE float round(float x);
+OVERLOADABLE float floor(float x);
+OVERLOADABLE float ceil(float x);
+OVERLOADABLE float log(float x);
+OVERLOADABLE float log2(float x);
+OVERLOADABLE float log10(float x);
+OVERLOADABLE float exp(float x);
+OVERLOADABLE float exp10(float x);
+OVERLOADABLE float expm1(float x);
+OVERLOADABLE float fmin(float a, float b);
+OVERLOADABLE float fmax(float a, float b);
+OVERLOADABLE float fma(float a, float b, float c);
+OVERLOADABLE float fdim(float x, float y);
+OVERLOADABLE float maxmag(float x, float y);
+OVERLOADABLE float minmag(float x, float y);
+OVERLOADABLE float exp2(float x);
+OVERLOADABLE float mad(float a, float b, float c);
+OVERLOADABLE float sin(float x);
+OVERLOADABLE float cos(float x);
+OVERLOADABLE float tan(float x);
+OVERLOADABLE float tgamma(float x);
+OVERLOADABLE float lgamma(float x);
+OVERLOADABLE float lgamma_r(float x, global int *signgamp);
+OVERLOADABLE float lgamma_r(float x, local int *signgamp);
+OVERLOADABLE float lgamma_r(float x, private int *signgamp);
+OVERLOADABLE float log1p(float x);
+OVERLOADABLE float logb(float x);
+OVERLOADABLE int ilogb(float x);
+OVERLOADABLE float nan(uint code);
+OVERLOADABLE float sincos(float x, global float *cosval);
+OVERLOADABLE float sincos(float x, local float *cosval);
+OVERLOADABLE float sincos(float x, private float *cosval);
+OVERLOADABLE float sqrt(float x);
+OVERLOADABLE float rsqrt(float x);
+OVERLOADABLE float frexp(float x, global int *exp);
+OVERLOADABLE float frexp(float x, local int *exp);
+OVERLOADABLE float frexp(float x, private int *exp);
+OVERLOADABLE float nextafter(float x, float y);
+OVERLOADABLE float modf(float x, global float *i);
+OVERLOADABLE float modf(float x, local float *i);
+OVERLOADABLE float modf(float x, private float *i);
+OVERLOADABLE float hypot(float x, float y);
+OVERLOADABLE float fract(float x, global float *p);
+OVERLOADABLE float fract(float x, local float *p);
+OVERLOADABLE float fract(float x, private float *p);
+OVERLOADABLE float remquo(float x, float y, global int *quo);
+OVERLOADABLE float remquo(float x, float y, local int *quo);
+OVERLOADABLE float remquo(float x, float y, private int *quo);
+OVERLOADABLE float pown(float x, int n);
+OVERLOADABLE float rootn(float x, int n);
+
+// native
+OVERLOADABLE float native_cos(float x);
+OVERLOADABLE float native_divide(float x, float y);
+OVERLOADABLE float native_exp(float x);
+OVERLOADABLE float native_exp2(float x);
+OVERLOADABLE float native_exp10(float x);
+OVERLOADABLE float native_log(float x);
+OVERLOADABLE float native_log2(float x);
+OVERLOADABLE float native_log10(float x);
+OVERLOADABLE float native_powr(float x, float y);
+OVERLOADABLE float native_recip(float x);
+OVERLOADABLE float native_rsqrt(float x);
+OVERLOADABLE float native_sin(float x);
+OVERLOADABLE float native_sqrt(float x);
+OVERLOADABLE float native_tan(float x);
+
+// half  not supported now.
diff --git a/backend/src/libocl/tmpl/ocl_relational.tmpl.cl b/backend/src/libocl/tmpl/ocl_relational.tmpl.cl
new file mode 100644
index 0000000..1100815
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_relational.tmpl.cl
@@ -0,0 +1,167 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#include "ocl_relational.h"
+
+OVERLOADABLE int isequal(float x, float y) {  // 1 when x == y, otherwise 0
+  return x == y;
+}
+
+OVERLOADABLE int isnotequal(float x, float y) {  // 1 when x != y, otherwise 0
+  return x != y;
+}
+
+OVERLOADABLE int isgreater(float x, float y) {  // 1 when x > y, otherwise 0
+  return x > y;
+}
+
+OVERLOADABLE int isgreaterequal(float x, float y) {  // 1 when x >= y, otherwise 0
+  return x >= y;
+}
+
+OVERLOADABLE int isless(float x, float y) {  // 1 when x < y, otherwise 0
+  return x < y;
+}
+
+OVERLOADABLE int islessequal(float x, float y) {  // 1 when x <= y, otherwise 0
+  return x <= y;
+}
+
+OVERLOADABLE int islessgreater(float x, float y) {  // 1 when x < y or x > y, otherwise 0
+  return (x < y) || (x > y);
+}
+
+OVERLOADABLE int isfinite(float x) {
+  union { uint u; float f; } bits;
+  bits.f = x;  /* view the float as its raw 32-bit pattern */
+  return 0x7F800000 > (bits.u & 0x7FFFFFFF);  /* sign cleared: below the all-ones exponent */
+}
+
+OVERLOADABLE int isinf(float x) {
+  union { uint u; float f; } bits;
+  bits.f = x;  /* view the float as its raw 32-bit pattern */
+  return 0x7F800000 == (bits.u & 0x7FFFFFFF);  /* sign cleared: all-ones exponent, zero mantissa */
+}
+
+OVERLOADABLE int isnan(float x) {  // 1 when x compares unequal to itself, otherwise 0
+  return x != x;
+}
+
+OVERLOADABLE int isnormal(float x) {
+  union { uint u; float f; } bits;
+  bits.f = x;
+  const uint mag = bits.u & 0x7FFFFFFF;  /* raw pattern with the sign bit cleared */
+  return (mag >= 0x800000) && (mag < 0x7F800000);  /* exponent field neither zero nor all ones */
+}
+
+
+OVERLOADABLE int isordered(float x, float y) {
+  return (x == x) && (y == y);  /* each operand compares equal to itself */
+}
+OVERLOADABLE int isunordered(float x, float y) {
+  return (x != x) || (y != y);  /* at least one operand compares unequal to itself */
+}
+OVERLOADABLE int signbit(float x) {
+  union { uint u; float f; } bits;
+  bits.f = x;  /* view the float as its raw 32-bit pattern */
+  return (bits.u & 0x80000000u) != 0;  /* 1 when the top bit is set, otherwise 0 */
+}
+
+
+// any: 1 when at least one component of a is negative (most significant bit set). No ';' after the macro uses: each one is a whole function definition.
+#define DEC1(type) OVERLOADABLE int any(type a) { return a<0; }
+#define DEC2(type) OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
+#define DEC3(type) OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0; }
+#define DEC4(type) OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0; }
+#define DEC8(type) OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0; }
+#define DEC16(type) OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0 || a.s8<0 || a.s9<0 || a.sA<0 || a.sB<0 || a.sC<0 || a.sD<0 || a.sE<0 || a.sF<0; }
+DEC1(char)
+DEC1(short)
+DEC1(int)
+DEC1(long)
+#define DEC(n) DEC##n(char##n) DEC##n(short##n) DEC##n(int##n) DEC##n(long##n)
+DEC(2)
+DEC(3)
+DEC(4)
+DEC(8)
+DEC(16)
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+// all: 1 only when every component of a is negative (most significant bit set). No ';' after the macro uses: each one is a whole function definition.
+#define DEC1(type) OVERLOADABLE int all(type a) { return a<0; }
+#define DEC2(type) OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0; }
+#define DEC3(type) OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0; }
+#define DEC4(type) OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0; }
+#define DEC8(type) OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0; }
+#define DEC16(type) OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0 && a.s8<0 && a.s9<0 && a.sA<0 && a.sB<0 && a.sC<0 && a.sD<0 && a.sE<0 && a.sF<0; }
+DEC1(char)
+DEC1(short)
+DEC1(int)
+DEC1(long)
+#define DEC(n) DEC##n(char##n) DEC##n(short##n) DEC##n(int##n) DEC##n(long##n)
+DEC(2)
+DEC(3)
+DEC(4)
+DEC(8)
+DEC(16)
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+// bitselect: each result bit is taken from b where the matching bit of c is set, otherwise from a.
+#define DEF(type) OVERLOADABLE type bitselect(type a, type b, type c) { return (a & ~c) | (b & c); }
+DEF(char) DEF(uchar) DEF(short) DEF(ushort) DEF(int) DEF(uint) DEF(long) DEF(ulong)
+#undef DEF
+OVERLOADABLE float bitselect(float a, float b, float c) {
+  return as_float((as_int(a) & ~as_int(c)) | (as_int(b) & as_int(c)));  /* same bit mix, on the raw patterns */
+}
+
+
+// select (scalar forms): returns src1 when cond is nonzero, otherwise src0
+#define DEF(TYPE1, TYPE2) \
+OVERLOADABLE TYPE1 select(TYPE1 src0, TYPE1 src1, TYPE2 cond) { \
+  return cond ? src1 : src0; \
+}
+DEF(char, char)
+DEF(char, uchar)
+DEF(uchar, char)
+DEF(uchar, uchar)
+DEF(short, short)
+DEF(short, ushort)
+DEF(ushort, short)
+DEF(ushort, ushort)
+DEF(int, int)
+DEF(int, uint)
+DEF(uint, int)
+DEF(uint, uint)
+DEF(long, long)
+DEF(long, ulong)
+DEF(ulong, long)
+DEF(ulong, ulong)
+DEF(float, int)
+DEF(float, uint)
+#undef DEF
diff --git a/backend/src/libocl/tmpl/ocl_relational.tmpl.h b/backend/src/libocl/tmpl/ocl_relational.tmpl.h
new file mode 100644
index 0000000..9921317
--- /dev/null
+++ b/backend/src/libocl/tmpl/ocl_relational.tmpl.h
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2012 - 2014 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+#ifndef __OCL_RELATIONAL_H__
+#define __OCL_RELATIONAL_H__
+
+#include "ocl_types.h"
+#include "ocl_as.h"
+
+OVERLOADABLE int isequal(float x, float y);
+OVERLOADABLE int isnotequal(float x, float y);
+OVERLOADABLE int isgreater(float x, float y);
+OVERLOADABLE int isgreaterequal(float x, float y);
+OVERLOADABLE int isless(float x, float y);
+OVERLOADABLE int islessequal(float x, float y);
+OVERLOADABLE int islessgreater(float x, float y);
+
+OVERLOADABLE int isfinite(float x);
+OVERLOADABLE int isinf(float x);
+OVERLOADABLE int isnan(float x);
+OVERLOADABLE int isnormal(float x);
+
+
+OVERLOADABLE int isordered(float x, float y);
+OVERLOADABLE int isunordered(float x, float y);
+OVERLOADABLE int signbit(float x);
+
+// any: declared for signed scalar and 2/3/4/8/16-wide vector integer types. The macros already end in ';', so the uses add none.
+#define DEC1(type) OVERLOADABLE int any(type a);
+#define DEC2(type) OVERLOADABLE int any(type a);
+#define DEC3(type) OVERLOADABLE int any(type a);
+#define DEC4(type) OVERLOADABLE int any(type a);
+#define DEC8(type) OVERLOADABLE int any(type a);
+#define DEC16(type) OVERLOADABLE int any(type a);
+DEC1(char)
+DEC1(short)
+DEC1(int)
+DEC1(long)
+#define DEC(n) DEC##n(char##n) DEC##n(short##n) DEC##n(int##n) DEC##n(long##n)
+DEC(2)
+DEC(3)
+DEC(4)
+DEC(8)
+DEC(16)
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+// all: declared for signed scalar and 2/3/4/8/16-wide vector integer types; each macro below expands to one prototype
+#define DEC1(type) OVERLOADABLE int all(type a);
+#define DEC2(type) OVERLOADABLE int all(type a);
+#define DEC3(type) OVERLOADABLE int all(type a);
+#define DEC4(type) OVERLOADABLE int all(type a);
+#define DEC8(type) OVERLOADABLE int all(type a);
+#define DEC16(type) OVERLOADABLE int all(type a);
+DEC1(char)
+DEC1(short)
+DEC1(int)
+DEC1(long)
+#define DEC(n) DEC##n(char##n) DEC##n(short##n) DEC##n(int##n) DEC##n(long##n)
+DEC(2)
+DEC(3)
+DEC(4)
+DEC(8)
+DEC(16)
+#undef DEC
+#undef DEC1
+#undef DEC2
+#undef DEC3
+#undef DEC4
+#undef DEC8
+#undef DEC16
+
+#define DEF(type) OVERLOADABLE type bitselect(type a, type b, type c);
+DEF(char) DEF(uchar) DEF(short) DEF(ushort) DEF(int) DEF(uint)
+DEF(long) DEF(ulong)
+#undef DEF
+OVERLOADABLE float bitselect(float a, float b, float c);
+
+
+#define DEF(TYPE1, TYPE2) \
+OVERLOADABLE TYPE1 select(TYPE1 src0, TYPE1 src1, TYPE2 cond);
+DEF(char, char)
+DEF(char, uchar)
+DEF(uchar, char)
+DEF(uchar, uchar)
+DEF(short, short)
+DEF(short, ushort)
+DEF(ushort, short)
+DEF(ushort, ushort)
+DEF(int, int)
+DEF(int, uint)
+DEF(uint, int)
+DEF(uint, uint)
+DEF(long, long)
+DEF(long, ulong)
+DEF(ulong, long)
+DEF(ulong, ulong)
+DEF(float, int)
+DEF(float, uint)
+#undef DEF
diff --git a/backend/src/llvm/llvm_barrier_nodup.cpp b/backend/src/llvm/llvm_barrier_nodup.cpp
index 791df00..19deafc 100644
--- a/backend/src/llvm/llvm_barrier_nodup.cpp
+++ b/backend/src/llvm/llvm_barrier_nodup.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/llvm/llvm_bitcode_link.cpp b/backend/src/llvm/llvm_bitcode_link.cpp
new file mode 100644
index 0000000..f5e9f81
--- /dev/null
+++ b/backend/src/llvm/llvm_bitcode_link.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <iostream>
+#include <sstream>
+#include <set>
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/PassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "sys/cvar.hpp"
+#include "src/GBEConfig.h"
+#include "llvm/llvm_gen_backend.hpp"
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/Linker/Linker.h"
+#else
+#include "llvm/Linker.h"
+#endif
+
+using namespace llvm;
+
+SVAR(OCL_BITCODE_LIB_PATH, OCL_BITCODE_BIN);
+
+namespace gbe
+{
+  /* Lazily load the beignet OpenCL bitcode library into ctx from the first readable file in
+     the ':'-separated OCL_BITCODE_LIB_PATH; with strictMath, set __ocl_math_fastpath_flag to 0.
+     Failures are reported and return NULL instead of relying on assert(), which NDEBUG removes. */
+  static Module* createOclBitCodeModule(LLVMContext& ctx, bool strictMath)
+  {
+    std::string bitCodeFiles = OCL_BITCODE_LIB_PATH;
+    std::istringstream bitCodeFilePath(bitCodeFiles);
+    std::string FilePath;
+    bool findBC = false;
+    SMDiagnostic Err;
+
+    // Stop at the first readable candidate; FilePath then names it.
+    while (!findBC && std::getline(bitCodeFilePath, FilePath, ':'))
+      findBC = (access(FilePath.c_str(), R_OK) == 0);
+
+    Module* oclLib = findBC ? getLazyIRFileModule(FilePath, Err, ctx) : NULL;
+    if (!oclLib) {
+      printf("Fatal Error: ocl lib can not be opened\n");
+      return NULL;
+    }
+
+    if (strictMath) {
+      llvm::GlobalVariable* mathFastFlag = oclLib->getGlobalVariable("__ocl_math_fastpath_flag");
+      if (!mathFastFlag) {
+        printf("Fatal Error: ocl lib has no __ocl_math_fastpath_flag\n");
+        delete oclLib;
+        return NULL;
+      }
+      mathFastFlag->setInitializer(ConstantInt::get(IntegerType::get(ctx, 32), 0));
+    }
+    return oclLib;
+  }
+
+  /* Walk every call instruction of KF and, recursively, of its callees, and
+     materialize the callees that are defined in the lazily loaded module lib.
+     A callee missing from lib is looked up in src and is only traversed. MFS
+     records the names already visited, so each callee is handled once and
+     call cycles terminate. Returns false, after printing a message, when a
+     callee is in neither module or fails to materialize. */
+  static bool materializedFuncCall(Module& src, Module& lib, llvm::Function &KF, std::set<std::string>& MFS)
+  {
+    for (llvm::Function::iterator bb = KF.begin(), bbEnd = KF.end(); bb != bbEnd; ++bb) {
+      for (BasicBlock::iterator it = bb->begin(), itEnd = bb->end(); it != itEnd; ++it) {
+        llvm::CallInst* call = dyn_cast<llvm::CallInst>(it);
+        if (!call)
+          continue;
+
+        // Calls to intrinsics are skipped.
+        llvm::Function *direct = call->getCalledFunction();
+        if (direct && direct->getIntrinsicID() != 0)
+          continue;
+
+        const std::string fnName = call->getCalledValue()->getName();
+        if (!MFS.insert(fnName).second)
+          continue;  // this name was handled before
+
+        // Prefer the definition in lib; fall back to the one in src.
+        llvm::Function *callee = lib.getFunction(fnName);
+        const bool fromLib = (callee != NULL);
+        if (!fromLib)
+          callee = src.getFunction(fnName);
+        if (!callee) {
+          printf("Can not find the lib: %s\n", fnName.c_str());
+          return false;
+        }
+
+        if (fromLib && callee->isMaterializable()) {
+          std::string ErrInfo;
+          if (callee->Materialize(&ErrInfo)) {
+            printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+            return false;
+          }
+        }
+
+        // Continue with the calls made by the callee itself.
+        if (!materializedFuncCall(src, lib, *callee, MFS))
+          return false;
+      }
+    }
+
+    return true;
+  }
+
+
+  /* Link the kernel module mod against the beignet OpenCL bitcode library.
+   *
+   * The library is loaded lazily; every function reachable from a kernel of mod,
+   * plus the memcpy/memset helpers listed below, is materialized in it. mod is
+   * then linked into the library (Linker::DestroySource), all symbols except the
+   * kernels and those helpers are internalized, and global DCE is run.
+   *
+   * Returns the library module with mod linked in, or NULL after printing a
+   * message when the library cannot be loaded, mod contains no kernel, a callee
+   * cannot be found or materialized, or the link step fails.
+   */
+  Module* runBitCodeLinker(Module *mod, bool strictMath)
+  {
+    /* The memset and memcpy functions, followed by their _align variants. */
+    static const char * const builtinFuncs[] = {
+      "__gen_memcpy_gg", "__gen_memcpy_gp", "__gen_memcpy_gl",
+      "__gen_memcpy_pg", "__gen_memcpy_pp", "__gen_memcpy_pl",
+      "__gen_memcpy_lg", "__gen_memcpy_lp", "__gen_memcpy_ll",
+      "__gen_memset_p", "__gen_memset_g", "__gen_memset_l",
+      "__gen_memcpy_gg_align", "__gen_memcpy_gp_align", "__gen_memcpy_gl_align",
+      "__gen_memcpy_pg_align", "__gen_memcpy_pp_align", "__gen_memcpy_pl_align",
+      "__gen_memcpy_lg_align", "__gen_memcpy_lp_align", "__gen_memcpy_ll_align",
+      "__gen_memset_p_align", "__gen_memset_g_align", "__gen_memset_l_align",
+    };
+    const size_t builtinNum = sizeof(builtinFuncs) / sizeof(builtinFuncs[0]);
+
+    LLVMContext& ctx = mod->getContext();
+    std::set<std::string> materializedFuncs;
+    std::vector<const char *> kernels;  /* names that must stay externally visible */
+
+    /* A NULL library has already been reported by createOclBitCodeModule().
+       Check it explicitly: an assert() is compiled out under NDEBUG and the
+       pointer would then be dereferenced below. */
+    Module* clonedLib = createOclBitCodeModule(ctx, strictMath);
+    if (!clonedLib)
+      return NULL;
+
+    /* Pull in everything the kernels of mod call. */
+    for (Module::iterator SF = mod->begin(), E = mod->end(); SF != E; ++SF) {
+      if (SF->isDeclaration()) continue;
+      if (!isKernelFunction(*SF)) continue;
+      kernels.push_back(SF->getName().data());
+
+      if (!materializedFuncCall(*mod, *clonedLib, *SF, materializedFuncs)) {
+        delete clonedLib;
+        return NULL;
+      }
+    }
+
+    if (kernels.empty()) {
+      printf("One module without kernel function!\n");
+      delete clonedLib;
+      return NULL;
+    }
+
+    /* Helpers not already reached from a kernel are materialized and preserved. */
+    for (size_t i = 0; i < builtinNum; ++i) {
+      const char *f = builtinFuncs[i];
+      const std::string fnName(f);
+      if (!materializedFuncs.insert(fnName).second) {
+        continue;
+      }
+
+      llvm::Function *newMF = clonedLib->getFunction(fnName);
+      if (!newMF) {
+        printf("Can not find the function: %s\n", fnName.c_str());
+        delete clonedLib;
+        return NULL;
+      }
+      std::string ErrInfo;
+      if (newMF->isMaterializable() && newMF->Materialize(&ErrInfo)) {
+        printf("Can not materialize the function: %s, because %s\n", fnName.c_str(), ErrInfo.c_str());
+        delete clonedLib;
+        return NULL;
+      }
+
+      if (!materializedFuncCall(*mod, *clonedLib, *newMF, materializedFuncs)) {
+        delete clonedLib;
+        return NULL;
+      }
+
+      kernels.push_back(f);
+    }
+
+    /* We use beignet's bitcode as dst because it will have a lot of
+       lazy functions which will not be loaded. */
+    std::string errorMsg;
+    if(Linker::LinkModules(clonedLib, mod, Linker::DestroySource, &errorMsg)) {
+      delete clonedLib;
+      printf("Fatal Error: link the bitcode error:\n%s\n", errorMsg.c_str());
+      return NULL;
+    }
+
+    /* Internalize everything but the preserved names, then drop dead globals. */
+    llvm::PassManager passes;
+    passes.add(createInternalizePass(kernels));
+    passes.add(createGlobalDCEPass());
+    passes.run(*clonedLib);
+
+    return clonedLib;
+  }
+
+} // end namespace
diff --git a/backend/src/llvm/llvm_gen_backend.cpp b/backend/src/llvm/llvm_gen_backend.cpp
index 6cb3834..558491f 100644
--- a/backend/src/llvm/llvm_gen_backend.cpp
+++ b/backend/src/llvm/llvm_gen_backend.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -112,6 +112,7 @@
 #include "llvm/Target/Mangler.h"
 #endif
 
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -365,6 +366,18 @@ namespace gbe
             return this->_newScalar(value, key, elementType, index, uniform);
           break;
         }
+        case Type::StructTyID:
+        {
+          // Only a struct member of integer, float or double type is accepted; anything else asserts.
+          auto structType = cast<StructType>(type);
+          auto elementType = structType->getElementType(index);
+          auto elementTypeID = elementType->getTypeID();
+          if (elementTypeID != Type::IntegerTyID &&
+              elementTypeID != Type::FloatTyID &&
+              elementTypeID != Type::DoubleTyID)
+            GBE_ASSERTM(false, "Structs of elements are not supported");
+          return this->_newScalar(value, key, elementType, index, uniform);
+        }
         default: NOT_SUPPORTED;
       };
       return ir::Register();
@@ -530,6 +543,8 @@ namespace gbe
     void allocateGlobalVariableRegister(Function &F);
     /*! gather all the loops in the function and add them to ir::Function */
     void gatherLoopInfo(ir::Function &fn);
+    /*! do topological sorting of basicblocks */
+    void sortBasicBlock(Function &F);
     /*! Emit the complete function code and declaration */
     void emitFunction(Function &F);
     /*! Handle input and output function parameters */
@@ -586,6 +601,7 @@ namespace gbe
     DECL_VISIT_FN(FCmpInst, FCmpInst);
     DECL_VISIT_FN(InsertElement, InsertElementInst);
     DECL_VISIT_FN(ExtractElement, ExtractElementInst);
+    DECL_VISIT_FN(ExtractValue, ExtractValueInst);
     DECL_VISIT_FN(ShuffleVectorInst, ShuffleVectorInst);
     DECL_VISIT_FN(SelectInst, SelectInst);
     DECL_VISIT_FN(BranchInst, BranchInst);
@@ -613,14 +629,14 @@ namespace gbe
     void visitUnreachableInst(UnreachableInst &I) {NOT_SUPPORTED;}
     void visitGetElementPtrInst(GetElementPtrInst &I) {NOT_SUPPORTED;}
     void visitInsertValueInst(InsertValueInst &I) {NOT_SUPPORTED;}
-    void visitExtractValueInst(ExtractValueInst &I) {NOT_SUPPORTED;}
     template <bool isLoad, typename T> void visitLoadOrStore(T &I);
 
     INLINE void gatherBTI(Value *pointer, ir::BTI &bti);
     // batch vec4/8/16 load/store
     INLINE void emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                   Value *llvmValue, const ir::Register ptr,
-                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti);
+                  const ir::AddressSpace addrSpace, Type * elemType, bool isLoad, ir::BTI bti,
+                  bool dwAligned);
     void visitInstruction(Instruction &I) {NOT_SUPPORTED;}
     private:
       ir::ImmediateIndex processConstantImmIndexImpl(Constant *CPV, int32_t index = 0u);
@@ -781,7 +797,7 @@ namespace gbe
       return ctx.newImmediate(data);
     } else {
       vector<P> array;
-      for(int i = 0; i < seq->getNumElements(); i++)
+      for(uint32_t i = 0; i < seq->getNumElements(); i++)
         array.push_back(GET_EFFECT_DATA(seq, i, tid));
       return ctx.newImmediate((T*)&array[0], array.size());
     }
@@ -941,7 +957,7 @@ namespace gbe
       ir::Type type = getType(ctx, ce->getType());
       switch (ce->getOpcode()) {
         default:
-          //ce->dump();
+          ce->dump();
           GBE_ASSERT(0 && "unsupported ce opcode.\n");
         case Instruction::Trunc:
         {
@@ -955,6 +971,23 @@ namespace gbe
             return immIndex;
           return ctx.processImm(ir::IMM_BITCAST, immIndex, type);
         }
+        case Instruction::FPToUI:
+        case Instruction::FPToSI:
+        case Instruction::SIToFP:
+        case Instruction::UIToFP:
+        {
+          const ir::ImmediateIndex immIndex = processConstantImmIndex(ce->getOperand(0), -1);
+          switch (ce->getOpcode()) {
+            default:
+              GBE_ASSERT(0);
+            case Instruction::FPToUI: return ctx.processImm(ir::IMM_FPTOUI, immIndex, type);
+            case Instruction::FPToSI: return ctx.processImm(ir::IMM_FPTOSI, immIndex, type);
+            case Instruction::SIToFP: return ctx.processImm(ir::IMM_SITOFP, immIndex, type);
+            case Instruction::UIToFP: return ctx.processImm(ir::IMM_UITOFP, immIndex, type);
+          }
+        }
+        case Instruction::FCmp:
+        case Instruction::ICmp:
         case Instruction::Add:
         case Instruction::Sub:
         case Instruction::Mul:
@@ -994,7 +1027,35 @@ namespace gbe
             return ctx.processImm(ir::IMM_OR, lhs, rhs, type);
           case Instruction::Xor:
             return ctx.processImm(ir::IMM_XOR, lhs, rhs, type);
+          case Instruction::FCmp:
+          case Instruction::ICmp:
+            switch (ce->getPredicate()) {
+              default:
+                NOT_SUPPORTED;
+              case ICmpInst::ICMP_EQ:
+              case ICmpInst::FCMP_OEQ: return ctx.processImm(ir::IMM_OEQ, lhs, rhs, type);
+              case ICmpInst::ICMP_NE:
+              case ICmpInst::FCMP_ONE: return ctx.processImm(ir::IMM_ONE, lhs, rhs, type);
+              case ICmpInst::ICMP_ULE:
+              case ICmpInst::ICMP_SLE:
+              case ICmpInst::FCMP_OLE: return ctx.processImm(ir::IMM_OLE, lhs, rhs, type);
+              case ICmpInst::ICMP_UGE:
+              case ICmpInst::ICMP_SGE:
+              case ICmpInst::FCMP_OGE: return ctx.processImm(ir::IMM_OGE, lhs, rhs, type);
+              case ICmpInst::ICMP_ULT:
+              case ICmpInst::ICMP_SLT:
+              case ICmpInst::FCMP_OLT: return ctx.processImm(ir::IMM_OLT, lhs, rhs, type);
+              case ICmpInst::ICMP_UGT:
+              case ICmpInst::ICMP_SGT:
+              case ICmpInst::FCMP_OGT: return ctx.processImm(ir::IMM_OGT, lhs, rhs, type);
+              case ICmpInst::FCMP_ORD: return ctx.processImm(ir::IMM_ORD, lhs, rhs, type);
+              case ICmpInst::FCMP_TRUE:
+                Value *cv = ConstantInt::get(ce->getType(), 1);
+                return ctx.newImmediate(cv);
+            }
+            break;
           }
+          break;
         }
       }
     }
@@ -1029,6 +1090,14 @@ namespace gbe
           regTranslator.newScalar(value, key, elemID, uniform);
         break;
       }
+      case Type::StructTyID:
+      {
+        auto structType = cast<StructType>(type);
+        const uint32_t elemNum = structType->getNumElements();
+        for (uint32_t elemID = 0; elemID < elemNum; ++elemID)
+          regTranslator.newScalar(value, key, elemID, uniform);
+        break;
+      }
       default: NOT_SUPPORTED;
     };
   }
@@ -1103,8 +1172,10 @@ namespace gbe
       ctx.ADD(ir::Type::TYPE_S32, reg, pointer_reg, offset_reg);
       return reg;
     }
-    else
-      assert(0);
+    else {
+      GBE_ASSERT(0 && "Unsupported constant expression");
+      return regTranslator.getScalar(val, elemID);
+    }
   }
 
   ir::Register GenWriter::getConstantRegister(Constant *c, uint32_t elemID) {
@@ -1376,7 +1447,7 @@ namespace gbe
           const uint32_t elemSize = getTypeByteSize(unit, elemType);
           const uint32_t elemNum = vectorType->getNumElements();
           //vector's elemType always scalar type
-          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, elemNum*elemSize, getAlignmentByte(unit, type), 0);
+          ctx.input(argName, ir::FunctionArgument::VALUE, reg, llvmInfo, getTypeByteSize(unit, type), getAlignmentByte(unit, type), 0);
 
           ir::Function& fn = ctx.getFunction();
           for(uint32_t i=1; i < elemNum; i++) {
@@ -1870,7 +1941,53 @@ namespace gbe
       fn.addLoop(loopBBs, loopExits);
     }
   }
-
+/*!
+
+  Sorting Basic blocks is mainly used to solve register liveness issue, take a
+  look at below CFG:
+
+       -<--1--
+      |       |
+      |        ->2
+   -- 3 <---     |
+  |   ^     |     -->4--
+  |   |     |        |  |
+  |   |     -----5<--   |
+  |   |                 |
+  |    ----------6<-----
+  |
+   -->7
+
+  Suppose register %10 is defined in bb4 and used in bb5 and bb6. Ordinary
+  liveness analysis says %10 is not live in bb3. Under the SIMD execution
+  model, however, after bb4 some channels may go through bb5 to bb3 while
+  others go to bb6; bb3 must then be executed before bb6 so that no
+  instruction is missed. The physical register of %10 was written in bb4,
+  but because %10 is dead in bb3 its content may be overwritten there, and
+  bb6 would then read corrupted data when execution comes back to it.
+  To avoid this we sort the basic blocks topologically. In this example bb3
+  ends up after bb5 and bb6, so the ordinary liveness calculation is
+  correct as it stands.
+
+  Another advantage of sorting basic blocks is reducing register pressure.
+  In the above CFG, a register defined in bb3 and used in bb7 will be
+  alive through 3,4,5,6,7. But in fact it should be only alive in bb3 and bb7.
+  After topological sorting, this kind of register would be only alive in bb3
+  and bb7. Register pressure in 4,5,6 is reduced.
+*/
+
+  void GenWriter::sortBasicBlock(Function &F) {
+    // Relink the blocks of F in reverse post-order. Blocks the traversal does not
+    // visit are left where they are, ahead of the relinked ones.
+    typedef ReversePostOrderTraversal<Function*> RPOTType;
+    RPOTType order(&F);
+    Function::BasicBlockListType &blocks = F.getBasicBlockList();
+
+    for (RPOTType::rpo_iterator it = order.begin(), last = order.end(); it != last; ++it) {
+      (*it)->removeFromParent();
+      blocks.push_back(*it);
+    }
+  }
   void GenWriter::emitFunction(Function &F)
   {
     switch (F.getCallingConv()) {
@@ -1892,6 +2009,8 @@ namespace gbe
     this->emitFunctionPrototype(F);
 
     this->allocateGlobalVariableRegister(F);
+
+    sortBasicBlock(F);
     // Visit all the instructions and emit the IR registers or the value to
     // value mapping when a new register is not needed
     pass = PASS_EMIT_REGISTERS;
@@ -2324,6 +2443,15 @@ namespace gbe
   void GenWriter::emitExtractElement(ExtractElementInst &I) {
   }
 
+  void GenWriter::regAllocateExtractValue(ExtractValueInst &I) {
+    // Register one value proxy per extraction index between the aggregate operand and I.
+    for (const unsigned *idx = I.idx_begin(), *last = I.idx_end(); idx != last; ++idx)
+      regTranslator.newValueProxy(I.getAggregateOperand(), &I, *idx, 0);
+  }
+
+  void GenWriter::emitExtractValue(ExtractValueInst &I) {  // empty body: nothing is emitted for this instruction here
+  }
+
   void GenWriter::regAllocateShuffleVectorInst(ShuffleVectorInst &I) {}
   void GenWriter::emitShuffleVectorInst(ShuffleVectorInst &I) {}
 
@@ -2439,6 +2567,17 @@ namespace gbe
           case Intrinsic::dbg_value:
           case Intrinsic::dbg_declare:
           break;
+          case Intrinsic::sadd_with_overflow:
+          case Intrinsic::uadd_with_overflow:
+          case Intrinsic::ssub_with_overflow:
+          case Intrinsic::usub_with_overflow:
+          case Intrinsic::smul_with_overflow:
+          case Intrinsic::umul_with_overflow:
+            this->newRegister(&I);
+          break;
+          case Intrinsic::bswap:
+            this->newRegister(&I);
+          break;
           default:
           GBE_ASSERTM(false, "Unsupported intrinsics");
         }
@@ -2449,6 +2588,13 @@ namespace gbe
     // Get the name of the called function and handle it
     const std::string fnName = Callee->getName();
     auto it = instrinsicMap.map.find(fnName);
+    // FIXME, should create a complete error reporting mechanism
+    // when found error in beignet managed passes including Gen pass.
+    if (it == instrinsicMap.map.end()) {
+      std::cerr << "Unresolved symbol: " << fnName << std::endl;
+      std::cerr << "Aborting..." << std::endl;
+      exit(-1);
+    }
     GBE_ASSERT(it != instrinsicMap.map.end());
     switch (it->second) {
       case GEN_OCL_GET_GROUP_ID0:
@@ -2489,12 +2635,9 @@ namespace gbe
         regTranslator.newScalarProxy(ir::ocl::goffset2, dst); break;
       case GEN_OCL_GET_WORK_DIM:
         regTranslator.newScalarProxy(ir::ocl::workdim, dst); break;
-      case GEN_OCL_PRINTF_BUF_ADDR:
-        regTranslator.newScalarProxy(ir::ocl::printfbptr, dst); break;
-      case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
-        regTranslator.newScalarProxy(ir::ocl::printfiptr, dst); break;
       case GEN_OCL_FBH:
       case GEN_OCL_FBL:
+      case GEN_OCL_CBIT:
       case GEN_OCL_COS:
       case GEN_OCL_SIN:
       case GEN_OCL_SQR:
@@ -2647,6 +2790,8 @@ namespace gbe
       case GEN_OCL_CONV_F32_TO_F16:
       case GEN_OCL_SIMD_ANY:
       case GEN_OCL_SIMD_ALL:
+      case GEN_OCL_READ_TM:
+      case GEN_OCL_REGION:
         this->newRegister(&I);
         break;
       case GEN_OCL_PRINTF:
@@ -2739,7 +2884,6 @@ namespace gbe
             const ir::Register src2 = this->getRegister(I.getOperand(2));
             ctx.MUL(ir::TYPE_FLOAT, tmp, src0, src1);
             ctx.ADD(ir::TYPE_FLOAT, dst, tmp, src2);
-            break;
           }
           break;
           case Intrinsic::lifetime_start:
@@ -2750,6 +2894,132 @@ namespace gbe
           case Intrinsic::dbg_value:
           case Intrinsic::dbg_declare:
           break;
+          case Intrinsic::uadd_with_overflow:
+          {
+            Type *llvmDstType = I.getType();
+            GBE_ASSERT(llvmDstType->isStructTy());
+            ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
+            const ir::Register dst0  = this->getRegister(&I, 0);
+            const ir::Register src0 = this->getRegister(I.getOperand(0));
+            const ir::Register src1 = this->getRegister(I.getOperand(1));
+            ctx.ADD(dst0Type, dst0, src0, src1);
+
+            ir::Register overflow = this->getRegister(&I, 1);
+            ctx.LT(dst0Type, overflow, dst0, src1);
+          }
+          break;
+          case Intrinsic::usub_with_overflow:
+          {
+            Type *llvmDstType = I.getType();
+            GBE_ASSERT(llvmDstType->isStructTy());
+            ir::Type dst0Type = getType(ctx, llvmDstType->getStructElementType(0));
+            const ir::Register dst0  = this->getRegister(&I, 0);
+            const ir::Register src0 = this->getRegister(I.getOperand(0));
+            const ir::Register src1 = this->getRegister(I.getOperand(1));
+            ctx.SUB(dst0Type, dst0, src0, src1);
+
+            ir::Register overflow = this->getRegister(&I, 1);
+            const ir::Type unsignedType = makeTypeUnsigned(dst0Type);
+            ctx.GT(unsignedType, overflow, dst0, src0);
+          }
+          break;
+          case Intrinsic::sadd_with_overflow:
+          case Intrinsic::ssub_with_overflow:
+          case Intrinsic::smul_with_overflow:
+          case Intrinsic::umul_with_overflow:
+          NOT_IMPLEMENTED;
+          break;
+          case Intrinsic::bswap:
+          {
+            // FIXME, this is an unoptimized version, could be optimized by
+            // leveraging GEN's register region/indirect address feature.
+            Type *llvmDstType = I.getType();
+            uint32_t elementSize = getTypeByteSize(unit, llvmDstType);
+
+            const ir::Register dst0  = this->getRegister(&I);
+            const ir::Register src0 = this->getRegister(I.getOperand(0));
+            switch(elementSize)
+            {
+              case 2:
+                {
+                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
+                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
+
+                  ir::Register regWMask = ctx.reg( ir::FAMILY_WORD );
+                  const ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x00FF, ir::TYPE_S16);
+                  ir::Register regShift = ctx.reg( ir::FAMILY_WORD );
+                  const ir::ImmediateIndex shift = ctx.newIntegerImmediate(8, ir::TYPE_S16);
+
+                  ctx.LOADI(ir::TYPE_S16, regWMask, wMask);
+                  ctx.AND(srcType, tmp1, src0, regWMask);
+
+                  ctx.LOADI(ir::TYPE_S16, regShift, shift);
+                  ctx.SHL(srcType, tmp2, tmp1, regShift);
+
+                  ir::Register tmp3 = ctx.reg( getFamily(srcType) );
+                  ctx.SHR(srcType, tmp3, src0, regShift);
+
+                  ctx.OR(srcType, dst0, tmp2, tmp3);
+                }
+                break;
+              case 4:
+                {
+                  ir::Type srcType = getUnsignedType(ctx, llvmDstType);
+                  ir::Register tmp1 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp2 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp3 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp4 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp5 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp6 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp7 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp8 = ctx.reg(getFamily(srcType));
+
+                  ir::Register regDWMask = ctx.reg( ir::FAMILY_DWORD );
+                  ir::Register regShift = ctx.reg( ir::FAMILY_DWORD );
+                  ir::ImmediateIndex wMask = ctx.newIntegerImmediate(0x000000FF, ir::TYPE_S32);
+                  ir::ImmediateIndex shift = ctx.newIntegerImmediate(24, ir::TYPE_S32);
+                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
+                  ctx.AND(srcType, tmp1, src0, regDWMask);
+                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
+                  ctx.SHL(srcType, tmp2, tmp1, regShift);
+
+                  wMask = ctx.newIntegerImmediate(0x0000FF00, ir::TYPE_S32);
+                  shift = ctx.newIntegerImmediate(8, ir::TYPE_S32);
+                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
+                  ctx.AND(srcType, tmp3, src0, regDWMask);
+                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
+                  ctx.SHL(srcType, tmp4, tmp3, regShift);
+
+                  wMask = ctx.newIntegerImmediate(0x00FF0000, ir::TYPE_S32);
+                  shift = ctx.newIntegerImmediate(8, ir::TYPE_S32);
+                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
+                  ctx.AND(srcType, tmp5, src0, regDWMask);
+                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
+                  ctx.SHR(srcType, tmp6, tmp5, regShift);
+
+                  wMask = ctx.newIntegerImmediate(0xFF000000, ir::TYPE_S32);
+                  shift = ctx.newIntegerImmediate(24, ir::TYPE_S32);
+                  ctx.LOADI(ir::TYPE_S32, regDWMask, wMask);
+                  ctx.AND(srcType, tmp7, src0, regDWMask);
+                  ctx.LOADI(ir::TYPE_S32, regShift, shift);
+                  ctx.SHR(srcType, tmp8, tmp7, regShift);
+
+                  ir::Register tmp9 = ctx.reg(getFamily(srcType));
+                  ir::Register tmp10 = ctx.reg(getFamily(srcType));
+                  ctx.OR(srcType, tmp9, tmp2, tmp4);
+                  ctx.OR(srcType, tmp10, tmp6, tmp8);
+                  ctx.OR(srcType, dst0, tmp9, tmp10);
+                }
+                break;
+              case 8:
+                NOT_IMPLEMENTED;
+                break;
+              default:
+                GBE_ASSERT(0);
+            }
+          }
+          break;
           default: NOT_IMPLEMENTED;
         }
       } else {
@@ -2778,6 +3048,7 @@ namespace gbe
           }
           case GEN_OCL_FBH: this->emitUnaryCallInst(I,CS,ir::OP_FBH); break;
           case GEN_OCL_FBL: this->emitUnaryCallInst(I,CS,ir::OP_FBL); break;
+          case GEN_OCL_CBIT: this->emitUnaryCallInst(I,CS,ir::OP_CBIT); break;
           case GEN_OCL_ABS:
           {
             const ir::Register src = this->getRegister(*AI);
@@ -2799,6 +3070,26 @@ namespace gbe
             ctx.ALU1(ir::OP_SIMD_ANY, ir::TYPE_S16, dst, src);
             break;
           }
+          case GEN_OCL_READ_TM:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            ctx.READ_ARF(ir::TYPE_U32, dst, ir::ARF_TM);
+            break;
+          }
+          case GEN_OCL_REGION:
+          {
+            const ir::Register dst = this->getRegister(&I);
+            // offset must be immediate
+            GBE_ASSERT(AI != AE); Constant *CPV = dyn_cast<Constant>(*AI);
+            assert(CPV);
+            const ir::Immediate &x = processConstantImm(CPV);
+
+            AI++;
+            const ir::Register src = this->getRegister(*AI);
+
+            ctx.REGION(dst, src, x.getIntegerValue());
+            break;
+          }
           case GEN_OCL_COS: this->emitUnaryCallInst(I,CS,ir::OP_COS); break;
           case GEN_OCL_SIN: this->emitUnaryCallInst(I,CS,ir::OP_SIN); break;
           case GEN_OCL_LOG: this->emitUnaryCallInst(I,CS,ir::OP_LOG); break;
@@ -3281,8 +3572,6 @@ handle_write_image:
             assert(fmt);
             break;
           }
-          case GEN_OCL_PRINTF_BUF_ADDR:
-          case GEN_OCL_PRINTF_INDEX_BUF_ADDR:
           default: break;
         }
       }
@@ -3361,7 +3650,8 @@ handle_write_image:
   void GenWriter::emitBatchLoadOrStore(const ir::Type type, const uint32_t elemNum,
                                       Value *llvmValues, const ir::Register ptr,
                                       const ir::AddressSpace addrSpace,
-                                      Type * elemType, bool isLoad, ir::BTI bti) {
+                                      Type * elemType, bool isLoad, ir::BTI bti,
+                                      bool dwAligned) {
     const ir::RegisterFamily pointerFamily = ctx.getPointerFamily();
     uint32_t totalSize = elemNum * getFamilySize(getFamily(type));
     uint32_t msgNum = totalSize > 16 ? totalSize / 16 : 1;
@@ -3407,9 +3697,9 @@ handle_write_image:
 
       // Emit the instruction
       if (isLoad)
-        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+        ctx.LOAD(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
       else
-        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, true, bti);
+        ctx.STORE(type, tuple, addr, addrSpace, perMsgNum, dwAligned, bti);
     }
   }
 
@@ -3581,11 +3871,12 @@ handle_write_image:
         // Not supported by the hardware. So, we split the message and we use
         // strided loads and stores
         else {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
         }
       }
-      else if((dataFamily==ir::FAMILY_WORD && elemNum%2==0) || (dataFamily == ir::FAMILY_BYTE && elemNum%4 == 0)) {
-          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding);
+      else if((dataFamily == ir::FAMILY_WORD && (isLoad || elemNum % 2 == 0)) ||
+              (dataFamily == ir::FAMILY_BYTE && (isLoad || elemNum % 4 == 0))) {
+          emitBatchLoadOrStore(type, elemNum, llvmValues, ptr, addrSpace, elemType, isLoad, binding, dwAligned);
       } else {
         for (uint32_t elemID = 0; elemID < elemNum; elemID++) {
           if(regTranslator.isUndefConst(llvmValues, elemID))
diff --git a/backend/src/llvm/llvm_gen_backend.hpp b/backend/src/llvm/llvm_gen_backend.hpp
index cc5cdad..5dac3ea 100644
--- a/backend/src/llvm/llvm_gen_backend.hpp
+++ b/backend/src/llvm/llvm_gen_backend.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -26,7 +26,9 @@
 #ifndef __GBE_LLVM_GEN_BACKEND_HPP__
 #define __GBE_LLVM_GEN_BACKEND_HPP__
 
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Pass.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "sys/platform.hpp"
 #include "sys/map.hpp"
 #include "sys/hash_map.hpp"
@@ -92,12 +94,23 @@ namespace gbe
   /*! Remove/add NoDuplicate function attribute for barrier functions. */
   llvm::ModulePass* createBarrierNodupPass(bool);
 
+  /*! Legalize all wide integer instructions */
+  llvm::FunctionPass* createLegalizePass();
+
   /*! Convert the Intrinsic call to gen function */
   llvm::BasicBlockPass *createIntrinsicLoweringPass();
 
   /*! Passer the printf function call. */
   llvm::FunctionPass* createPrintfParserPass();
 
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+  /* customized loop unrolling pass. */
+  llvm::LoopPass *createCustomLoopUnrollPass();
+#endif
+
+  /*! Add all the function call of ocl to our bitcode. */
+  llvm::Module* runBitCodeLinker(llvm::Module *mod, bool strictMath);
+
   void* getPrintfInfo(llvm::CallInst* inst);
 } /* namespace gbe */
 
diff --git a/backend/src/llvm/llvm_gen_ocl_function.hxx b/backend/src/llvm/llvm_gen_ocl_function.hxx
index f3ce096..7434c78 100644
--- a/backend/src/llvm/llvm_gen_ocl_function.hxx
+++ b/backend/src/llvm/llvm_gen_ocl_function.hxx
@@ -151,6 +151,7 @@ DECL_LLVM_GEN_FUNCTION(I64RHADD, _Z15__gen_ocl_rhaddmm)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_SHORT, _Z18__gen_ocl_upsampless)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_INT, _Z18__gen_ocl_upsampleii)
 DECL_LLVM_GEN_FUNCTION(UPSAMPLE_LONG, _Z18__gen_ocl_upsamplell)
+DECL_LLVM_GEN_FUNCTION(CBIT, __gen_ocl_cbit)
 
 // saturate convert
 DECL_LLVM_GEN_FUNCTION(SAT_CONV_U8_TO_I8,  _Z16convert_char_sath)
@@ -190,7 +191,8 @@ DECL_LLVM_GEN_FUNCTION(CONV_F32_TO_F16, __gen_ocl_f32to16)
 DECL_LLVM_GEN_FUNCTION(SIMD_ANY, __gen_ocl_simd_any)
 DECL_LLVM_GEN_FUNCTION(SIMD_ALL, __gen_ocl_simd_all)
 
+DECL_LLVM_GEN_FUNCTION(READ_TM, __gen_ocl_read_tm)
+DECL_LLVM_GEN_FUNCTION(REGION, __gen_ocl_region)
+
 // printf function
 DECL_LLVM_GEN_FUNCTION(PRINTF, __gen_ocl_printf)
-DECL_LLVM_GEN_FUNCTION(PRINTF_BUF_ADDR, __gen_ocl_printf_get_buf_addr)
-DECL_LLVM_GEN_FUNCTION(PRINTF_INDEX_BUF_ADDR, __gen_ocl_printf_get_index_buf_addr)
diff --git a/backend/src/llvm/llvm_intrinsic_lowering.cpp b/backend/src/llvm/llvm_intrinsic_lowering.cpp
index 7d04318..52f99c1 100644
--- a/backend/src/llvm/llvm_intrinsic_lowering.cpp
+++ b/backend/src/llvm/llvm_intrinsic_lowering.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -126,13 +126,18 @@ namespace gbe {
                 Type *IntPtr = TD.getIntPtrType(Context);
                 Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
                                                     /* isSigned */ false);
+                Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
+                                                    /* isSigned */ false);
+                ConstantInt *ci = dyn_cast<ConstantInt>(align);
                 Value *Ops[3];
                 Ops[0] = CI->getArgOperand(0);
                 Ops[1] = CI->getArgOperand(1);
                 Ops[2] = Size;
-                char name[16] = "__gen_memcpy_xx";
+                char name[24] = "__gen_memcpy_xx";
                 name[13] = convertSpaceToName(Ops[0]);
                 name[14] = convertSpaceToName(Ops[1]);
+                if(ci && (ci->getZExtValue() % 4 == 0)) //alignment is constant and 4 byte align
+                  strcat(name, "_align");
                 replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
                 break;
               }
@@ -143,13 +148,18 @@ namespace gbe {
                 Type *IntPtr = TD.getIntPtrType(Op0->getType());
                 Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
                                                     /* isSigned */ false);
+                Value *align = Builder.CreateIntCast(CI->getArgOperand(3), IntPtr,
+                                                    /* isSigned */ false);
+                ConstantInt *ci = dyn_cast<ConstantInt>(align);
                 Value *Ops[3];
                 Ops[0] = Op0;
                 // Extend the amount to i32.
                 Ops[1] = val;
                 Ops[2] = Size;
-                char name[16] = "__gen_memset_x";
+                char name[24] = "__gen_memset_x";
                 name[13] = convertSpaceToName(Ops[0]);
+                if(ci && (ci->getZExtValue() % 4 == 0)) //alignment is constant and 4 byte align
+                  strcat(name, "_align");
                 replaceCallWith(name, CI, Ops, Ops+3, Type::getVoidTy(Context));
                 break;
               }
diff --git a/backend/src/llvm/llvm_legalize.cpp b/backend/src/llvm/llvm_legalize.cpp
new file mode 100644
index 0000000..250fd11
--- /dev/null
+++ b/backend/src/llvm/llvm_legalize.cpp
@@ -0,0 +1,704 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Author: Ruiling, Song <ruiling.song at intel.com>
+ *
+ * Legalize unsupported integer data type i128/i256/...
+ * right now, the implementation only considers little-endian systems.
+ *
+ */
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+
+#include "llvm/Config/llvm-config.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/IR/IRBuilder.h"
+#if LLVM_VERSION_MINOR >= 5
+#include "llvm/IR/CFG.h"
+#else
+#include "llvm/Support/CFG.h"
+#endif
+
+
+#include "llvm_gen_backend.hpp"
+
+using namespace llvm;
+
+namespace gbe {
+
+  class Legalize : public FunctionPass {
+  public:
+    Legalize() : FunctionPass(ID) {
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
+#else
+      initializeDominatorTreePass(*PassRegistry::getPassRegistry());
+#endif
+    }
+    bool runOnFunction(Function& F) {
+      if (!isKernelFunction(F)) return false;
+      return legalizeFunction(F);
+    }
+    Value *getComponent(Value *v, uint32_t i, Type *ty);
+    bool isIncomplete(Value *v);
+    void legalizePHI(IRBuilder <> Builder, Instruction *p);
+    void legalizeSelect(IRBuilder<> &Builder, Instruction *p);
+    void legalizeICmp(IRBuilder<> &Builder, Instruction *p);
+    void legalizeShl(IRBuilder<> &Builder, Instruction *p);
+    void legalizeLShr(IRBuilder<> &Builder, Instruction *p);
+    void legalizeAnd(IRBuilder<> &Builder, Instruction *p);
+    void legalizeOr(IRBuilder<> &Builder, Instruction *p);
+    void legalizeXor(IRBuilder<> &Builder, Instruction *p);
+    void legalizeBitCast(IRBuilder<> &Builder, Instruction *p);
+    void legalizeTrunc(IRBuilder<> &Builder, Instruction *p);
+    void legalizeZExt(IRBuilder<> &Builder, Instruction *p);
+    bool legalizeFunction(Function& F);
+    void splitLargeInteger(APInt op, Type *splitTy, SmallVector<APInt, 16> &split);
+    void splitConstantInt(ConstantInt *c, Type *splitTy, SmallVector<Value*, 16> &split);
+    static char ID;
+  private:
+    std::set<Value *> processed;
+    std::set<PHINode *> incompletePHIs;
+    std::map<Value *, SmallVector<Value*, 16>> valueMap;
+    typedef std::map<Value*, SmallVector<Value*, 16>>::iterator ValueMapIter;
+  };
+
+  void splitAPInt(APInt &data, SmallVectorImpl<APInt> &result, int totalBits, int subBits) {
+    APInt lo = data.getLoBits(totalBits/2).trunc(totalBits/2);
+    APInt hi = data.getHiBits(totalBits/2).trunc(totalBits/2);
+
+    if (totalBits/2 <= subBits) {
+      result.push_back(lo);
+      result.push_back(hi);
+      return;
+    }
+    splitAPInt(lo, result, totalBits/2, subBits);
+    splitAPInt(hi, result, totalBits/2, subBits);
+  }
+
+  void Legalize::splitLargeInteger(APInt data, Type *splitTy, SmallVector<APInt, 16> &split) {
+    unsigned opSz = data.getBitWidth();
+    GBE_ASSERT(opSz > 7 && llvm::isPowerOf2_32(opSz));
+    unsigned subSz = splitTy->getPrimitiveSizeInBits();
+    splitAPInt(data, split, opSz, subSz);
+  }
+
+  void Legalize::splitConstantInt(ConstantInt *c, Type *splitTy, SmallVector<Value*, 16> &split) {
+    SmallVector<APInt, 16> imm;
+    splitLargeInteger(c->getValue(), splitTy, imm);
+    for (unsigned i = 0; i < imm.size(); i++) {
+      split.push_back(ConstantInt::get(splitTy, imm[i]));
+    }
+  }
+
+  bool Legalize::isIncomplete(Value *v) {
+    return valueMap.find(v) == valueMap.end() && !isa<ConstantInt>(v);
+  }
+
+  Value *Legalize::getComponent(Value *v, uint32_t i, Type *ty) {
+    GBE_ASSERT(!isIncomplete(v));
+    if (isa<ConstantInt>(v)) {
+      GBE_ASSERT(ty);
+      ConstantInt *CI = dyn_cast<ConstantInt>(v);
+      SmallVector<APInt, 16> imm;
+      splitLargeInteger(CI->getValue(), ty, imm);
+      return ConstantInt::get(ty, imm[i]);
+    }
+    return valueMap.find(v)->second[i];
+  }
+
+  void Legalize::legalizePHI(IRBuilder <> Builder, Instruction *p) {
+    PHINode *phi = dyn_cast<PHINode>(p);
+    bool incomplete = false, allConst = true;
+    uint32_t compNum = 0;
+    Type *splitTy = NULL;
+    for (unsigned int i = 0; i < phi->getNumIncomingValues(); ++i) {
+      Value *val = phi->getIncomingValue(i);
+      if (isIncomplete(val)) {
+        incomplete = true;
+        break;
+      }
+      if (allConst && valueMap.find(val) != valueMap.end()) {
+        allConst = false;
+        splitTy = valueMap.find(val)->second[0]->getType();
+        compNum = valueMap.find(val)->second.size();
+      }
+    }
+
+    if (incomplete) {
+      // FIXME: if a PHINode is totally incomplete, we don't even
+      // know the base type of this instruction, which makes it
+      // a little bit difficult to handle here.
+      // Will do it in the future.
+      incompletePHIs.insert(phi);
+      GBE_ASSERT(0 && "unsupported PHI");
+    }
+    else {
+      GBE_ASSERT(!allConst);
+      SmallVector<Value*, 16> v;
+      for (unsigned int i = 0; i < compNum; ++i) {
+        PHINode* res = Builder.CreatePHI(splitTy, phi->getNumIncomingValues());
+
+        // Loop over pairs of operands: [Value*, BasicBlock*]
+        for (unsigned int j = 0; j < phi->getNumIncomingValues(); j++) {
+          BasicBlock* bb = phi->getIncomingBlock(j);
+          res->addIncoming(getComponent(phi->getIncomingValue(j), i, splitTy), bb);
+        }
+        v.push_back(res);
+      }
+      valueMap.insert(std::make_pair(phi, v));
+    }
+  }
+
+  void Legalize::legalizeSelect(IRBuilder<> &Builder, Instruction *p) {
+    SelectInst *sel = dyn_cast<SelectInst>(p);
+    Value *op0 = sel->getOperand(0);
+    Value *op1 = sel->getOperand(1);
+    Value *op2 = sel->getOperand(2);
+
+    ValueMapIter iter1 = valueMap.find(op1);
+    ValueMapIter iter2 = valueMap.find(op2);
+    SmallVector<Value*, 16> v;
+    if (iter1 != valueMap.end() && iter2 != valueMap.end()) {
+      SmallVectorImpl<Value*> &opVec1 = iter1->second;
+      SmallVectorImpl<Value*> &opVec2 = iter2->second;
+
+      GBE_ASSERT(opVec1.size() == opVec2.size());
+
+      for (unsigned i = 0; i < opVec1.size(); i++) {
+        Value *elemV = Builder.CreateSelect(op0, opVec1[i], opVec2[i]);
+        v.push_back(elemV);
+      }
+    } else if (iter1 != valueMap.end()) {
+      SmallVectorImpl<Value*> &opVec1 = iter1->second;
+      Type *splitTy = opVec1[0]->getType();
+      GBE_ASSERT(isa<ConstantInt>(op2));
+      ConstantInt *CI = dyn_cast<ConstantInt>(op2);
+      SmallVector<APInt, 16> imm;
+
+      splitLargeInteger(CI->getValue(), splitTy, imm);
+      for (unsigned i = 0; i < opVec1.size(); i++) {
+        Value *elemV = Builder.CreateSelect(op0, opVec1[i], ConstantInt::get(splitTy, imm[i]));
+        v.push_back(elemV);
+      }
+    } else if (iter2 != valueMap.end()) {
+      SmallVectorImpl<Value*> &opVec2 = iter2->second;
+      Type *splitTy = opVec2[0]->getType();
+      GBE_ASSERT(isa<ConstantInt>(op1));
+      ConstantInt *CI = dyn_cast<ConstantInt>(op1);
+      SmallVector<APInt, 16> imm;
+
+      splitLargeInteger(CI->getValue(), splitTy, imm);
+      for (unsigned i = 0; i < opVec2.size(); i++) {
+        Value *elemV = Builder.CreateSelect(op0, ConstantInt::get(splitTy, imm[i]), opVec2[i]) ;
+        v.push_back(elemV);
+      }
+    } else {
+      p->dump(); GBE_ASSERT(0 && "unsupported select.");
+    }
+    valueMap.insert(std::make_pair(p, v));
+  }
+
+  void Legalize::legalizeICmp(IRBuilder<> &Builder, Instruction *p) {
+    ICmpInst *IC = dyn_cast<ICmpInst>(p);
+    ICmpInst::Predicate pred = IC->getPredicate();
+    // I could not figure out why LLVM would generate compare instructions
+    // on large integers, so only equality checks are supported here.
+    GBE_ASSERT(IC->isEquality());
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    if (isa<ConstantInt>(op0)) {
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    if (isa<ConstantInt>(op1)) {
+      ValueMapIter iter = valueMap.find(op0);
+      SmallVectorImpl<Value*> &opVec = iter->second;
+      SmallVector<APInt, 16> imm;
+
+      Value *res = NULL;
+      Type *splitTy = opVec[0]->getType();
+      ConstantInt *CI = dyn_cast<ConstantInt>(op1);
+
+      splitLargeInteger(CI->getValue(), splitTy, imm);
+      for (unsigned i = 0; i < opVec.size(); i++) {
+        Value *tmp = Builder.CreateICmp(pred, opVec[i], ConstantInt::get(splitTy, imm[i]));
+        if (res != NULL) {
+          if (pred == CmpInst::ICMP_EQ)
+            tmp = Builder.CreateAnd(tmp, res);
+          else
+            tmp = Builder.CreateOr(tmp, res);
+        }
+        res = tmp;
+      }
+      p->replaceAllUsesWith(res);
+    } else {
+      ValueMapIter iter0 = valueMap.find(op0);
+      ValueMapIter iter1 = valueMap.find(op1);
+      SmallVectorImpl<Value*> &opVec0 = iter0->second;
+      SmallVectorImpl<Value*> &opVec1 = iter1->second;
+
+      Value *res = NULL;
+      for (unsigned i = 0; i < opVec0.size(); i++) {
+        Value *tmp = Builder.CreateICmp(pred, opVec0[i], opVec1[i]);
+        if (res != NULL) {
+          if (pred == CmpInst::ICMP_EQ)
+            tmp = Builder.CreateAnd(tmp, res);
+          else
+            tmp = Builder.CreateOr(tmp, res);
+        }
+        res = tmp;
+      }
+      p->replaceAllUsesWith(res);
+    }
+  }
+
+  void Legalize::legalizeShl(IRBuilder<> &Builder, Instruction *p) {
+    // only shifts by a constant number of bits are supported
+    GBE_ASSERT(isa<ConstantInt>(p->getOperand(1)));
+
+    ValueMapIter iter = valueMap.find(p->getOperand(0));
+    GBE_ASSERT(iter != valueMap.end());
+    SmallVectorImpl<Value*> &v0 = iter->second;
+
+    uint64_t shiftBits = dyn_cast<ConstantInt>(p->getOperand(1))->getZExtValue();
+    Type *splitTy = v0[0]->getType();
+
+    unsigned elemNum = v0.size();
+    unsigned szSplit = splitTy->getPrimitiveSizeInBits();
+    unsigned shift = shiftBits / szSplit;
+    unsigned unaligned = shiftBits % szSplit;
+
+    if (unaligned == 0) {
+      SmallVector<Value*, 16> v1;
+      // fill lower bits with zero
+      for (unsigned i = 0; i < shift; i++) {
+        v1.push_back(ConstantInt::get(splitTy, 0));
+      }
+      // do the shift
+      for (unsigned j =0; j < elemNum - shift; j++)
+        v1.push_back(v0[j]);
+
+      valueMap.insert(std::make_pair(p, v1));
+    } else {
+      SmallVector<Value*, 16> v1;
+      // fill lower bits with zero
+      for (unsigned i = 0; i < shift; i++) {
+        v1.push_back(ConstantInt::get(splitTy, 0));
+      }
+      // first one is special, shl is enough.
+      v1.push_back(Builder.CreateShl(v0[0], unaligned));
+
+      for (unsigned i = 0; i < elemNum - shift - 1; i++) {
+        Value *t0 = Builder.CreateLShr(v0[i], ConstantInt::get(v0[0]->getType(), szSplit-unaligned));
+        Value *t1 = Builder.CreateShl(v0[i + 1], ConstantInt::get(v0[i + 1]->getType(), unaligned));
+        Value *t2 = Builder.CreateOr(t0, t1);
+        v1.push_back(t2);
+      }
+      valueMap.insert(std::make_pair(p, v1));
+    }
+  }
+
+  void Legalize::legalizeLShr(IRBuilder<> &Builder, Instruction *p) {
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+    SmallVector<Value*, 16> result;
+
+    GBE_ASSERT(isa<ConstantInt>(p->getOperand(1)));
+
+    ValueMapIter iter = valueMap.find(op0);
+    GBE_ASSERT(iter != valueMap.end());
+    SmallVectorImpl<Value*> &opVec = iter->second;
+
+    unsigned szTotal = op1->getType()->getPrimitiveSizeInBits();
+    unsigned elemNum = opVec.size();
+    unsigned szSplit = szTotal / elemNum;
+    int64_t shift = dyn_cast<ConstantInt>(op1)->getSExtValue();
+    GBE_ASSERT(shift > 0);
+    unsigned elemShift = shift / szSplit;
+    unsigned unalign = shift % szSplit;
+
+    if (unalign == 0) {
+      // the shift amount is aligned with the split size
+      Constant *zero = ConstantInt::getSigned(opVec[0]->getType(), 0);
+      for (unsigned s = 0; s < elemNum - elemShift; s++)
+        result.push_back(opVec[s + elemShift]);
+
+      for (unsigned s = 0; s < elemShift; s++)
+        result.push_back(zero);
+
+      valueMap.insert(std::make_pair(p, result));
+    } else {
+      // not aligned case
+      for (unsigned s = elemShift; s < elemNum-1; s++) {
+        Value *t0 = Builder.CreateLShr(opVec[s], ConstantInt::get(opVec[s]->getType(), unalign));
+        Value *t1 = Builder.CreateShl(opVec[s + 1], ConstantInt::get(opVec[s + 1]->getType(), szSplit - unalign));
+        Value *t2 = Builder.CreateOr(t0, t1);
+        result.push_back(t2);
+      }
+      // last element only needs lshr
+      result.push_back(Builder.CreateLShr(opVec[elemNum-1], ConstantInt::get(opVec[elemNum - 1]->getType(), unalign)));
+
+      for (unsigned s = 0; s < elemShift; s++) {
+        result.push_back(ConstantInt::getSigned(opVec[0]->getType(), 0));
+      }
+      valueMap.insert(std::make_pair(p, result));
+    }
+  }
+
+  void Legalize::legalizeAnd(IRBuilder<> &Builder, Instruction *p) {
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    if ((isa<UndefValue>(op0) || isa<UndefValue>(op1))) {
+      // I met a special case like the one below:
+      //   %82 = zext i32 %81 to i512
+      //   %mask148 = and i512 undef, -4294967296
+      //   %ins149 = or i512 %mask148, %82
+      // I don't know a good way to split this kind of i512 instruction, so
+      // to simplify the situation, I directly optimize it to zero.
+      // Later instructions such as and/or/shr... that operate on
+      // the value can then be optimized.
+      p->replaceAllUsesWith(ConstantInt::get(p->getType(), 0));
+      return;
+    }
+
+    if ((isa<ConstantInt>(op0) && dyn_cast<ConstantInt>(op0)->isZero())
+       || (isa<ConstantInt>(op1) && dyn_cast<ConstantInt>(op1)->isZero())) {
+      // zero & anyValue  ==> zero
+      p->replaceAllUsesWith(ConstantInt::get(p->getType(), 0));
+      return;
+    }
+
+    if (isa<ConstantInt>(op0)) {
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    ValueMapIter iter = valueMap.find(op0);
+    SmallVector<Value*, 16> v0 = iter->second;
+    SmallVector<Value*, 16> v1;
+    SmallVector<Value*, 16> v2;
+
+    if (isa<ConstantInt>(op1)) {
+      splitConstantInt(dyn_cast<ConstantInt>(op1), v0[0]->getType(), v1);
+    } else {
+      v1 = valueMap.find(op1)->second;
+    }
+
+    for (unsigned i = 0; i < v0.size(); i++) {
+      ConstantInt *c0 = NULL, *c1 = NULL;
+      if (isa<ConstantInt>(v0[i])) c0 = dyn_cast<ConstantInt>(v0[i]);
+      if (isa<ConstantInt>(v1[i])) c1 = dyn_cast<ConstantInt>(v1[i]);
+
+      if ((c0 &&c0->isZero()) || (c1 && c1->isZero())) {
+        // zero & anyvalue ==> zero
+        v2.push_back(ConstantInt::get(v0[i]->getType(), 0));
+      } else if (c0 && c0->isMinusOne()) {
+        // 1111s & anyvalue ==> anyvalue
+        v2.push_back(v1[i]);
+      } else if (c1 && c1->isMinusOne()) {
+        // 1111s & anyvalue ==> anyvalue
+        v2.push_back(v0[i]);
+      } else {
+        v2.push_back(Builder.CreateAnd(v0[i], v1[i]));
+      }
+    }
+    valueMap.insert(std::make_pair(p, v2));
+  }
+
+  void Legalize::legalizeOr(IRBuilder<> &Builder, Instruction *p) { // lowers the 'or' instruction p to one result per split piece
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    if (isa<ConstantInt>(op0)) { // swap so that a constant operand, if present, ends up in op1
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    if (isa<ConstantInt>(op1) && dyn_cast<ConstantInt>(op1)->isZero()) { // x | 0 ==> x: p reuses a copy of op0's pieces
+      ValueMapIter iter = valueMap.find(op0);
+      valueMap.insert(std::make_pair(p, iter->second));
+      return;
+    }
+
+    ValueMapIter iter = valueMap.find(op0); // NOTE(review): iter is not compared with end(); op0 is assumed to be in valueMap
+    SmallVector<Value*, 16> v0 = iter->second;
+    SmallVector<Value*, 16> v1;
+    SmallVector<Value*, 16> v2;
+
+    if (isa<ConstantInt>(op1)) {
+      splitConstantInt(dyn_cast<ConstantInt>(op1), v0[0]->getType(), v1); // presumably fills v1 with pieces typed like v0's pieces
+    } else {
+      v1 = valueMap.find(op1)->second;
+    }
+
+    for (unsigned i = 0; i < v0.size(); i++) { // v1 is indexed in lockstep with v0
+      ConstantInt *c0 = NULL, *c1 = NULL;
+      if (isa<ConstantInt>(v0[i])) c0 = dyn_cast<ConstantInt>(v0[i]);
+      if (isa<ConstantInt>(v1[i])) c1 = dyn_cast<ConstantInt>(v1[i]);
+
+      if ((c0 &&c0->isZero())) {
+        // zero | anyvalue ==> anyvalue (keep the other piece)
+        v2.push_back(v1[i]);
+      } else if (c1 && c1->isZero()) {
+        // anyvalue | zero ==> anyvalue (keep the other piece)
+        v2.push_back(v0[i]);
+      } else if (c0 && c0->isMinusOne()) {
+        // all-ones | anyvalue ==> all-ones
+        v2.push_back(c0);
+      } else if (c1 && c1->isMinusOne()) {
+        // anyvalue | all-ones ==> all-ones
+        v2.push_back(c1);
+      } else {
+        v2.push_back(Builder.CreateOr(v0[i], v1[i]));
+      }
+    }
+    valueMap.insert(std::make_pair(p, v2)); // register the result pieces under p
+  }
+
+  void Legalize::legalizeXor(IRBuilder<> &Builder, Instruction *p) { // lowers the 'xor' instruction p to one xor per split piece
+    Value *op0 = p->getOperand(0);
+    Value *op1 = p->getOperand(1);
+
+    if (isa<ConstantInt>(op0)) { // swap so that a constant operand, if present, ends up in op1
+      op0 = p->getOperand(1);
+      op1 = p->getOperand(0);
+    }
+
+    ValueMapIter iter = valueMap.find(op0); // NOTE(review): iter is not compared with end(); op0 is assumed to be in valueMap
+    SmallVector<Value*, 16> v0 = iter->second;
+    SmallVector<Value*, 16> v1;
+    SmallVector<Value*, 16> v2;
+
+    if (isa<ConstantInt>(op1)) {
+      splitConstantInt(dyn_cast<ConstantInt>(op1), v0[0]->getType(), v1); // presumably fills v1 with pieces typed like v0's pieces
+    } else {
+      v1 = valueMap.find(op1)->second;
+    }
+
+    for (unsigned i = 0; i < v0.size(); i++) { // unlike and/or, no constant shortcuts are applied here
+      v2.push_back(Builder.CreateXor(v0[i], v1[i]));
+    }
+    valueMap.insert(std::make_pair(p, v2)); // register the result pieces under p
+  }
+
+  void Legalize::legalizeBitCast(IRBuilder<> &Builder, Instruction *p) { // handles bitcasts where either the source or the destination is a vector
+    SmallVector<Value*, 16> split;
+    Type *dstTy = p->getType();
+    Type *srcTy = dyn_cast<CastInst>(p)->getSrcTy();
+
+    if(srcTy->isVectorTy()) {
+      VectorType *vecTy = dyn_cast<VectorType>(srcTy);
+      Type *splitTy = vecTy->getElementType();
+      unsigned elements = srcTy->getPrimitiveSizeInBits()/splitTy->getPrimitiveSizeInBits();
+      // Bitcast from a vector to a large integer: each vector element becomes one split piece via extractelement.
+      unsigned splitSz = splitTy->getPrimitiveSizeInBits();
+      Value *src = p->getOperand(0);
+      // If the cast is e.g. from <4 x float> to i128,
+      // first cast <4 x float> to <4 x i32> so that the pieces are integers.
+      if (!splitTy->isIntegerTy())
+        src = Builder.CreateBitCast(src, VectorType::get(IntegerType::get(p->getContext(), splitSz), elements));
+
+      for (unsigned i = 0; i < elements; i++) {
+        Value *NV = Builder.CreateExtractElement(src,
+                      ConstantInt::get(IntegerType::get(p->getContext(), 32), i));
+        split.push_back(NV);
+      }
+      valueMap.insert(std::make_pair(p, split)); // register the extracted elements as p's pieces
+    } else if (dstTy->isVectorTy()) {
+      // Bitcast from a large integer to a vector: rebuild a vector from the operand's pieces via insertelement.
+      ValueMapIter iter = valueMap.find(p->getOperand(0));
+      SmallVectorImpl<Value*> &opVec = iter->second;
+      Type *splitTy = opVec[0]->getType();
+      GBE_ASSERT(dstTy->getPrimitiveSizeInBits() % splitTy->getPrimitiveSizeInBits() == 0);
+      GBE_ASSERT(dstTy->getPrimitiveSizeInBits() / splitTy->getPrimitiveSizeInBits() == opVec.size());
+      Value *vec = NULL;
+      Type *idxTy = IntegerType::get(p->getContext(), 32);
+      for (unsigned i = 0; i < opVec.size(); ++i) {
+        Value *tmp = vec ? vec : UndefValue::get(VectorType::get(splitTy, opVec.size())); // start from an undef vector on the first iteration
+        Value *idx = ConstantInt::get(idxTy, i);
+        vec = Builder.CreateInsertElement(tmp, opVec[i], idx);
+      }
+      Type *elemTy = cast<VectorType>(dstTy)->getElementType();
+      if (elemTy == opVec[0]->getType()) // element type already matches: use the rebuilt vector directly
+        p->replaceAllUsesWith(vec);
+      else {
+        Value *newVec = Builder.CreateBitCast(vec, dstTy); // otherwise reinterpret the rebuilt vector as dstTy
+        p->replaceAllUsesWith(newVec);
+      }
+    } else {
+      p->dump(); GBE_ASSERT(0 && "Unsupported bitcast");
+    }
+  }
+
+  void Legalize::legalizeTrunc(IRBuilder<> &Builder, Instruction *p) { // replaces the trunc p with a value built from its operand's pieces
+    Type *dstTy = p->getType();
+
+    ValueMapIter iter = valueMap.find(p->getOperand(0)); // NOTE(review): iter is not compared with end()
+    SmallVector<Value*, 16> &opVec = iter->second;
+    unsigned szSplit = opVec[0]->getType()->getPrimitiveSizeInBits();
+    unsigned szResult = dstTy->getPrimitiveSizeInBits();
+
+    if(szResult > szSplit) {
+      // More bits are needed than a single piece holds, so the low pieces
+      // are merged back together with ZExt/Shl/Or.
+      int endIdx = (szResult + szSplit-1 )/szSplit; // number of pieces needed, rounded up
+      Value * prev = ConstantInt::get(dstTy, 0);
+      for (int i = endIdx - 1; i >=0; i--) {
+        Value * res = Builder.CreateZExt(opVec[i], dstTy);
+        if (i > 0)
+          res = Builder.CreateShl(res, i*szSplit); // piece i sits i*szSplit bits up
+        prev = Builder.CreateOr(res, prev);
+      }
+      Value *newValue = Builder.CreateTrunc(prev, dstTy); // prev is built from dstTy-typed values above
+      p->replaceAllUsesWith(newValue);
+    } else if (szResult == szSplit) {
+      // Same bit width as one piece: use a bitcast of the lowest piece instead of a trunc.
+      Value *newValue = Builder.CreateBitCast(opVec[0], dstTy);
+      p->replaceAllUsesWith(newValue);
+    } else {
+      // Normal case: truncate the lowest piece to the shorter bit width.
+      Value *newValue = Builder.CreateTrunc(opVec[0], dstTy);
+      p->replaceAllUsesWith(newValue);
+    }
+  }
+
+  void Legalize::legalizeZExt(IRBuilder<> &Builder, Instruction *p) { // records the zext p as its source value followed by zero pieces
+    SmallVector<Value*, 16> split;
+    Type *dstTy = dyn_cast<CastInst>(p)->getDestTy();
+    Type *srcTy = p->getOperand(0)->getType();
+    int elements = dstTy->getPrimitiveSizeInBits() / srcTy->getPrimitiveSizeInBits(); // piece count; integer division, any remainder is dropped
+
+    split.push_back(p->getOperand(0)); // piece 0 is the unextended source itself
+    for (int i = 0; i < elements - 1; i++)
+      split.push_back(ConstantInt::getSigned(srcTy, 0)); // remaining pieces are zero constants of the source type
+
+    valueMap.insert(std::make_pair(p, split)); // no instruction is emitted; Builder is unused here
+  }
+
+  bool Legalize::legalizeFunction(Function &F) { // legalizes >64-bit integer instructions in F; returns true if any was found
+    bool changed = false;
+
+    typedef ReversePostOrderTraversal<Function*> RPOTType;
+    RPOTType rpot(&F);
+
+    for (RPOTType::rpo_iterator bb = rpot.begin(), bbE = rpot.end(); bb != bbE; ++bb) { // visit basic blocks in reverse post-order
+      IRBuilder<> Builder(*bb);
+      for (BasicBlock::iterator it = (*bb)->begin(), itE = (*bb)->end(); it != itE; ++it) {
+        Instruction *insn = it;
+        Type *ty = insn->getType();
+        if(ty->isIntegerTy() && ty->getIntegerBitWidth() > 64) {
+          // The result is a large integer: mark the instruction itself and all of its users for processing.
+          changed = true;
+
+          processed.insert(insn);
+
+          for(Value::use_iterator iter = insn->use_begin(); iter != insn->use_end(); ++iter) {
+            // From LLVM 3.5 on, use_iterator points to a 'Use' instead of a 'User', hence the version switch.
+          #if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR < 5)
+            User *theUser = *iter;
+          #else
+            User *theUser = iter->getUser();
+          #endif
+            processed.insert(theUser);
+          }
+        }
+
+        if(processed.empty() || processed.find(insn) == processed.end()) // skip instructions that were not marked
+          continue;
+
+        Builder.SetInsertPoint(insn); // replacement instructions are inserted right before insn
+        switch(insn->getOpcode()) {
+          default: { insn->dump(); GBE_ASSERT(false && "Illegal instruction\n"); break;}
+          case Instruction::PHI:
+            legalizePHI(Builder, insn);
+            break;
+          case Instruction::Select:
+            legalizeSelect(Builder, insn);
+            break;
+          case Instruction::ICmp:
+            legalizeICmp(Builder, insn);
+            break;
+
+          case Instruction::Shl:
+            legalizeShl(Builder, insn);
+            break;
+
+          case Instruction::LShr:
+            legalizeLShr(Builder, insn);
+            break;
+
+          case Instruction::And:
+            legalizeAnd(Builder, insn);
+            break;
+
+          case Instruction::Or:
+            legalizeOr(Builder, insn);
+            break;
+
+          case Instruction::Xor:
+            legalizeXor(Builder, insn);
+            break;
+
+          case Instruction::BitCast:
+            legalizeBitCast(Builder, insn);
+            break;
+
+          case Instruction::Trunc:
+            legalizeTrunc(Builder, insn);
+            break;
+
+          case Instruction::ZExt:
+            legalizeZExt(Builder, insn);
+            break;
+        }
+      }
+    }
+
+    for (Value *v : processed) { // first pass: drop operand references of every marked instruction
+      if (isa<Instruction>(v)) {
+        dyn_cast<Instruction>(v)->dropAllReferences();
+      }
+    }
+
+    for (Value *v : processed) { // second pass: erase the marked instructions themselves
+      if (isa<Instruction>(v)) {
+        dyn_cast<Instruction>(v)->eraseFromParent();
+      }
+    }
+
+    processed.clear(); // reset per-function state
+    valueMap.clear();
+    incompletePHIs.clear();
+    return changed;
+  }
+
+  FunctionPass* createLegalizePass() { // factory: returns a newly allocated Legalize pass
+    return new Legalize();
+  }
+  char Legalize::ID = 0;
+};
diff --git a/backend/src/llvm/llvm_loadstore_optimization.cpp b/backend/src/llvm/llvm_loadstore_optimization.cpp
index 4bfc7f6..c6349fa 100644
--- a/backend/src/llvm/llvm_loadstore_optimization.cpp
+++ b/backend/src/llvm/llvm_loadstore_optimization.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -87,12 +87,12 @@ namespace gbe {
     bool     optimizeLoadStore(BasicBlock &BB);
 
     bool     isLoadStoreCompatible(Value *A, Value *B);
-    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
-    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged);
+    void     mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
+    void     mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged);
     BasicBlock::iterator findConsecutiveAccess(BasicBlock &BB,
-                                               SmallVector<Instruction*, 4> &merged,
+                                               SmallVector<Instruction*, 16> &merged,
                                                BasicBlock::iterator &start,
-                                               unsigned maxLimit,
+                                               unsigned maxVecSize,
                                                bool isLoad);
 
     virtual const char *getPassName() const {
@@ -154,11 +154,11 @@ namespace gbe {
     return ((-offset) == sz);
   }
 
-  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+  void GenLoadStoreOptimization::mergeLoad(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
     IRBuilder<> Builder(&BB);
 
     unsigned size = merged.size();
-    SmallVector<Value *, 4> values;
+    SmallVector<Value *, 16> values;
     for(unsigned i = 0; i < size; i++) {
       values.push_back(merged[i]);
     }
@@ -169,7 +169,7 @@ namespace gbe {
     Builder.SetInsertPoint(ld);
     VectorType *vecTy = VectorType::get(ld->getType(), size);
     Value *vecPtr = Builder.CreateBitCast(ld->getPointerOperand(),
-                                          PointerType::get(vecTy, addrSpace));
+                                        PointerType::get(vecTy, addrSpace));
     LoadInst *vecValue = Builder.CreateLoad(vecPtr);
     vecValue->setAlignment(align);
 
@@ -181,9 +181,9 @@ namespace gbe {
 
   BasicBlock::iterator
   GenLoadStoreOptimization::findConsecutiveAccess(BasicBlock &BB,
-                            SmallVector<Instruction*, 4> &merged,
+                            SmallVector<Instruction*, 16> &merged,
                             BasicBlock::iterator &start,
-                            unsigned maxLimit,
+                            unsigned maxVecSize,
                             bool isLoad) {
 
     BasicBlock::iterator stepForward = start;
@@ -194,6 +194,8 @@ namespace gbe {
     BasicBlock::iterator E = BB.end();
     BasicBlock::iterator J = ++start;
 
+    unsigned maxLimit = maxVecSize * 3;
+
     for(unsigned ss = 0; J != E && ss <= maxLimit; ++ss, ++J) {
       if((isLoad && isa<LoadInst>(*J)) || (!isLoad && isa<StoreInst>(*J))) {
         if(isLoadStoreCompatible(merged[merged.size()-1], J)) {
@@ -205,12 +207,12 @@ namespace gbe {
         break;
       }
 
-      if(merged.size() >= 4) break;
+      if(merged.size() >= maxVecSize) break;
     }
     return stepForward;
   }
 
-  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 4> &merged) {
+  void GenLoadStoreOptimization::mergeStore(BasicBlock &BB, SmallVector<Instruction*, 16> &merged) {
     IRBuilder<> Builder(&BB);
 
     unsigned size = merged.size();
@@ -239,25 +241,36 @@ namespace gbe {
 
   bool GenLoadStoreOptimization::optimizeLoadStore(BasicBlock &BB) {
     bool changed = false;
-    SmallVector<Instruction*, 4> merged;
+    SmallVector<Instruction*, 16> merged;
     for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E;++BBI) {
       if(isa<LoadInst>(*BBI) || isa<StoreInst>(*BBI)) {
         bool isLoad = isa<LoadInst>(*BBI) ? true: false;
         Type *ty = getValueType(BBI);
         if(ty->isVectorTy()) continue;
-        // we only support DWORD data type merge
-        if(!ty->isFloatTy() && !ty->isIntegerTy(32)) continue;
-        BBI = findConsecutiveAccess(BB, merged, BBI, 10, isLoad);
-        if(merged.size() > 1) {
+        // TODO: support WORD/BYTE stores; loads handle DWORD/WORD/BYTE but stores are DWORD-only for now.
+        if (!(ty->isFloatTy() || ty->isIntegerTy(32) ||
+             ((ty->isIntegerTy(8) || ty->isIntegerTy(16)) && isLoad)))
+          continue;
+        unsigned maxVecSize = (ty->isFloatTy() || ty->isIntegerTy(32)) ? 4 :
+                              (ty->isIntegerTy(16) ? 8 : 16);
+        BBI = findConsecutiveAccess(BB, merged, BBI, maxVecSize, isLoad);
+        uint32_t size = merged.size();
+        uint32_t pos = 0;
+        while(size > 1) {
+          unsigned vecSize = (size >= 16) ? 16 :
+                             (size >= 8 ? 8 :
+                             (size >= 4 ? 4 : size));
+          SmallVector<Instruction*, 16> mergedVec(merged.begin() + pos, merged.begin() + pos + vecSize);
           if(isLoad)
-            mergeLoad(BB, merged);
+            mergeLoad(BB, mergedVec);
           else
-            mergeStore(BB, merged);
+            mergeStore(BB, mergedVec);
           // remove merged insn
-          int size = merged.size();
-          for(int i = 0; i < size; i++)
-            merged[i]->eraseFromParent();
+          for(uint32_t i = 0; i < mergedVec.size(); i++)
+            mergedVec[i]->eraseFromParent();
           changed = true;
+          pos += vecSize;
+          size -= vecSize;
         }
         merged.clear();
       }
diff --git a/backend/src/llvm/llvm_passes.cpp b/backend/src/llvm/llvm_passes.cpp
index 1a38a0c..f9fda4d 100644
--- a/backend/src/llvm/llvm_passes.cpp
+++ b/backend/src/llvm/llvm_passes.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/llvm/llvm_printf_parser.cpp b/backend/src/llvm/llvm_printf_parser.cpp
index 00e1ef8..11e9633 100644
--- a/backend/src/llvm/llvm_printf_parser.cpp
+++ b/backend/src/llvm/llvm_printf_parser.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -330,9 +330,17 @@ error:
     Type* intTy;
     Value* pbuf_ptr;
     Value* index_buf_ptr;
+    Value* g1Xg2Xg3;
+    Value* wg_offset;
     int out_buf_sizeof_offset;
     static map<CallInst*, PrintfSet::PrintfFmt*> printfs;
     int printf_num;
+    int totalSizeofSize;
+
+    struct PrintfParserInfo {
+      llvm::CallInst* call;
+      PrintfSet::PrintfFmt* printf_fmt;
+    };
 
     PrintfParser(void) : FunctionPass(ID)
     {
@@ -343,7 +351,10 @@ error:
       printfs.clear();
       pbuf_ptr = NULL;
       index_buf_ptr = NULL;
+      g1Xg2Xg3 = NULL;
+      wg_offset = NULL;
       printf_num = 0;
+      totalSizeofSize = 0;
     }
 
     ~PrintfParser(void)
@@ -355,9 +366,9 @@ error:
       printfs.clear();
     }
 
-
-    bool parseOnePrintfInstruction(CallInst *& call);
+    bool parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size);
     bool generateOneParameterInst(PrintfSlot& slot, Value*& arg, Type*& dst_type, int& sizeof_size);
+    bool generateOnePrintfInstruction(PrintfParserInfo& pInfo);
 
     virtual const char *getPassName() const
     {
@@ -367,119 +378,38 @@ error:
     virtual bool runOnFunction(llvm::Function &F);
   };
 
-  bool PrintfParser::parseOnePrintfInstruction(CallInst *& call)
+  bool PrintfParser::generateOnePrintfInstruction(PrintfParserInfo& pInfo)
   {
-    CallSite CS(call);
-    CallSite::arg_iterator CI_FMT = CS.arg_begin();
-    int param_num = 0;
-
-    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT);
-    llvm::Constant* arg0_ptr = dyn_cast<llvm::Constant>(arg0->getOperand(0));
-    if (!arg0_ptr) {
-      return false;
-    }
-
-    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
-    if (!fmt_arg || !fmt_arg->isCString()) {
-      return false;
-    }
-
-    std::string fmt = fmt_arg->getAsCString();
-
-    PrintfSet::PrintfFmt* printf_fmt = NULL;
-
-    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at lease print something
-      return false;
-    }
-
-    /* iff parameter more than %, error. */
-    /* str_fmt arg0 arg1 ... NULL */
-    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
-      delete printf_fmt;
-      return false;
-    }
-
-    /* FIXME: Because the OpenCL language do not support va macro, and we do not want
-       to introduce the va_list, va_start and va_end into our code, we just simulate
-       the function calls to caculate the offset caculation here. */
-#define BUILD_CALL_INST(name) \
-    CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
-                             "__gen_ocl_get_"#name,                                         \
-                             IntegerType::getInt32Ty(module->getContext()),                 \
-                             NULL)))
-
-    BUILD_CALL_INST(group_id2);
-    BUILD_CALL_INST(group_id1);
-    BUILD_CALL_INST(group_id0);
-    BUILD_CALL_INST(global_size2);
-    BUILD_CALL_INST(global_size1);
-    BUILD_CALL_INST(global_size0);
-    BUILD_CALL_INST(local_id2);
-    BUILD_CALL_INST(local_id1);
-    BUILD_CALL_INST(local_id0);
-    BUILD_CALL_INST(local_size2);
-    BUILD_CALL_INST(local_size1);
-    BUILD_CALL_INST(local_size0);
-
-#undef BUILD_CALL_INST
-
     Value* op0 = NULL;
     Value* val = NULL;
-    /* calculate offset for later usage.
-       offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
-       + (local_id1 + local_size1 * group_id1) * global_size0
-       + (local_id0 + local_size0 * group_id0)) * sizeof(type)  */
-
-    // local_size2 * group_id2
-    val = builder->CreateMul(local_size2, group_id2);
-    // local_id2 + local_size2 * group_id2
-    val = builder->CreateAdd(local_id2, val);
-    // global_size1 * global_size0
-    op0 = builder->CreateMul(global_size1, global_size0);
-    // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
-    Value* offset1 = builder->CreateMul(val, op0);
-    // local_size1 * group_id1
-    val = builder->CreateMul(local_size1, group_id1);
-    // local_id1 + local_size1 * group_id1
-    val = builder->CreateAdd(local_id1, val);
-    // (local_id1 + local_size1 * group_id1) * global_size_0
-    Value* offset2 = builder->CreateMul(val, global_size0);
-    // local_size0 * group_id0
-    val = builder->CreateMul(local_size0, group_id0);
-    // local_id0 + local_size0 * group_id0
-    val = builder->CreateAdd(local_id0, val);
-    // The total sum
-    val = builder->CreateAdd(val, offset1);
-    Value* offset = builder->CreateAdd(val, offset2);
 
     /////////////////////////////////////////////////////
     /* calculate index address.
-       index_addr = (index_offset + offset )* sizeof(int) + index_buf_ptr
+       index_addr = (index_offset + wg_offset )* sizeof(int) + index_buf_ptr
        index_offset = global_size2 * global_size1 * global_size0 * printf_num */
 
-    // global_size2 * global_size1
-    op0 = builder->CreateMul(global_size2, global_size1);
-    // global_size2 * global_size1 * global_size0
-    Value* glXg2Xg3 = builder->CreateMul(op0, global_size0);
-    Value* index_offset = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, printf_num));
+    Value* index_offset = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, printf_num));
     // index_offset + offset
-    op0 = builder->CreateAdd(index_offset, offset);
+    op0 = builder->CreateAdd(index_offset, wg_offset);
     // (index_offset + offset)* sizeof(int)
     op0 = builder->CreateMul(op0, ConstantInt::get(intTy, sizeof(int)));
     // Final index address = index_buf_ptr + (index_offset + offset)* sizeof(int)
     op0 = builder->CreateAdd(index_buf_ptr, op0);
     Value* index_addr = builder->CreateIntToPtr(op0, Type::getInt32PtrTy(module->getContext(), 1));
-    builder->CreateStore(ConstantInt::get(intTy, 1), index_addr);// The flag
+    // Load the current printf count first, because the printf may be inside a loop.
+    Value* loop_num = builder->CreateLoad(index_addr);
+    val = builder->CreateAdd(loop_num, ConstantInt::get(intTy, 1));
+    builder->CreateStore(val, index_addr);// The loop number.
 
     int i = 1;
     Value* data_addr = NULL;
-    for (auto &s : *printf_fmt) {
+    for (auto &s : *pInfo.printf_fmt) {
       if (s.type == PRINTF_SLOT_TYPE_STRING)
         continue;
 
-      assert(i < static_cast<int>(call->getNumOperands()) - 1);
+      assert(i < static_cast<int>(pInfo.call->getNumOperands()) - 1);
 
-      Value *out_arg = call->getOperand(i);
+      Value *out_arg = pInfo.call->getOperand(i);
       Type *dst_type = NULL;
       int sizeof_size = 0;
       if (!generateOneParameterInst(s, out_arg, dst_type, sizeof_size)) {
@@ -499,16 +429,23 @@ error:
 
       /////////////////////////////////////////////////////
       /* Calculate the data address.
-      data_addr = data_offset + pbuf_ptr + offset * sizeof(specify)
+      data_addr = (data_offset + pbuf_ptr + offset * sizeof(specify)) +
+               totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
       data_offset = global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset
 
       //global_size2 * global_size1 * global_size0 * out_buf_sizeof_offset */
-      op0 = builder->CreateMul(glXg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
+      op0 = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, out_buf_sizeof_offset));
       //offset * sizeof(specify)
-      val = builder->CreateMul(offset, ConstantInt::get(intTy, sizeof_size));
+      val = builder->CreateMul(wg_offset, ConstantInt::get(intTy, sizeof_size));
       //data_offset + pbuf_ptr
       op0 = builder->CreateAdd(pbuf_ptr, op0);
       op0 = builder->CreateAdd(op0, val);
+      //totalSizeofSize * global_size2 * global_size1 * global_size0
+      val = builder->CreateMul(g1Xg2Xg3, ConstantInt::get(intTy, totalSizeofSize));
+      //totalSizeofSize * global_size2 * global_size1 * global_size0 * loop_num
+      val = builder->CreateMul(val, loop_num);
+      //final
+      op0 = builder->CreateAdd(op0, val);
       data_addr = builder->CreateIntToPtr(op0, dst_type);
       builder->CreateStore(out_arg, data_addr);
 
@@ -520,14 +457,101 @@ error:
                               "__gen_ocl_printf", Type::getVoidTy(module->getContext()),
                               NULL)));
     assert(printfs[printf_inst] == NULL);
-    printfs[printf_inst] = printf_fmt;
+    printfs[printf_inst] = pInfo.printf_fmt;
     printf_num++;
     return true;
   }
 
+  bool PrintfParser::parseOnePrintfInstruction(CallInst * call, PrintfParserInfo& info, int& sizeof_size)
+  { // Parses one printf stub call into `info`; `sizeof_size` receives the summed, 4-byte-padded slot sizes. Returns false on failure.
+    CallSite CS(call);
+    CallSite::arg_iterator CI_FMT = CS.arg_begin();
+    int param_num = 0;
+
+    llvm::Constant* arg0 = dyn_cast<llvm::ConstantExpr>(*CI_FMT); // NULL when the format operand is not a constant expression
+    llvm::Constant* arg0_ptr = arg0 ? dyn_cast<llvm::Constant>(arg0->getOperand(0)) : NULL;
+    if (!arg0_ptr || arg0_ptr->getNumOperands() == 0) { // no constant to read the format string from
+      return false;
+    }
+
+    ConstantDataSequential* fmt_arg = dyn_cast<ConstantDataSequential>(arg0_ptr->getOperand(0));
+    if (!fmt_arg || !fmt_arg->isCString()) {
+      return false;
+    }
+
+    std::string fmt = fmt_arg->getAsCString();
+
+    PrintfSet::PrintfFmt* printf_fmt = NULL;
+
+    if (!(printf_fmt = parser_printf_fmt((char *)fmt.c_str(), param_num))) {//at least print something
+      return false;
+    }
+
+    /* More call arguments than conversion specifiers is an error. */
+    /* str_fmt arg0 arg1 ... NULL */
+    if (param_num + 2 < static_cast<int>(call->getNumOperands())) {
+      delete printf_fmt;
+      return false;
+    }
+
+    info.call = call;
+    info.printf_fmt = printf_fmt; // the parsed format is handed over to the caller through info
+
+    sizeof_size = 0;
+    int i = 1; // index of the call operand matched to the next non-string slot
+    for (auto &s : *printf_fmt) {
+      int sz = 0;
+      if (s.type == PRINTF_SLOT_TYPE_STRING) // literal text slots take no operand and no buffer space
+        continue;
+
+      assert(i < static_cast<int>(call->getNumOperands()) - 1);
+      i++; // advance so the assert above checks every slot, not only the first
+      switch (s.state->conversion_specifier) {
+        case PRINTF_CONVERSION_I:
+        case PRINTF_CONVERSION_D:
+        case PRINTF_CONVERSION_O:
+        case PRINTF_CONVERSION_U:
+        case PRINTF_CONVERSION_x:
+        case PRINTF_CONVERSION_X:
+        case PRINTF_CONVERSION_P:
+          if (s.state->length_modifier == PRINTF_LM_L)
+            sz = sizeof(int64_t);
+          else
+            sz = sizeof(int);
+          break;
+        case PRINTF_CONVERSION_C:
+          sz = sizeof(char);
+          break;
+        case PRINTF_CONVERSION_F:
+        case PRINTF_CONVERSION_f:
+        case PRINTF_CONVERSION_E:
+        case PRINTF_CONVERSION_e:
+        case PRINTF_CONVERSION_G:
+        case PRINTF_CONVERSION_g:
+        case PRINTF_CONVERSION_A:
+        case PRINTF_CONVERSION_a:
+          sz = sizeof(float);
+          break;
+        default: // any other specifier contributes no bytes
+          sz = 0;
+          break;
+      }
+
+      if (s.state->vector_n) { // vector arguments take one element size per lane
+        sz = sz * s.state->vector_n;
+      }
+
+      sizeof_size += ((sz + 3) / 4) * 4; // round each slot up to a multiple of 4 bytes
+    }
+
+    return true;
+  }
+
+
   bool PrintfParser::runOnFunction(llvm::Function &F)
   {
     bool changed = false;
+    bool hasPrintf = false;
     switch (F.getCallingConv()) {
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR <= 2
       case CallingConv::PTX_Device:
@@ -541,6 +565,8 @@ error:
         GBE_ASSERTM(false, "Unsupported calling convention");
     }
 
+    std::vector<PrintfParserInfo> infoVect;
+    totalSizeofSize = 0;
     module = F.getParent();
     intTy = IntegerType::get(module->getContext(), 32);
 
@@ -550,11 +576,14 @@ error:
 
     builder = new IRBuilder<>(module->getContext());
 
-    /* Iter the function and find printf. */
+    /* First find the printfs and calculate the total size of all slots for one loop. */
     for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
       for (BasicBlock::iterator instI = B->begin(),
            instE = B->end(); instI != instE; ++instI) {
 
+        PrintfParserInfo pInfo;
+        int sizeof_size = 0;
+
         llvm::CallInst* call = dyn_cast<llvm::CallInst>(instI);
         if (!call) {
           continue;
@@ -569,26 +598,107 @@ error:
         if (fnName != "__gen_ocl_printf_stub")
           continue;
 
-        changed = true;
-
-        builder->SetInsertPoint(call);
-
-        if (!pbuf_ptr) {
-          /* alloc a new buffer ptr to collect the print output. */
-          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
-          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
-          pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
-        }
-        if (!index_buf_ptr) {
-          Type *ptrTy = Type::getInt32PtrTy(module->getContext());
-          llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
-          index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+        if (!parseOnePrintfInstruction(call, pInfo, sizeof_size)) {
+          printf("Parse One printf inst failed, may have some error\n");
+          // Just kill this printf instruction.
+          deadprintfs.push_back(PrintfInst(cast<Instruction>(call),0));
+          continue;
         }
 
-        deadprintfs.push_back(PrintfInst(cast<Instruction>(call),parseOnePrintfInstruction(call)));
+        hasPrintf = true;
+
+        infoVect.push_back(pInfo);
+        totalSizeofSize += sizeof_size;
       }
     }
 
+    if (!hasPrintf)
+      return changed;
+
+    if (!pbuf_ptr) {
+      /* alloc a new buffer ptr to collect the print output. */
+      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_buf"), ptrTy);
+      pbuf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+    }
+    if (!index_buf_ptr) {
+      Type *ptrTy = Type::getInt32PtrTy(module->getContext());
+      llvm::Constant * pBuf = module->getOrInsertGlobal(StringRef("__gen_ocl_printf_index_buf"), ptrTy);
+      index_buf_ptr = builder->CreatePtrToInt(pBuf, Type::getInt32Ty(module->getContext()));
+    }
+
+    if (!wg_offset || !g1Xg2Xg3) {
+      Value* op0 = NULL;
+      Value* val = NULL;
+
+      builder->SetInsertPoint(F.begin()->begin());// Insert the common var in the begin.
+
+      /* FIXME: Because the OpenCL language does not support va macros, and we do not want
+         to introduce va_list, va_start and va_end into our code, we just simulate
+         the function calls to perform the offset calculation here. */
+#define BUILD_CALL_INST(name) \
+	CallInst* name = builder->CreateCall(cast<llvm::Function>(module->getOrInsertFunction( \
+				 "__gen_ocl_get_"#name, 					\
+				 IntegerType::getInt32Ty(module->getContext()), 		\
+				 NULL)))
+
+      BUILD_CALL_INST(group_id2);
+      BUILD_CALL_INST(group_id1);
+      BUILD_CALL_INST(group_id0);
+      BUILD_CALL_INST(global_size2);
+      BUILD_CALL_INST(global_size1);
+      BUILD_CALL_INST(global_size0);
+      BUILD_CALL_INST(local_id2);
+      BUILD_CALL_INST(local_id1);
+      BUILD_CALL_INST(local_id0);
+      BUILD_CALL_INST(local_size2);
+      BUILD_CALL_INST(local_size1);
+      BUILD_CALL_INST(local_size0);
+
+#undef BUILD_CALL_INST
+
+      /* calculate offset for later usage.
+         wg_offset = ((local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+         + (local_id1 + local_size1 * group_id1) * global_size0
+         + (local_id0 + local_size0 * group_id0))  */
+
+      // local_size2 * group_id2
+      val = builder->CreateMul(local_size2, group_id2);
+      // local_id2 + local_size2 * group_id2
+      val = builder->CreateAdd(local_id2, val);
+      // global_size1 * global_size0
+      op0 = builder->CreateMul(global_size1, global_size0);
+      // (local_id2 + local_size2 * group_id2) * (global_size1 * global_size0)
+      Value* offset1 = builder->CreateMul(val, op0);
+      // local_size1 * group_id1
+      val = builder->CreateMul(local_size1, group_id1);
+      // local_id1 + local_size1 * group_id1
+      val = builder->CreateAdd(local_id1, val);
+      // (local_id1 + local_size1 * group_id1) * global_size_0
+      Value* offset2 = builder->CreateMul(val, global_size0);
+      // local_size0 * group_id0
+      val = builder->CreateMul(local_size0, group_id0);
+      // local_id0 + local_size0 * group_id0
+      val = builder->CreateAdd(local_id0, val);
+      // The total sum
+      val = builder->CreateAdd(val, offset1);
+      wg_offset = builder->CreateAdd(val, offset2);
+
+      // global_size2 * global_size1
+      op0 = builder->CreateMul(global_size2, global_size1);
+      // global_size2 * global_size1 * global_size0
+      g1Xg2Xg3 = builder->CreateMul(op0, global_size0);
+    }
+
+
+    /* Now generate the instructions. */
+    for (auto pInfo : infoVect) {
+      builder->SetInsertPoint(pInfo.call);
+      deadprintfs.push_back(PrintfInst(cast<Instruction>(pInfo.call), generateOnePrintfInstruction(pInfo)));
+    }
+
+    assert(out_buf_sizeof_offset == totalSizeofSize);
+
     /* Replace the instruction's operand if using printf's return value. */
     for (llvm::Function::iterator B = F.begin(), BE = F.end(); B != BE; B++) {
       for (BasicBlock::iterator instI = B->begin(),
@@ -640,14 +750,22 @@ error:
           case PRINTF_CONVERSION_U:
           case PRINTF_CONVERSION_x:
           case PRINTF_CONVERSION_X:
-            /* If the bits change, we need to consider the signed. */
-            if (arg->getType() != Type::getInt32Ty(module->getContext())) {
-              arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
-            }
+            if (slot.state->length_modifier == PRINTF_LM_L) { /* we would rather print long. */
+              if (arg->getType() != Type::getInt64Ty(module->getContext())) {
+                arg = builder->CreateIntCast(arg, Type::getInt64Ty(module->getContext()), sign);
+              }
+              dst_type = Type::getInt64PtrTy(module->getContext(), 1);
+              sizeof_size = sizeof(int64_t);
+            } else {
+              /* If the bits change, we need to consider the signed. */
+              if (arg->getType() != Type::getInt32Ty(module->getContext())) {
+                arg = builder->CreateIntCast(arg, Type::getInt32Ty(module->getContext()), sign);
+              }
 
-            /* Int to Int, just store. */
-            dst_type = Type::getInt32PtrTy(module->getContext(), 1);
-            sizeof_size = sizeof(int);
+              /* Int to Int, just store. */
+              dst_type = Type::getInt32PtrTy(module->getContext(), 1);
+              sizeof_size = sizeof(int);
+            }
             return true;
 
           case PRINTF_CONVERSION_C:
@@ -665,7 +783,7 @@ error:
           case PRINTF_CONVERSION_g:
           case PRINTF_CONVERSION_A:
           case PRINTF_CONVERSION_a:
-            printf("Warning: Have a float paramter for %%d like specifier, take care of it\n");
+            printf("Warning: Have a float parameter for %%d like specifier, take care of it\n");
             arg = builder->CreateSIToFP(arg, Type::getFloatTy(module->getContext()));
             dst_type = Type::getFloatPtrTy(module->getContext(), 1);
             sizeof_size = sizeof(float);
@@ -693,7 +811,7 @@ error:
           case PRINTF_CONVERSION_I:
           case PRINTF_CONVERSION_D:
             /* Float to Int, add a conversion. */
-            printf("Warning: Have a int paramter for %%f like specifier, take care of it\n");
+            printf("Warning: Have a int parameter for %%f like specifier, take care of it\n");
             arg = builder->CreateFPToSI(arg, Type::getInt32Ty(module->getContext()));
             dst_type = Type::getInt32PtrTy(module->getContext(), 1);
             sizeof_size = sizeof(int);
@@ -704,7 +822,7 @@ error:
           case PRINTF_CONVERSION_x:
           case PRINTF_CONVERSION_X:
             /* Float to uint, add a conversion. */
-            printf("Warning: Have a uint paramter for %%f like specifier, take care of it\n");
+            printf("Warning: Have a uint parameter for %%f like specifier, take care of it\n");
             arg = builder->CreateFPToUI(arg, Type::getInt32Ty(module->getContext()));
             dst_type = Type::getInt32PtrTy(module->getContext(), 1);
             sizeof_size = sizeof(int);
@@ -767,6 +885,7 @@ error:
         bool sign = false;
 
         if (vec_num != slot.state->vector_n) {
+          printf("Error The printf vector number is not match!\n");
           return false;
         }
 
@@ -777,26 +896,37 @@ error:
           case PRINTF_CONVERSION_O:
           case PRINTF_CONVERSION_U:
           case PRINTF_CONVERSION_x:
-          case PRINTF_CONVERSION_X:
-            if (elt_type->getTypeID() != Type::IntegerTyID)
+          case PRINTF_CONVERSION_X: {
+            if (elt_type->getTypeID() != Type::IntegerTyID) {
+              printf("Do not support type conversion between float and int in vector printf!\n");
               return false;
+            }
+
+            Type* elt_dst_type = NULL;
+            if (slot.state->length_modifier == PRINTF_LM_L) {
+              elt_dst_type = Type::getInt64Ty(elt_type->getContext());
+            } else {
+              elt_dst_type = Type::getInt32Ty(elt_type->getContext());
+            }
 
             /* If the bits change, we need to consider the signed. */
-            if (elt_type != Type::getInt32Ty(elt_type->getContext())) {
+            if (elt_type != elt_dst_type) {
               Value *II = NULL;
               for (int i = 0; i < vec_num; i++) {
-                Value *vec = II ? II : UndefValue::get(VectorType::get(Type::getInt32Ty(elt_type->getContext()), vec_num));
+                Value *vec = II ? II : UndefValue::get(VectorType::get(elt_dst_type, vec_num));
                 Value *cv = ConstantInt::get(Type::getInt32Ty(elt_type->getContext()), i);
                 Value *org = builder->CreateExtractElement(arg, cv);
-                Value *cvt = builder->CreateIntCast(org, Type::getInt32Ty(module->getContext()), sign);
+                Value *cvt = builder->CreateIntCast(org, elt_dst_type, sign);
                 II = builder->CreateInsertElement(vec, cvt, cv);
               }
               arg = II;
             }
 
             dst_type = arg->getType()->getPointerTo(1);
-            sizeof_size = sizeof(int) * vec_num;
+            sizeof_size = (elt_dst_type == Type::getInt32Ty(elt_type->getContext()) ?
+                           sizeof(int) * vec_num  : sizeof(int64_t) * vec_num);
             return true;
+          }
 
           case PRINTF_CONVERSION_F:
           case PRINTF_CONVERSION_f:
@@ -806,8 +936,10 @@ error:
           case PRINTF_CONVERSION_g:
           case PRINTF_CONVERSION_A:
           case PRINTF_CONVERSION_a:
-            if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID)
+            if (elt_type->getTypeID() != Type::DoubleTyID && elt_type->getTypeID() != Type::FloatTyID) {
+              printf("Do not support type conversion between float and int in vector printf!\n");
               return false;
+            }
 
             if (elt_type->getTypeID() != Type::FloatTyID) {
               Value *II = NULL;
@@ -820,10 +952,14 @@ error:
               }
               arg = II;
             }
+
+            dst_type = arg->getType()->getPointerTo(1);
+            sizeof_size = sizeof(int) * vec_num;
+            return true;
+
+          default:
+            return false;
         }
-        dst_type = arg->getType()->getPointerTo(1);
-        sizeof_size = sizeof(int) * vec_num;
-        return true;
       }
 
       default:
diff --git a/backend/src/llvm/llvm_scalarize.cpp b/backend/src/llvm/llvm_scalarize.cpp
index 3e48fbf..5450a2b 100644
--- a/backend/src/llvm/llvm_scalarize.cpp
+++ b/backend/src/llvm/llvm_scalarize.cpp
@@ -106,18 +106,18 @@ namespace gbe {
 
     void setComponent(int c, llvm::Value* val)
     {
-      assert(c >= 0 && c < 16 && "Out of bounds component");
+      assert(c >= 0 && c < 32 && "Out of bounds component");
       vals[c] = val;
     }
     llvm::Value* getComponent(int c)
     {
-      assert(c >= 0 && c < 16 && "Out of bounds component");
+      assert(c >= 0 && c < 32 && "Out of bounds component");
       assert(vals[c] && "Requesting non-existing component");
       return vals[c];
     }
 
     // {Value* x, Value* y, Value* z, Value* w}
-    llvm::Value* vals[16];
+    llvm::Value* vals[32];
   };
 
   class Scalarize : public FunctionPass {
@@ -441,7 +441,7 @@ namespace gbe {
 
   void Scalarize::makeScalarizedCalls(Function* f, ArrayRef<Value*> args, int count, VectorValues& vVals)
   {
-    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    assert(count > 0 && count <= 32 && "invalid number of vector components");
     for (int i = 0; i < count; ++i) {
       Value* res;
       SmallVector<Value*, 8> callArgs(args.begin(), args.end());
@@ -455,7 +455,7 @@ namespace gbe {
   void Scalarize::makePerComponentScalarizedCalls(Instruction* inst, ArrayRef<Value*> args)
   {
     int count = GetComponentCount(inst);
-    assert(count > 0 && count <= 16 && "invalid number of vector components");
+    assert(count > 0 && count <= 32 && "invalid number of vector components");
     assert((inst->getNumOperands() == args.size() || isa<PHINode>(inst))
            && "not enough arguments passed for instruction");
 
@@ -637,6 +637,13 @@ namespace gbe {
         Value *Callee = call->getCalledValue();
         const std::string fnName = Callee->getName();
         auto it = instrinsicMap.map.find(fnName);
+        // FIXME: we should create a complete error reporting mechanism for
+        // errors found in beignet-managed passes, including the Gen pass.
+        if (it == instrinsicMap.map.end()) {
+          std::cerr << "Unresolved symbol: " << fnName << std::endl;
+          std::cerr << "Aborting..." << std::endl;
+          exit(-1);
+        }
         GBE_ASSERT(it != instrinsicMap.map.end());
 
         // Get the function arguments
diff --git a/backend/src/llvm/llvm_to_gen.cpp b/backend/src/llvm/llvm_to_gen.cpp
index 84ba383..eb75ba1 100644
--- a/backend/src/llvm/llvm_to_gen.cpp
+++ b/backend/src/llvm/llvm_to_gen.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -60,6 +60,8 @@
 #include "llvm/llvm_to_gen.hpp"
 #include "sys/cvar.hpp"
 #include "sys/platform.hpp"
+#include "ir/unit.hpp"
+#include "ir/structural_analysis.hpp"
 
 #include <clang/CodeGen/CodeGenAction.h>
 
@@ -70,10 +72,8 @@
 
 namespace gbe
 {
-  BVAR(OCL_OUTPUT_LLVM, false);
   BVAR(OCL_OUTPUT_CFG, false);
   BVAR(OCL_OUTPUT_CFG_ONLY, false);
-  BVAR(OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS, false);
   using namespace llvm;
 
   void runFuntionPass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL)
@@ -86,13 +86,10 @@ namespace gbe
     FPM.add(new DataLayout(DL));
 #endif
 
-    // XXX remove the verifier pass to workaround a non-fatal error.
-    // add this pass cause the Clang abort with the following error message:
-    // "Global is external, but doesn't have external or weak linkage"
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >=5
-    //FPM.add(createVerifierPass(true));
+    FPM.add(createVerifierPass(true));
 #else
-    //FPM.add(createVerifierPass());
+    FPM.add(createVerifierPass());
 #endif
     FPM.add(new TargetLibraryInfo(*libraryInfo));
     FPM.add(createTypeBasedAliasAnalysisPass());
@@ -110,7 +107,7 @@ namespace gbe
     FPM.doFinalization();
   }
 
-  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL, int optLevel)
+  void runModulePass(Module &mod, TargetLibraryInfo *libraryInfo, const DataLayout &DL, int optLevel, bool strictMath)
   {
     llvm::PassManager MPM;
 
@@ -122,6 +119,7 @@ namespace gbe
     MPM.add(new TargetLibraryInfo(*libraryInfo));
     MPM.add(createTypeBasedAliasAnalysisPass());
     MPM.add(createBasicAliasAnalysisPass());
+    MPM.add(createIntrinsicLoweringPass());
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
 
     MPM.add(createIPSCCPPass());              // IP SCCP
@@ -154,9 +152,24 @@ namespace gbe
     MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
     MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
     MPM.add(createLoopDeletionPass());          // Delete dead loops
-    MPM.add(createLoopUnrollPass());          // Unroll small loops
-    if(optLevel > 0)
+    MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+    if(optLevel > 0) {
+      MPM.add(createSROAPass(/*RequiresDomTree*/ false));
       MPM.add(createGVNPass());                 // Remove redundancies
+    }
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+    // FIXME Workaround: we find that CustomLoopUnroll may increase register pressure greatly,
+    // and it may even make some CL kernels fail to compile because of limited scratch memory for spills.
+    // We observe this under strict math, so we disable CustomLoopUnroll if strict math is enabled.
+    if (!strictMath) {
+      MPM.add(createCustomLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+      MPM.add(createLoopUnrollPass()); //1024, 32, 1024, 512)); //Unroll loops
+      if(optLevel > 0) {
+        MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+        MPM.add(createGVNPass());                 // Remove redundancies
+      }
+    }
+#endif
     MPM.add(createMemCpyOptPass());             // Remove memcpy / form memset
     MPM.add(createSCCPPass());                  // Constant prop with SCCP
 
@@ -178,32 +191,71 @@ namespace gbe
     MPM.run(mod);
   }
 
-  bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module, int optLevel)
+
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#define OUTPUT_BITCODE(STAGE, MOD)  do {         \
+   llvm::PassManager passes__;                   \
+   if (OCL_OUTPUT_LLVM_##STAGE) {                \
+     passes__.add(createPrintModulePass(*o));    \
+     passes__.run(MOD);                          \
+   }                                             \
+ }while(0)
+#else
+#define OUTPUT_BITCODE(STAGE, MOD)  do {         \
+   llvm::PassManager passes__;                   \
+   if (OCL_OUTPUT_LLVM_##STAGE) {                \
+     passes__.add(createPrintModulePass(&*o));   \
+     passes__.run(MOD);                          \
+   }                                             \
+ }while(0)
+#endif
+
+  BVAR(OCL_OUTPUT_LLVM_BEFORE_LINK, false);
+  BVAR(OCL_OUTPUT_LLVM_AFTER_LINK, false);
+  BVAR(OCL_OUTPUT_LLVM_AFTER_GEN, false);
+
+  bool llvmToGen(ir::Unit &unit, const char *fileName,const void* module, int optLevel, bool strictMath)
   {
     std::string errInfo;
     std::unique_ptr<llvm::raw_fd_ostream> o = NULL;
-    if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS || OCL_OUTPUT_LLVM)
+    if (OCL_OUTPUT_LLVM_BEFORE_LINK || OCL_OUTPUT_LLVM_AFTER_LINK || OCL_OUTPUT_LLVM_AFTER_GEN)
       o = std::unique_ptr<llvm::raw_fd_ostream>(new llvm::raw_fd_ostream(fileno(stdout), false));
 
     // Get the module from its file
     llvm::SMDiagnostic Err;
-    std::auto_ptr<Module> M;
-    if(fileName){
-      // only when module is null, Get the global LLVM context
+
+    Module* cl_mod = NULL;
+    if (module) {
+      cl_mod = reinterpret_cast<Module*>(const_cast<void*>(module));
+    } else if (fileName){
       llvm::LLVMContext& c = llvm::getGlobalContext();
-      M.reset(ParseIRFile(fileName, Err, c));
-      if (M.get() == 0) return false;
+      cl_mod = ParseIRFile(fileName, Err, c);
     }
-    Module &mod = (module!=NULL)?*(llvm::Module*)module:*M.get();
+
+    if (!cl_mod) return false;
+
+    OUTPUT_BITCODE(BEFORE_LINK, (*cl_mod));
+
+    std::unique_ptr<Module> M;
+
+    /* Before doing anything, we first filter in all CL functions in the bitcode. */
+    M.reset(runBitCodeLinker(cl_mod, strictMath));
+    if (!module)
+      delete cl_mod;
+    if (M.get() == 0)
+      return true;
+
+    Module &mod = *M.get();
     DataLayout DL(&mod);
 
     Triple TargetTriple(mod.getTargetTriple());
     TargetLibraryInfo *libraryInfo = new TargetLibraryInfo(TargetTriple);
     libraryInfo->disableAllFunctions();
 
-    runFuntionPass(mod, libraryInfo, DL);
-    runModulePass(mod, libraryInfo, DL, optLevel);
+    OUTPUT_BITCODE(AFTER_LINK, mod);
 
+    runFuntionPass(mod, libraryInfo, DL);
+    runModulePass(mod, libraryInfo, DL, optLevel, strictMath);
     llvm::PassManager passes;
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
     passes.add(new DataLayoutPass(DL));
@@ -211,12 +263,6 @@ namespace gbe
     passes.add(new DataLayout(DL));
 #endif
     // Print the code before further optimizations
-    if (OCL_OUTPUT_LLVM_BEFORE_EXTRA_PASS)
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
-      passes.add(createPrintModulePass(*o));
-#else
-      passes.add(createPrintModulePass(&*o));
-#endif
     passes.add(createIntrinsicLoweringPass());
     passes.add(createFunctionInliningPass(200000));
     passes.add(createScalarReplAggregatesPass(64, true, -1, -1, 64));
@@ -229,6 +275,7 @@ namespace gbe
       passes.add(createGVNPass());                  // Remove redundancies
     passes.add(createPrintfParserPass());
     passes.add(createScalarizePass());        // Expand all vector ops
+    passes.add(createLegalizePass());
     passes.add(createDeadInstEliminationPass());  // Remove simplified instructions
     passes.add(createCFGSimplificationPass());     // Merge & remove BBs
     passes.add(createScalarizePass());        // Expand all vector ops
@@ -238,15 +285,22 @@ namespace gbe
     if(OCL_OUTPUT_CFG_ONLY)
       passes.add(createCFGOnlyPrinterPass());
     passes.add(createGenPass(unit));
+    passes.run(mod);
 
     // Print the code extra optimization passes
-    if (OCL_OUTPUT_LLVM)
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
-      passes.add(createPrintModulePass(*o));
-#else
-      passes.add(createPrintModulePass(&*o));
-#endif
-    passes.run(mod);
+    OUTPUT_BITCODE(AFTER_GEN, mod);
+
+    const ir::Unit::FunctionSet& fs = unit.getFunctionSet();
+    ir::Unit::FunctionSet::const_iterator iter = fs.begin();
+    while(iter != fs.end())
+    {
+      analysis::ControlTree *ct = new analysis::ControlTree(iter->second);
+      ct->analyze();
+      delete ct;
+      iter++;
+    }
+
+    delete libraryInfo;
     return true;
   }
 } /* namespace gbe */
diff --git a/backend/src/llvm/llvm_to_gen.hpp b/backend/src/llvm/llvm_to_gen.hpp
index 41e3477..22ffcb4 100644
--- a/backend/src/llvm/llvm_to_gen.hpp
+++ b/backend/src/llvm/llvm_to_gen.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -32,7 +32,7 @@ namespace gbe {
 
   /*! Convert the LLVM IR code to a GEN IR code,
 		  optLevel 0 equal to clang -O1 and 1 equal to clang -O2*/
-  bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module, int optLevel);
+  bool llvmToGen(ir::Unit &unit, const char *fileName, const void* module, int optLevel, bool strictMath);
 
 } /* namespace gbe */
 
diff --git a/backend/src/llvm/llvm_unroll.cpp b/backend/src/llvm/llvm_unroll.cpp
new file mode 100644
index 0000000..7cd7c35
--- /dev/null
+++ b/backend/src/llvm/llvm_unroll.cpp
@@ -0,0 +1,228 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "llvm/Config/llvm-config.h"
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 5
+#include <set>
+#if LLVM_VERSION_MINOR <= 2
+#include "llvm/Function.h"
+#include "llvm/InstrTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
+#else
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#endif  /* LLVM_VERSION_MINOR <= 2 */
+#include "llvm/Pass.h"
+#if LLVM_VERSION_MINOR <= 1
+#include "llvm/Support/IRBuilder.h"
+#elif LLVM_VERSION_MINOR == 2
+#include "llvm/IRBuilder.h"
+#else
+#include "llvm/IR/IRBuilder.h"
+#endif /* LLVM_VERSION_MINOR <= 1 */
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/PassManager.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/llvm_gen_backend.hpp"
+#include "sys/map.hpp"
+
+
+using namespace llvm;
+
+namespace gbe {
+    class CustomLoopUnroll : public LoopPass
+    {
+    public:
+      static char ID;
+      CustomLoopUnroll() :
+       LoopPass(ID) {}
+
+      void getAnalysisUsage(AnalysisUsage &AU) const {
+        AU.addRequired<LoopInfo>();
+        AU.addPreserved<LoopInfo>();
+        AU.addRequiredID(LoopSimplifyID);
+        AU.addPreservedID(LoopSimplifyID);
+        AU.addRequiredID(LCSSAID);
+        AU.addPreservedID(LCSSAID);
+        AU.addRequired<ScalarEvolution>();
+        AU.addPreserved<ScalarEvolution>();
+      // FIXME: Loop unroll requires LCSSA. And LCSSA requires dom info.
+      // If loop unroll does not preserve dom info then LCSSA pass on next
+      // loop will receive invalid dom info.
+      // For now, recreate dom info, if loop is unrolled.
+      AU.addPreserved<DominatorTreeWrapperPass>();
+
+      }
+
+      // Returns the value associated with the given metadata node name (for
+      // example, "llvm.loop.unroll.count").  If no such named metadata node
+      // exists, then nullptr is returned.
+      static const ConstantInt *GetUnrollMetadataValue(const Loop *L,
+                                                     StringRef Name) {
+        MDNode *LoopID = L->getLoopID();
+        if (!LoopID) return nullptr;
+        // First operand should refer to the loop id itself.
+        assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+        assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+        for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+          const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+          if (!MD) continue;
+          const MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+          if (!S) continue;
+          if (Name.equals(S->getString())) {
+            assert(MD->getNumOperands() == 2 &&
+                   "Unroll hint metadata should have two operands.");
+            return cast<ConstantInt>(MD->getOperand(1));
+          }
+        }
+        return nullptr;
+      }
+
+      void setUnrollID(Loop *L, bool enable) {
+        if (!enable && disabledLoops.find(L) != disabledLoops.end())
+           return;
+        LLVMContext &Context = L->getHeader()->getContext();
+        SmallVector<Value *, 2> forceUnroll;
+        forceUnroll.push_back(MDString::get(Context, "llvm.loop.unroll.enable"));
+        forceUnroll.push_back(ConstantInt::get(Type::getInt1Ty(Context), enable));
+        MDNode *forceUnrollNode = MDNode::get(Context, forceUnroll);
+        SmallVector<Value *, 4> Vals;
+        Vals.push_back(NULL);
+        Vals.push_back(forceUnrollNode);
+        MDNode *NewLoopID = MDNode::get(Context, Vals);
+        // Set operand 0 to refer to the loop id itself.
+        NewLoopID->replaceOperandWith(0, NewLoopID);
+        L->setLoopID(NewLoopID);
+        if (!enable)
+          disabledLoops.insert(L);
+      }
+
+      static bool hasPrivateLoadStore(Loop *L) {
+        const std::vector<Loop*> subLoops = L->getSubLoops();
+        std::set<BasicBlock*> subBlocks, blocks;
+
+        for(auto l : subLoops)
+          for(auto bb : l->getBlocks())
+            subBlocks.insert(bb);
+        for(auto bb : L->getBlocks())
+          if (subBlocks.find(bb) == subBlocks.end())
+            blocks.insert(bb);
+        for(auto bb : blocks) {
+          for (BasicBlock::iterator inst = bb->begin(), instE = bb->end(); inst != instE; ++inst) {
+            unsigned addrSpace = -1;
+            if (isa<LoadInst>(*inst)) {
+              LoadInst *ld = cast<LoadInst>(&*inst);
+              addrSpace = ld->getPointerAddressSpace();
+            }
+            else if (isa<StoreInst>(*inst)) {
+              StoreInst *st = cast<StoreInst>(&*inst);
+              addrSpace = st->getPointerAddressSpace();
+            }
+            if (addrSpace == 0)
+              return true;
+          }
+        }
+        return false;
+      }
+      // If a loop has a very large self trip count,
+      // we don't want to unroll it.
+      // Self trip count means the trip count divided by the parent's trip count. For example:
+      // for (int i = 0; i < 16; i++) {
+      //   for (int j = 0; j < 4; j++) {
+      //     for (int k = 0; k < 2; k++) {
+      //       ...
+      //     }
+      //     ...
+      //   }
+      // The inner loops j and k could be unrolled, but loop i will not be unrolled.
+      // A return value of true means that L could be unrolled; otherwise, it could not
+      // be unrolled.
+      bool handleParentLoops(Loop *L, LPPassManager &LPM) {
+        Loop *currL = L;
+        ScalarEvolution *SE = &getAnalysis<ScalarEvolution>();
+        BasicBlock *latchBlock = currL->getLoopLatch();
+        unsigned currTripCount = 0;
+        bool shouldUnroll = true;
+        if (latchBlock)
+          currTripCount = SE->getSmallConstantTripCount(L, latchBlock);
+
+        while(currL) {
+          Loop *parentL = currL->getParentLoop();
+          unsigned parentTripCount = 0;
+          if (parentL) {
+            BasicBlock *parentLatchBlock = parentL->getLoopLatch();
+            if (parentLatchBlock)
+              parentTripCount = SE->getSmallConstantTripCount(parentL, parentLatchBlock);
+          }
+          if ((parentTripCount != 0 && currTripCount / parentTripCount > 16) ||
+              (currTripCount > 32)) {
+            if (currL == L)
+              shouldUnroll = false;
+            setUnrollID(currL, false);
+            if (currL != L)
+              LPM.deleteLoopFromQueue(currL);
+          }
+          currL = parentL;
+          currTripCount = parentTripCount;
+        }
+        return shouldUnroll;
+      }
+
+      // Analyze the outermost BBs of this loop; if there are
+      // any private loads or stores, we change its loop metadata
+      // to indicate more aggressive unrolling on it.
+      virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
+        const ConstantInt *Enable = GetUnrollMetadataValue(L, "llvm.loop.unroll.enable");
+        if (Enable)
+          return false;
+        const ConstantInt *Count = GetUnrollMetadataValue(L, "llvm.loop.unroll.count");
+        if (Count)
+          return false;
+
+        if (!handleParentLoops(L, LPM))
+          return false;
+
+        if (!hasPrivateLoadStore(L))
+          return false;
+        setUnrollID(L, true);
+        return true;
+      }
+
+      virtual const char *getPassName() const {
+        return "SPIR backend: custom loop unrolling pass";
+      }
+    private:
+      std::set<Loop *> disabledLoops;
+
+    };
+
+    char CustomLoopUnroll::ID = 0;
+
+    LoopPass *createCustomLoopUnrollPass() {
+      return new CustomLoopUnroll();
+    }
+} // end namespace
+#endif
diff --git a/backend/src/ocl_as.h b/backend/src/ocl_as.h
deleted file mode 100644
index 692e892..0000000
--- a/backend/src/ocl_as.h
+++ /dev/null
@@ -1,3086 +0,0 @@
-// This file is autogenerated by gen_as.sh.
-// Don't modify it manually.
-union _type_cast_1_b {
-  char _char;
-  uchar _uchar;
-};
-
-INLINE OVERLOADABLE uchar as_uchar(char v) {
-  union _type_cast_1_b u;
-  u._char = v;
-  return u._uchar;
-}
-
-INLINE OVERLOADABLE char as_char(uchar v) {
-  union _type_cast_1_b u;
-  u._uchar = v;
-  return u._char;
-}
-
-union _type_cast_2_b {
-  short _short;
-  ushort _ushort;
-  char2 _char2;
-  uchar2 _uchar2;
-};
-
-INLINE OVERLOADABLE ushort as_ushort(short v) {
-  union _type_cast_2_b u;
-  u._short = v;
-  return u._ushort;
-}
-
-INLINE OVERLOADABLE char2 as_char2(short v) {
-  union _type_cast_2_b u;
-  u._short = v;
-  return u._char2;
-}
-
-INLINE OVERLOADABLE uchar2 as_uchar2(short v) {
-  union _type_cast_2_b u;
-  u._short = v;
-  return u._uchar2;
-}
-
-INLINE OVERLOADABLE short as_short(ushort v) {
-  union _type_cast_2_b u;
-  u._ushort = v;
-  return u._short;
-}
-
-INLINE OVERLOADABLE char2 as_char2(ushort v) {
-  union _type_cast_2_b u;
-  u._ushort = v;
-  return u._char2;
-}
-
-INLINE OVERLOADABLE uchar2 as_uchar2(ushort v) {
-  union _type_cast_2_b u;
-  u._ushort = v;
-  return u._uchar2;
-}
-
-INLINE OVERLOADABLE short as_short(char2 v) {
-  union _type_cast_2_b u;
-  u._char2 = v;
-  return u._short;
-}
-
-INLINE OVERLOADABLE ushort as_ushort(char2 v) {
-  union _type_cast_2_b u;
-  u._char2 = v;
-  return u._ushort;
-}
-
-INLINE OVERLOADABLE uchar2 as_uchar2(char2 v) {
-  union _type_cast_2_b u;
-  u._char2 = v;
-  return u._uchar2;
-}
-
-INLINE OVERLOADABLE short as_short(uchar2 v) {
-  union _type_cast_2_b u;
-  u._uchar2 = v;
-  return u._short;
-}
-
-INLINE OVERLOADABLE ushort as_ushort(uchar2 v) {
-  union _type_cast_2_b u;
-  u._uchar2 = v;
-  return u._ushort;
-}
-
-INLINE OVERLOADABLE char2 as_char2(uchar2 v) {
-  union _type_cast_2_b u;
-  u._uchar2 = v;
-  return u._char2;
-}
-
-union _type_cast_4_b {
-  int _int;
-  uint _uint;
-  short2 _short2;
-  ushort2 _ushort2;
-  char3 _char3;
-  char4 _char4;
-  uchar3 _uchar3;
-  uchar4 _uchar4;
-  float _float;
-};
-
-INLINE OVERLOADABLE uint as_uint(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(int v) {
-  union _type_cast_4_b u;
-  u._int = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE short2 as_short2(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(uint v) {
-  union _type_cast_4_b u;
-  u._uint = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(short2 v) {
-  union _type_cast_4_b u;
-  u._short2 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(ushort2 v) {
-  union _type_cast_4_b u;
-  u._ushort2 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(char3 v) {
-  union _type_cast_4_b u;
-  u._char3 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._uchar4;
-}
-
-INLINE OVERLOADABLE float as_float(char4 v) {
-  union _type_cast_4_b u;
-  u._char4 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE float as_float(uchar3 v) {
-  union _type_cast_4_b u;
-  u._uchar3 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE float as_float(uchar4 v) {
-  union _type_cast_4_b u;
-  u._uchar4 = v;
-  return u._float;
-}
-
-INLINE OVERLOADABLE int as_int(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._int;
-}
-
-INLINE OVERLOADABLE uint as_uint(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._uint;
-}
-
-INLINE OVERLOADABLE short2 as_short2(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._short2;
-}
-
-INLINE OVERLOADABLE ushort2 as_ushort2(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._ushort2;
-}
-
-INLINE OVERLOADABLE char3 as_char3(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._char3;
-}
-
-INLINE OVERLOADABLE char4 as_char4(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._char4;
-}
-
-INLINE OVERLOADABLE uchar3 as_uchar3(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._uchar3;
-}
-
-INLINE OVERLOADABLE uchar4 as_uchar4(float v) {
-  union _type_cast_4_b u;
-  u._float = v;
-  return u._uchar4;
-}
-
-union _type_cast_8_b {
-  long _long;
-  ulong _ulong;
-  int2 _int2;
-  uint2 _uint2;
-  short3 _short3;
-  short4 _short4;
-  ushort3 _ushort3;
-  ushort4 _ushort4;
-  char8 _char8;
-  uchar8 _uchar8;
-  double _double;
-  float2 _float2;
-};
-
-INLINE OVERLOADABLE ulong as_ulong(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(long v) {
-  union _type_cast_8_b u;
-  u._long = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE int2 as_int2(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(ulong v) {
-  union _type_cast_8_b u;
-  u._ulong = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(int2 v) {
-  union _type_cast_8_b u;
-  u._int2 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(uint2 v) {
-  union _type_cast_8_b u;
-  u._uint2 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(short3 v) {
-  union _type_cast_8_b u;
-  u._short3 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(short4 v) {
-  union _type_cast_8_b u;
-  u._short4 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(ushort3 v) {
-  union _type_cast_8_b u;
-  u._ushort3 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(ushort4 v) {
-  union _type_cast_8_b u;
-  u._ushort4 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(char8 v) {
-  union _type_cast_8_b u;
-  u._char8 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE double as_double(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._double;
-}
-
-INLINE OVERLOADABLE float2 as_float2(uchar8 v) {
-  union _type_cast_8_b u;
-  u._uchar8 = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE float2 as_float2(double v) {
-  union _type_cast_8_b u;
-  u._double = v;
-  return u._float2;
-}
-
-INLINE OVERLOADABLE long as_long(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._long;
-}
-
-INLINE OVERLOADABLE ulong as_ulong(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._ulong;
-}
-
-INLINE OVERLOADABLE int2 as_int2(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._int2;
-}
-
-INLINE OVERLOADABLE uint2 as_uint2(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._uint2;
-}
-
-INLINE OVERLOADABLE short3 as_short3(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._short3;
-}
-
-INLINE OVERLOADABLE short4 as_short4(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._short4;
-}
-
-INLINE OVERLOADABLE ushort3 as_ushort3(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._ushort3;
-}
-
-INLINE OVERLOADABLE ushort4 as_ushort4(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._ushort4;
-}
-
-INLINE OVERLOADABLE char8 as_char8(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._char8;
-}
-
-INLINE OVERLOADABLE uchar8 as_uchar8(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._uchar8;
-}
-
-INLINE OVERLOADABLE double as_double(float2 v) {
-  union _type_cast_8_b u;
-  u._float2 = v;
-  return u._double;
-}
-
-union _type_cast_16_b {
-  long2 _long2;
-  ulong2 _ulong2;
-  int3 _int3;
-  int4 _int4;
-  uint3 _uint3;
-  uint4 _uint4;
-  short8 _short8;
-  ushort8 _ushort8;
-  char16 _char16;
-  uchar16 _uchar16;
-  double2 _double2;
-  float3 _float3;
-  float4 _float4;
-};
-
-INLINE OVERLOADABLE ulong2 as_ulong2(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(long2 v) {
-  union _type_cast_16_b u;
-  u._long2 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(ulong2 v) {
-  union _type_cast_16_b u;
-  u._ulong2 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(int3 v) {
-  union _type_cast_16_b u;
-  u._int3 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(int4 v) {
-  union _type_cast_16_b u;
-  u._int4 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(uint3 v) {
-  union _type_cast_16_b u;
-  u._uint3 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(uint4 v) {
-  union _type_cast_16_b u;
-  u._uint4 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(short8 v) {
-  union _type_cast_16_b u;
-  u._short8 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(ushort8 v) {
-  union _type_cast_16_b u;
-  u._ushort8 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(char16 v) {
-  union _type_cast_16_b u;
-  u._char16 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE float3 as_float3(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(uchar16 v) {
-  union _type_cast_16_b u;
-  u._uchar16 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE float3 as_float3(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._float3;
-}
-
-INLINE OVERLOADABLE float4 as_float4(double2 v) {
-  union _type_cast_16_b u;
-  u._double2 = v;
-  return u._float4;
-}
-
-INLINE OVERLOADABLE long2 as_long2(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(float3 v) {
-  union _type_cast_16_b u;
-  u._float3 = v;
-  return u._double2;
-}
-
-INLINE OVERLOADABLE long2 as_long2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._long2;
-}
-
-INLINE OVERLOADABLE ulong2 as_ulong2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._ulong2;
-}
-
-INLINE OVERLOADABLE int3 as_int3(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._int3;
-}
-
-INLINE OVERLOADABLE int4 as_int4(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._int4;
-}
-
-INLINE OVERLOADABLE uint3 as_uint3(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._uint3;
-}
-
-INLINE OVERLOADABLE uint4 as_uint4(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._uint4;
-}
-
-INLINE OVERLOADABLE short8 as_short8(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._short8;
-}
-
-INLINE OVERLOADABLE ushort8 as_ushort8(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._ushort8;
-}
-
-INLINE OVERLOADABLE char16 as_char16(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._char16;
-}
-
-INLINE OVERLOADABLE uchar16 as_uchar16(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._uchar16;
-}
-
-INLINE OVERLOADABLE double2 as_double2(float4 v) {
-  union _type_cast_16_b u;
-  u._float4 = v;
-  return u._double2;
-}
-
-union _type_cast_32_b {
-  long3 _long3;
-  long4 _long4;
-  ulong3 _ulong3;
-  ulong4 _ulong4;
-  int8 _int8;
-  uint8 _uint8;
-  short16 _short16;
-  ushort16 _ushort16;
-  double3 _double3;
-  double4 _double4;
-  float8 _float8;
-};
-
-INLINE OVERLOADABLE ulong3 as_ulong3(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(long3 v) {
-  union _type_cast_32_b u;
-  u._long3 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(long4 v) {
-  union _type_cast_32_b u;
-  u._long4 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(ulong3 v) {
-  union _type_cast_32_b u;
-  u._ulong3 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(ulong4 v) {
-  union _type_cast_32_b u;
-  u._ulong4 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(int8 v) {
-  union _type_cast_32_b u;
-  u._int8 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(uint8 v) {
-  union _type_cast_32_b u;
-  u._uint8 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(short16 v) {
-  union _type_cast_32_b u;
-  u._short16 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._double4;
-}
-
-INLINE OVERLOADABLE float8 as_float8(ushort16 v) {
-  union _type_cast_32_b u;
-  u._ushort16 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE float8 as_float8(double3 v) {
-  union _type_cast_32_b u;
-  u._double3 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE float8 as_float8(double4 v) {
-  union _type_cast_32_b u;
-  u._double4 = v;
-  return u._float8;
-}
-
-INLINE OVERLOADABLE long3 as_long3(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._long3;
-}
-
-INLINE OVERLOADABLE long4 as_long4(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._long4;
-}
-
-INLINE OVERLOADABLE ulong3 as_ulong3(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._ulong3;
-}
-
-INLINE OVERLOADABLE ulong4 as_ulong4(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._ulong4;
-}
-
-INLINE OVERLOADABLE int8 as_int8(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._int8;
-}
-
-INLINE OVERLOADABLE uint8 as_uint8(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._uint8;
-}
-
-INLINE OVERLOADABLE short16 as_short16(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._short16;
-}
-
-INLINE OVERLOADABLE ushort16 as_ushort16(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._ushort16;
-}
-
-INLINE OVERLOADABLE double3 as_double3(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._double3;
-}
-
-INLINE OVERLOADABLE double4 as_double4(float8 v) {
-  union _type_cast_32_b u;
-  u._float8 = v;
-  return u._double4;
-}
-
-union _type_cast_64_b {
-  long8 _long8;
-  ulong8 _ulong8;
-  int16 _int16;
-  uint16 _uint16;
-  double8 _double8;
-  float16 _float16;
-};
-
-INLINE OVERLOADABLE ulong8 as_ulong8(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(long8 v) {
-  union _type_cast_64_b u;
-  u._long8 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(ulong8 v) {
-  union _type_cast_64_b u;
-  u._ulong8 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(int16 v) {
-  union _type_cast_64_b u;
-  u._int16 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._double8;
-}
-
-INLINE OVERLOADABLE float16 as_float16(uint16 v) {
-  union _type_cast_64_b u;
-  u._uint16 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE float16 as_float16(double8 v) {
-  union _type_cast_64_b u;
-  u._double8 = v;
-  return u._float16;
-}
-
-INLINE OVERLOADABLE long8 as_long8(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._long8;
-}
-
-INLINE OVERLOADABLE ulong8 as_ulong8(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._ulong8;
-}
-
-INLINE OVERLOADABLE int16 as_int16(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._int16;
-}
-
-INLINE OVERLOADABLE uint16 as_uint16(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._uint16;
-}
-
-INLINE OVERLOADABLE double8 as_double8(float16 v) {
-  union _type_cast_64_b u;
-  u._float16 = v;
-  return u._double8;
-}
-
-union _type_cast_128_b {
-  long16 _long16;
-  ulong16 _ulong16;
-  double16 _double16;
-};
-
-INLINE OVERLOADABLE ulong16 as_ulong16(long16 v) {
-  union _type_cast_128_b u;
-  u._long16 = v;
-  return u._ulong16;
-}
-
-INLINE OVERLOADABLE double16 as_double16(long16 v) {
-  union _type_cast_128_b u;
-  u._long16 = v;
-  return u._double16;
-}
-
-INLINE OVERLOADABLE long16 as_long16(ulong16 v) {
-  union _type_cast_128_b u;
-  u._ulong16 = v;
-  return u._long16;
-}
-
-INLINE OVERLOADABLE double16 as_double16(ulong16 v) {
-  union _type_cast_128_b u;
-  u._ulong16 = v;
-  return u._double16;
-}
-
-INLINE OVERLOADABLE long16 as_long16(double16 v) {
-  union _type_cast_128_b u;
-  u._double16 = v;
-  return u._long16;
-}
-
-INLINE OVERLOADABLE ulong16 as_ulong16(double16 v) {
-  union _type_cast_128_b u;
-  u._double16 = v;
-  return u._ulong16;
-}
-
diff --git a/backend/src/ocl_convert.h b/backend/src/ocl_convert.h
deleted file mode 100644
index 8326768..0000000
--- a/backend/src/ocl_convert.h
+++ /dev/null
@@ -1,17415 +0,0 @@
-// This file is autogenerated by gen_convert.sh.
-// Don't modify it manually.
-INLINE OVERLOADABLE long convert_long(long v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(long v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(long v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(long v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(long v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(long v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(long v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(long v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(long v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(long v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(ulong v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(ulong v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(ulong v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(ulong v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(ulong v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(ulong v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(ulong v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(ulong v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(ulong v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(ulong v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(int v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(int v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(int v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(int v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(int v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(int v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(int v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(int v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(int v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(int v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(uint v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(uint v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(uint v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(uint v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(uint v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(uint v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(uint v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(uint v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(uint v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(uint v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(short v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(short v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(short v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(short v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(short v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(short v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(short v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(short v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(short v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(short v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(ushort v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(ushort v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(ushort v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(ushort v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(ushort v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(ushort v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(ushort v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(ushort v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(ushort v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(ushort v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(char v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(char v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(char v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(char v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(char v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(char v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(char v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(char v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(char v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(char v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(uchar v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(uchar v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(uchar v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(uchar v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(uchar v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(uchar v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(uchar v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(uchar v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(uchar v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(uchar v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(double v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(double v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(double v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(double v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(double v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(double v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(double v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(double v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(double v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(double v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long convert_long(float v) {
-  return (long)v;
-}
-
-INLINE OVERLOADABLE ulong convert_ulong(float v) {
-  return (ulong)v;
-}
-
-INLINE OVERLOADABLE int convert_int(float v) {
-  return (int)v;
-}
-
-INLINE OVERLOADABLE uint convert_uint(float v) {
-  return (uint)v;
-}
-
-INLINE OVERLOADABLE short convert_short(float v) {
-  return (short)v;
-}
-
-INLINE OVERLOADABLE ushort convert_ushort(float v) {
-  return (ushort)v;
-}
-
-INLINE OVERLOADABLE char convert_char(float v) {
-  return (char)v;
-}
-
-INLINE OVERLOADABLE uchar convert_uchar(float v) {
-  return (uchar)v;
-}
-
-INLINE OVERLOADABLE double convert_double(float v) {
-  return (double)v;
-}
-
-INLINE OVERLOADABLE float convert_float(float v) {
-  return (float)v;
-}
-
-INLINE OVERLOADABLE long2 convert_long2(long2 v) { return v; }
-INLINE OVERLOADABLE ulong2 convert_ulong2(long2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(long2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(long2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(long2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(long2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(long2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(long2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(long2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(long2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(ulong2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(ulong2 v) { return v; }
-INLINE OVERLOADABLE int2 convert_int2(ulong2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(ulong2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(ulong2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(ulong2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(ulong2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(ulong2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(ulong2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(ulong2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(int2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(int2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(int2 v) { return v; }
-INLINE OVERLOADABLE uint2 convert_uint2(int2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(int2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(int2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(int2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(int2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(int2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(int2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(uint2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(uint2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(uint2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(uint2 v) { return v; }
-INLINE OVERLOADABLE short2 convert_short2(uint2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(uint2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(uint2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(uint2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(uint2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(uint2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(short2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(short2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(short2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(short2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(short2 v) { return v; }
-INLINE OVERLOADABLE ushort2 convert_ushort2(short2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(short2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(short2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(short2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(short2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(ushort2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(ushort2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(ushort2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(ushort2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(ushort2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(ushort2 v) { return v; }
-INLINE OVERLOADABLE char2 convert_char2(ushort2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(ushort2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(ushort2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(ushort2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(char2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(char2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(char2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(char2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(char2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(char2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(char2 v) { return v; }
-INLINE OVERLOADABLE uchar2 convert_uchar2(char2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(char2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(char2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(uchar2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(uchar2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(uchar2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(uchar2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(uchar2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(uchar2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(uchar2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(uchar2 v) { return v; }
-INLINE OVERLOADABLE double2 convert_double2(uchar2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(uchar2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(double2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(double2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(double2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(double2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(double2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(double2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(double2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(double2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(double2 v) { return v; }
-INLINE OVERLOADABLE float2 convert_float2(double2 v) {
-  return (float2)((float)(v.s0), (float)(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2(float2 v) {
-  return (long2)((long)(v.s0), (long)(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2(float2 v) {
-  return (ulong2)((ulong)(v.s0), (ulong)(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2(float2 v) {
-  return (int2)((int)(v.s0), (int)(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2(float2 v) {
-  return (uint2)((uint)(v.s0), (uint)(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2(float2 v) {
-  return (short2)((short)(v.s0), (short)(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2(float2 v) {
-  return (ushort2)((ushort)(v.s0), (ushort)(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2(float2 v) {
-  return (char2)((char)(v.s0), (char)(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2(float2 v) {
-  return (uchar2)((uchar)(v.s0), (uchar)(v.s1));
-}
-
-INLINE OVERLOADABLE double2 convert_double2(float2 v) {
-  return (double2)((double)(v.s0), (double)(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2(float2 v) { return v; }
-INLINE OVERLOADABLE long3 convert_long3(long3 v) { return v; }
-INLINE OVERLOADABLE ulong3 convert_ulong3(long3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(long3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(long3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(long3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(long3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(long3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(long3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(long3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(long3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(ulong3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(ulong3 v) { return v; }
-INLINE OVERLOADABLE int3 convert_int3(ulong3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(ulong3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(ulong3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(ulong3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(ulong3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(ulong3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(ulong3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(ulong3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(int3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(int3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(int3 v) { return v; }
-INLINE OVERLOADABLE uint3 convert_uint3(int3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(int3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(int3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(int3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(int3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(int3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(int3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(uint3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(uint3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(uint3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(uint3 v) { return v; }
-INLINE OVERLOADABLE short3 convert_short3(uint3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(uint3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(uint3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(uint3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(uint3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(uint3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(short3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(short3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(short3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(short3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(short3 v) { return v; }
-INLINE OVERLOADABLE ushort3 convert_ushort3(short3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(short3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(short3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(short3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(short3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(ushort3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(ushort3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(ushort3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(ushort3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(ushort3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(ushort3 v) { return v; }
-INLINE OVERLOADABLE char3 convert_char3(ushort3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(ushort3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(ushort3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(ushort3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(char3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(char3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(char3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(char3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(char3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(char3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(char3 v) { return v; }
-INLINE OVERLOADABLE uchar3 convert_uchar3(char3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(char3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(char3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(uchar3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(uchar3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(uchar3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(uchar3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(uchar3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(uchar3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(uchar3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(uchar3 v) { return v; }
-INLINE OVERLOADABLE double3 convert_double3(uchar3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(uchar3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(double3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(double3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(double3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(double3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(double3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(double3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(double3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(double3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(double3 v) { return v; }
-INLINE OVERLOADABLE float3 convert_float3(double3 v) {
-  return (float3)((float)(v.s0), (float)(v.s1), (float)(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3(float3 v) {
-  return (long3)((long)(v.s0), (long)(v.s1), (long)(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3(float3 v) {
-  return (ulong3)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3(float3 v) {
-  return (int3)((int)(v.s0), (int)(v.s1), (int)(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3(float3 v) {
-  return (uint3)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3(float3 v) {
-  return (short3)((short)(v.s0), (short)(v.s1), (short)(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3(float3 v) {
-  return (ushort3)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3(float3 v) {
-  return (char3)((char)(v.s0), (char)(v.s1), (char)(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3(float3 v) {
-  return (uchar3)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2));
-}
-
-INLINE OVERLOADABLE double3 convert_double3(float3 v) {
-  return (double3)((double)(v.s0), (double)(v.s1), (double)(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3(float3 v) { return v; }
-INLINE OVERLOADABLE long4 convert_long4(long4 v) { return v; }
-INLINE OVERLOADABLE ulong4 convert_ulong4(long4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(long4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(long4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(long4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(long4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(long4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(long4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(long4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(long4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(ulong4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(ulong4 v) { return v; }
-INLINE OVERLOADABLE int4 convert_int4(ulong4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(ulong4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(ulong4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(ulong4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(ulong4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(ulong4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(ulong4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(ulong4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(int4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(int4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(int4 v) { return v; }
-INLINE OVERLOADABLE uint4 convert_uint4(int4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(int4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(int4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(int4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(int4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(int4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(int4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(uint4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(uint4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(uint4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(uint4 v) { return v; }
-INLINE OVERLOADABLE short4 convert_short4(uint4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(uint4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(uint4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(uint4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(uint4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(uint4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(short4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(short4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(short4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(short4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(short4 v) { return v; }
-INLINE OVERLOADABLE ushort4 convert_ushort4(short4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(short4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(short4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(short4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(short4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(ushort4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(ushort4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(ushort4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(ushort4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(ushort4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(ushort4 v) { return v; }
-INLINE OVERLOADABLE char4 convert_char4(ushort4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(ushort4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(ushort4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(ushort4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(char4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(char4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(char4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(char4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(char4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(char4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(char4 v) { return v; }
-INLINE OVERLOADABLE uchar4 convert_uchar4(char4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(char4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(char4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(uchar4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(uchar4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(uchar4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(uchar4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(uchar4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(uchar4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(uchar4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(uchar4 v) { return v; }
-INLINE OVERLOADABLE double4 convert_double4(uchar4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(uchar4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(double4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(double4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(double4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(double4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(double4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(double4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(double4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(double4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(double4 v) { return v; }
-INLINE OVERLOADABLE float4 convert_float4(double4 v) {
-  return (float4)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4(float4 v) {
-  return (long4)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4(float4 v) {
-  return (ulong4)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4(float4 v) {
-  return (int4)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4(float4 v) {
-  return (uint4)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4(float4 v) {
-  return (short4)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4(float4 v) {
-  return (ushort4)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4(float4 v) {
-  return (char4)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4(float4 v) {
-  return (uchar4)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3));
-}
-
-INLINE OVERLOADABLE double4 convert_double4(float4 v) {
-  return (double4)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4(float4 v) { return v; }
-INLINE OVERLOADABLE long8 convert_long8(long8 v) { return v; }
-INLINE OVERLOADABLE ulong8 convert_ulong8(long8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(long8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(long8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(long8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(long8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(long8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(long8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(long8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(long8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(ulong8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(ulong8 v) { return v; }
-INLINE OVERLOADABLE int8 convert_int8(ulong8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(ulong8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(ulong8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(ulong8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(ulong8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(ulong8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(ulong8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(ulong8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(int8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(int8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(int8 v) { return v; }
-INLINE OVERLOADABLE uint8 convert_uint8(int8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(int8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(int8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(int8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(int8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(int8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(int8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(uint8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(uint8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(uint8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(uint8 v) { return v; }
-INLINE OVERLOADABLE short8 convert_short8(uint8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(uint8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(uint8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(uint8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(uint8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(uint8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(short8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(short8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(short8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(short8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(short8 v) { return v; }
-INLINE OVERLOADABLE ushort8 convert_ushort8(short8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(short8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(short8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(short8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(short8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(ushort8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(ushort8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(ushort8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(ushort8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(ushort8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(ushort8 v) { return v; }
-INLINE OVERLOADABLE char8 convert_char8(ushort8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(ushort8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(ushort8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(ushort8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(char8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(char8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(char8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(char8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(char8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(char8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(char8 v) { return v; }
-INLINE OVERLOADABLE uchar8 convert_uchar8(char8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(char8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(char8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(uchar8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(uchar8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(uchar8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(uchar8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(uchar8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(uchar8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(uchar8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(uchar8 v) { return v; }
-INLINE OVERLOADABLE double8 convert_double8(uchar8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(uchar8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(double8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(double8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(double8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(double8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(double8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(double8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(double8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(double8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(double8 v) { return v; }
-INLINE OVERLOADABLE float8 convert_float8(double8 v) {
-  return (float8)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8(float8 v) {
-  return (long8)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8(float8 v) {
-  return (ulong8)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8(float8 v) {
-  return (int8)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8(float8 v) {
-  return (uint8)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8(float8 v) {
-  return (short8)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8(float8 v) {
-  return (ushort8)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8(float8 v) {
-  return (char8)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8(float8 v) {
-  return (uchar8)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7));
-}
-
-INLINE OVERLOADABLE double8 convert_double8(float8 v) {
-  return (double8)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8(float8 v) { return v; }
-INLINE OVERLOADABLE long16 convert_long16(long16 v) { return v; }
-INLINE OVERLOADABLE ulong16 convert_ulong16(long16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(long16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(long16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(long16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(long16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(long16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(long16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(long16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(long16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(ulong16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(ulong16 v) { return v; }
-INLINE OVERLOADABLE int16 convert_int16(ulong16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(ulong16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(ulong16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(ulong16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(ulong16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(ulong16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(ulong16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(ulong16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(int16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(int16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(int16 v) { return v; }
-INLINE OVERLOADABLE uint16 convert_uint16(int16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(int16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(int16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(int16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(int16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(int16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(int16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(uint16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(uint16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(uint16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(uint16 v) { return v; }
-INLINE OVERLOADABLE short16 convert_short16(uint16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(uint16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(uint16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(uint16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(uint16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(uint16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(short16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(short16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(short16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(short16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(short16 v) { return v; }
-INLINE OVERLOADABLE ushort16 convert_ushort16(short16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(short16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(short16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(short16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(short16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(ushort16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(ushort16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(ushort16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(ushort16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(ushort16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(ushort16 v) { return v; }
-INLINE OVERLOADABLE char16 convert_char16(ushort16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(ushort16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(ushort16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(ushort16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(char16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(char16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(char16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(char16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(char16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(char16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(char16 v) { return v; }
-INLINE OVERLOADABLE uchar16 convert_uchar16(char16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(char16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(char16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(uchar16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(uchar16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(uchar16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(uchar16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(uchar16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(uchar16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(uchar16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(uchar16 v) { return v; }
-INLINE OVERLOADABLE double16 convert_double16(uchar16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(uchar16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(double16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(double16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(double16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(double16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(double16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(double16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(double16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(double16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(double16 v) { return v; }
-INLINE OVERLOADABLE float16 convert_float16(double16 v) {
-  return (float16)((float)(v.s0), (float)(v.s1), (float)(v.s2), (float)(v.s3), (float)(v.s4), (float)(v.s5), (float)(v.s6), (float)(v.s7), (float)(v.s8), (float)(v.s9), (float)(v.sA), (float)(v.sB), (float)(v.sC), (float)(v.sD), (float)(v.sE), (float)(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16(float16 v) {
-  return (long16)((long)(v.s0), (long)(v.s1), (long)(v.s2), (long)(v.s3), (long)(v.s4), (long)(v.s5), (long)(v.s6), (long)(v.s7), (long)(v.s8), (long)(v.s9), (long)(v.sA), (long)(v.sB), (long)(v.sC), (long)(v.sD), (long)(v.sE), (long)(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16(float16 v) {
-  return (ulong16)((ulong)(v.s0), (ulong)(v.s1), (ulong)(v.s2), (ulong)(v.s3), (ulong)(v.s4), (ulong)(v.s5), (ulong)(v.s6), (ulong)(v.s7), (ulong)(v.s8), (ulong)(v.s9), (ulong)(v.sA), (ulong)(v.sB), (ulong)(v.sC), (ulong)(v.sD), (ulong)(v.sE), (ulong)(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16(float16 v) {
-  return (int16)((int)(v.s0), (int)(v.s1), (int)(v.s2), (int)(v.s3), (int)(v.s4), (int)(v.s5), (int)(v.s6), (int)(v.s7), (int)(v.s8), (int)(v.s9), (int)(v.sA), (int)(v.sB), (int)(v.sC), (int)(v.sD), (int)(v.sE), (int)(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16(float16 v) {
-  return (uint16)((uint)(v.s0), (uint)(v.s1), (uint)(v.s2), (uint)(v.s3), (uint)(v.s4), (uint)(v.s5), (uint)(v.s6), (uint)(v.s7), (uint)(v.s8), (uint)(v.s9), (uint)(v.sA), (uint)(v.sB), (uint)(v.sC), (uint)(v.sD), (uint)(v.sE), (uint)(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16(float16 v) {
-  return (short16)((short)(v.s0), (short)(v.s1), (short)(v.s2), (short)(v.s3), (short)(v.s4), (short)(v.s5), (short)(v.s6), (short)(v.s7), (short)(v.s8), (short)(v.s9), (short)(v.sA), (short)(v.sB), (short)(v.sC), (short)(v.sD), (short)(v.sE), (short)(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16(float16 v) {
-  return (ushort16)((ushort)(v.s0), (ushort)(v.s1), (ushort)(v.s2), (ushort)(v.s3), (ushort)(v.s4), (ushort)(v.s5), (ushort)(v.s6), (ushort)(v.s7), (ushort)(v.s8), (ushort)(v.s9), (ushort)(v.sA), (ushort)(v.sB), (ushort)(v.sC), (ushort)(v.sD), (ushort)(v.sE), (ushort)(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16(float16 v) {
-  return (char16)((char)(v.s0), (char)(v.s1), (char)(v.s2), (char)(v.s3), (char)(v.s4), (char)(v.s5), (char)(v.s6), (char)(v.s7), (char)(v.s8), (char)(v.s9), (char)(v.sA), (char)(v.sB), (char)(v.sC), (char)(v.sD), (char)(v.sE), (char)(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16(float16 v) {
-  return (uchar16)((uchar)(v.s0), (uchar)(v.s1), (uchar)(v.s2), (uchar)(v.s3), (uchar)(v.s4), (uchar)(v.s5), (uchar)(v.s6), (uchar)(v.s7), (uchar)(v.s8), (uchar)(v.s9), (uchar)(v.sA), (uchar)(v.sB), (uchar)(v.sC), (uchar)(v.sD), (uchar)(v.sE), (uchar)(v.sF));
-}
-
-INLINE OVERLOADABLE double16 convert_double16(float16 v) {
-  return (double16)((double)(v.s0), (double)(v.s1), (double)(v.s2), (double)(v.s3), (double)(v.s4), (double)(v.s5), (double)(v.s6), (double)(v.s7), (double)(v.s8), (double)(v.s9), (double)(v.sA), (double)(v.sB), (double)(v.sC), (double)(v.sD), (double)(v.sE), (double)(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16(float16 v) { return v; }
-
-#define DEF(DSTTYPE, SRCTYPE) \
-  OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x);
-DEF(char, uchar);
-DEF(char, short);
-DEF(char, ushort);
-DEF(char, int);
-DEF(char, uint);
-DEF(char, float);
-DEF(uchar, char);
-DEF(uchar, short);
-DEF(uchar, ushort);
-DEF(uchar, int);
-DEF(uchar, uint);
-DEF(uchar, float);
-DEF(short, ushort);
-DEF(short, int);
-DEF(short, uint);
-DEF(short, float);
-DEF(ushort, short);
-DEF(ushort, int);
-DEF(ushort, uint);
-DEF(ushort, float);
-DEF(int, uint);
-DEF(int, float);
-DEF(uint, int);
-DEF(uint, float);
-#undef DEF
-
-#define DEF(DSTTYPE, SRCTYPE, MIN, MAX) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x >= MAX ? (DSTTYPE)MAX : x <= MIN ? (DSTTYPE)MIN : x; \
-  }
-DEF(char, long, -128, 127);
-DEF(uchar, long, 0, 255);
-DEF(short, long, -32768, 32767);
-DEF(ushort, long, 0, 65535);
-DEF(int, long, -0x7fffffff-1, 0x7fffffff);
-DEF(uint, long, 0, 0xffffffffu);
-DEF(long, float, -9.223372036854776e+18f, 9.223372036854776e+18f);
-DEF(ulong, float, 0, 1.8446744073709552e+19f);
-#undef DEF
-
-#define DEF(DSTTYPE, SRCTYPE, MAX) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x >= MAX ? (DSTTYPE)MAX : x; \
-  }
-DEF(char, ulong, 127);
-DEF(uchar, ulong, 255);
-DEF(short, ulong, 32767);
-DEF(ushort, ulong, 65535);
-DEF(int, ulong, 0x7fffffff);
-DEF(uint, ulong, 0xffffffffu);
-#undef DEF
-
-INLINE_OVERLOADABLE long convert_long_sat(ulong x) {
-  ulong MAX = 0x7ffffffffffffffful;
-  return x >= MAX ? MAX : x;
-}
-
-#define DEF(DSTTYPE, SRCTYPE) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x <= 0 ? 0 : x; \
-  }
-DEF(ushort, char);
-DEF(uint, char);
-DEF(uint, short);
-DEF(ulong, char);
-DEF(ulong, short);
-DEF(ulong, int);
-DEF(ulong, long);
-#undef DEF
-
-#define DEF(DSTTYPE, SRCTYPE) \
-  INLINE_OVERLOADABLE DSTTYPE convert_ ## DSTTYPE ## _sat(SRCTYPE x) { \
-    return x; \
-  }
-DEF(char, char);
-DEF(uchar, uchar);
-DEF(short, char);
-DEF(short, uchar);
-DEF(short, short);
-DEF(ushort, uchar);
-DEF(ushort, ushort);
-DEF(int, char);
-DEF(int, uchar);
-DEF(int, short);
-DEF(int, ushort);
-DEF(int, int);
-DEF(uint, uchar);
-DEF(uint, ushort);
-DEF(uint, uint);
-DEF(long, char);
-DEF(long, uchar);
-DEF(long, short);
-DEF(long, ushort);
-DEF(long, int);
-DEF(long, uint);
-DEF(long, long);
-DEF(ulong, uchar);
-DEF(ulong, ushort);
-DEF(ulong, uint);
-DEF(ulong, ulong);
-#undef DEF
-
-INLINE OVERLOADABLE long2 convert_long2_sat(long2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(long2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(long2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(long2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(long2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(long2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(long2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(long2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(ulong2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ulong2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(ulong2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(ulong2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(ulong2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ulong2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(ulong2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ulong2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(int2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(int2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(int2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(int2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(int2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(int2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(int2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(int2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(uint2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uint2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(uint2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(uint2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(uint2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uint2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(uint2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uint2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(short2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(short2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(short2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(short2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(short2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(short2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(short2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(short2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(ushort2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(ushort2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(ushort2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(ushort2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(ushort2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(ushort2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(ushort2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(ushort2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(char2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(char2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(char2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(char2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(char2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(char2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(char2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(char2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(uchar2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(uchar2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(uchar2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(uchar2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(uchar2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(uchar2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(uchar2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(uchar2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat(float2 v) {
-  return (long2)(convert_long_sat(v.s0), convert_long_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat(float2 v) {
-  return (ulong2)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat(float2 v) {
-  return (int2)(convert_int_sat(v.s0), convert_int_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat(float2 v) {
-  return (uint2)(convert_uint_sat(v.s0), convert_uint_sat(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat(float2 v) {
-  return (short2)(convert_short_sat(v.s0), convert_short_sat(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat(float2 v) {
-  return (ushort2)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat(float2 v) {
-  return (char2)(convert_char_sat(v.s0), convert_char_sat(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat(float2 v) {
-  return (uchar2)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(long3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(long3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(long3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(long3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(long3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(long3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(long3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(long3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(ulong3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ulong3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(ulong3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(ulong3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(ulong3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ulong3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(ulong3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ulong3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(int3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(int3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(int3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(int3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(int3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(int3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(int3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(int3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(uint3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uint3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(uint3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(uint3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(uint3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uint3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(uint3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uint3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(short3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(short3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(short3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(short3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(short3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(short3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(short3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(short3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(ushort3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(ushort3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(ushort3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(ushort3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(ushort3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(ushort3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(ushort3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(ushort3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(char3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(char3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(char3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(char3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(char3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(char3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(char3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(char3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(uchar3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(uchar3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(uchar3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(uchar3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(uchar3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(uchar3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(uchar3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(uchar3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat(float3 v) {
-  return (long3)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat(float3 v) {
-  return (ulong3)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat(float3 v) {
-  return (int3)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat(float3 v) {
-  return (uint3)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat(float3 v) {
-  return (short3)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat(float3 v) {
-  return (ushort3)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat(float3 v) {
-  return (char3)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat(float3 v) {
-  return (uchar3)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(long4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(long4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(long4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(long4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(long4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(long4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(long4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(long4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(ulong4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ulong4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(ulong4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(ulong4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(ulong4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ulong4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(ulong4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ulong4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(int4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(int4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(int4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(int4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(int4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(int4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(int4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(int4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(uint4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uint4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(uint4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(uint4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(uint4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uint4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(uint4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uint4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(short4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(short4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(short4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(short4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(short4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(short4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(short4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(short4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(ushort4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(ushort4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(ushort4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(ushort4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(ushort4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(ushort4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(ushort4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(ushort4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(char4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(char4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(char4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(char4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(char4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(char4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(char4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(char4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(uchar4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(uchar4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(uchar4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(uchar4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(uchar4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(uchar4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(uchar4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(uchar4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat(float4 v) {
-  return (long4)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat(float4 v) {
-  return (ulong4)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat(float4 v) {
-  return (int4)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat(float4 v) {
-  return (uint4)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat(float4 v) {
-  return (short4)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat(float4 v) {
-  return (ushort4)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat(float4 v) {
-  return (char4)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat(float4 v) {
-  return (uchar4)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(long8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(long8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(long8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(long8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(long8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(long8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(long8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(long8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(ulong8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ulong8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(ulong8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(ulong8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(ulong8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ulong8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(ulong8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ulong8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(int8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(int8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(int8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(int8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(int8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(int8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(int8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(int8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(uint8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uint8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(uint8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(uint8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(uint8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uint8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(uint8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uint8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(short8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(short8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(short8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(short8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(short8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(short8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(short8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(short8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(ushort8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(ushort8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(ushort8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(ushort8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(ushort8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(ushort8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(ushort8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(ushort8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(char8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(char8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(char8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(char8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(char8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(char8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(char8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(char8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(uchar8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(uchar8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(uchar8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(uchar8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(uchar8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(uchar8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(uchar8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(uchar8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat(float8 v) {
-  return (long8)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat(float8 v) {
-  return (ulong8)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat(float8 v) {
-  return (int8)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat(float8 v) {
-  return (uint8)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat(float8 v) {
-  return (short8)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat(float8 v) {
-  return (ushort8)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat(float8 v) {
-  return (char8)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat(float8 v) {
-  return (uchar8)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat(long16 v) {
-  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(long16 v) {
-  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(long16 v) {
-  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(long16 v) {
-  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(long16 v) {
-  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(long16 v) {
-  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(long16 v) {
-  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(long16 v) {
-  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat(ulong16 v) {
-  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ulong16 v) {
-  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(ulong16 v) {
-  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(ulong16 v) {
-  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(ulong16 v) {
-  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ulong16 v) {
-  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(ulong16 v) {
-  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ulong16 v) {
-  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat(int16 v) {
-  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(int16 v) {
-  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(int16 v) {
-  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(int16 v) {
-  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(int16 v) {
-  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(int16 v) {
-  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(int16 v) {
-  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(int16 v) {
-  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat(uint16 v) {
-  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uint16 v) {
-  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(uint16 v) {
-  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(uint16 v) {
-  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(uint16 v) {
-  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uint16 v) {
-  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(uint16 v) {
-  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uint16 v) {
-  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat(short16 v) {
-  return (long16)(convert_long_sat(v.s0), convert_long_sat(v.s1), convert_long_sat(v.s2), convert_long_sat(v.s3), convert_long_sat(v.s4), convert_long_sat(v.s5), convert_long_sat(v.s6), convert_long_sat(v.s7), convert_long_sat(v.s8), convert_long_sat(v.s9), convert_long_sat(v.sA), convert_long_sat(v.sB), convert_long_sat(v.sC), convert_long_sat(v.sD), convert_long_sat(v.sE), convert_long_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(short16 v) {
-  return (ulong16)(convert_ulong_sat(v.s0), convert_ulong_sat(v.s1), convert_ulong_sat(v.s2), convert_ulong_sat(v.s3), convert_ulong_sat(v.s4), convert_ulong_sat(v.s5), convert_ulong_sat(v.s6), convert_ulong_sat(v.s7), convert_ulong_sat(v.s8), convert_ulong_sat(v.s9), convert_ulong_sat(v.sA), convert_ulong_sat(v.sB), convert_ulong_sat(v.sC), convert_ulong_sat(v.sD), convert_ulong_sat(v.sE), convert_ulong_sat(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(short16 v) {
-  return (int16)(convert_int_sat(v.s0), convert_int_sat(v.s1), convert_int_sat(v.s2), convert_int_sat(v.s3), convert_int_sat(v.s4), convert_int_sat(v.s5), convert_int_sat(v.s6), convert_int_sat(v.s7), convert_int_sat(v.s8), convert_int_sat(v.s9), convert_int_sat(v.sA), convert_int_sat(v.sB), convert_int_sat(v.sC), convert_int_sat(v.sD), convert_int_sat(v.sE), convert_int_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(short16 v) {
-  return (uint16)(convert_uint_sat(v.s0), convert_uint_sat(v.s1), convert_uint_sat(v.s2), convert_uint_sat(v.s3), convert_uint_sat(v.s4), convert_uint_sat(v.s5), convert_uint_sat(v.s6), convert_uint_sat(v.s7), convert_uint_sat(v.s8), convert_uint_sat(v.s9), convert_uint_sat(v.sA), convert_uint_sat(v.sB), convert_uint_sat(v.sC), convert_uint_sat(v.sD), convert_uint_sat(v.sE), convert_uint_sat(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(short16 v) {
-  return (short16)(convert_short_sat(v.s0), convert_short_sat(v.s1), convert_short_sat(v.s2), convert_short_sat(v.s3), convert_short_sat(v.s4), convert_short_sat(v.s5), convert_short_sat(v.s6), convert_short_sat(v.s7), convert_short_sat(v.s8), convert_short_sat(v.s9), convert_short_sat(v.sA), convert_short_sat(v.sB), convert_short_sat(v.sC), convert_short_sat(v.sD), convert_short_sat(v.sE), convert_short_sat(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(short16 v) {
-  return (ushort16)(convert_ushort_sat(v.s0), convert_ushort_sat(v.s1), convert_ushort_sat(v.s2), convert_ushort_sat(v.s3), convert_ushort_sat(v.s4), convert_ushort_sat(v.s5), convert_ushort_sat(v.s6), convert_ushort_sat(v.s7), convert_ushort_sat(v.s8), convert_ushort_sat(v.s9), convert_ushort_sat(v.sA), convert_ushort_sat(v.sB), convert_ushort_sat(v.sC), convert_ushort_sat(v.sD), convert_ushort_sat(v.sE), convert_ushort_sat(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(short16 v) {
-  return (char16)(convert_char_sat(v.s0), convert_char_sat(v.s1), convert_char_sat(v.s2), convert_char_sat(v.s3), convert_char_sat(v.s4), convert_char_sat(v.s5), convert_char_sat(v.s6), convert_char_sat(v.s7), convert_char_sat(v.s8), convert_char_sat(v.s9), convert_char_sat(v.sA), convert_char_sat(v.sB), convert_char_sat(v.sC), convert_char_sat(v.sD), convert_char_sat(v.sE), convert_char_sat(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(short16 v) {
-  return (uchar16)(convert_uchar_sat(v.s0), convert_uchar_sat(v.s1), convert_uchar_sat(v.s2), convert_uchar_sat(v.s3), convert_uchar_sat(v.s4), convert_uchar_sat(v.s5), convert_uchar_sat(v.s6), convert_uchar_sat(v.s7), convert_uchar_sat(v.s8), convert_uchar_sat(v.s9), convert_uchar_sat(v.sA), convert_uchar_sat(v.sB), convert_uchar_sat(v.sC), convert_uchar_sat(v.sD), convert_uchar_sat(v.sE), convert_uchar_sat(v.sF));
-}
-
-// Saturating conversions from ushort16 to every 16-lane integer vector type.
-// Each function converts lane by lane through the matching scalar
-// convert_<type>_sat overload; CVT16_LANES spells out the 16 lanes in order.
-#define CVT16_LANES(vec_t, cvt, src) \
-  (vec_t)(cvt((src).s0), cvt((src).s1), cvt((src).s2), cvt((src).s3), \
-          cvt((src).s4), cvt((src).s5), cvt((src).s6), cvt((src).s7), \
-          cvt((src).s8), cvt((src).s9), cvt((src).sA), cvt((src).sB), \
-          cvt((src).sC), cvt((src).sD), cvt((src).sE), cvt((src).sF))
-
-INLINE OVERLOADABLE long16 convert_long16_sat(ushort16 v) {
-  return CVT16_LANES(long16, convert_long_sat, v);
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(ushort16 v) {
-  return CVT16_LANES(ulong16, convert_ulong_sat, v);
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(ushort16 v) {
-  return CVT16_LANES(int16, convert_int_sat, v);
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(ushort16 v) {
-  return CVT16_LANES(uint16, convert_uint_sat, v);
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(ushort16 v) {
-  return CVT16_LANES(short16, convert_short_sat, v);
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(ushort16 v) {
-  return CVT16_LANES(ushort16, convert_ushort_sat, v);
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(ushort16 v) {
-  return CVT16_LANES(char16, convert_char_sat, v);
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(ushort16 v) {
-  return CVT16_LANES(uchar16, convert_uchar_sat, v);
-}
-#undef CVT16_LANES
-
-// Saturating conversions from char16 to every 16-lane integer vector type.
-// Each function converts lane by lane through the matching scalar
-// convert_<type>_sat overload; CVT16_LANES spells out the 16 lanes in order.
-#define CVT16_LANES(vec_t, cvt, src) \
-  (vec_t)(cvt((src).s0), cvt((src).s1), cvt((src).s2), cvt((src).s3), \
-          cvt((src).s4), cvt((src).s5), cvt((src).s6), cvt((src).s7), \
-          cvt((src).s8), cvt((src).s9), cvt((src).sA), cvt((src).sB), \
-          cvt((src).sC), cvt((src).sD), cvt((src).sE), cvt((src).sF))
-
-INLINE OVERLOADABLE long16 convert_long16_sat(char16 v) {
-  return CVT16_LANES(long16, convert_long_sat, v);
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(char16 v) {
-  return CVT16_LANES(ulong16, convert_ulong_sat, v);
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(char16 v) {
-  return CVT16_LANES(int16, convert_int_sat, v);
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(char16 v) {
-  return CVT16_LANES(uint16, convert_uint_sat, v);
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(char16 v) {
-  return CVT16_LANES(short16, convert_short_sat, v);
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(char16 v) {
-  return CVT16_LANES(ushort16, convert_ushort_sat, v);
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(char16 v) {
-  return CVT16_LANES(char16, convert_char_sat, v);
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(char16 v) {
-  return CVT16_LANES(uchar16, convert_uchar_sat, v);
-}
-#undef CVT16_LANES
-
-// Saturating conversions from uchar16 to every 16-lane integer vector type.
-// Each function converts lane by lane through the matching scalar
-// convert_<type>_sat overload; CVT16_LANES spells out the 16 lanes in order.
-#define CVT16_LANES(vec_t, cvt, src) \
-  (vec_t)(cvt((src).s0), cvt((src).s1), cvt((src).s2), cvt((src).s3), \
-          cvt((src).s4), cvt((src).s5), cvt((src).s6), cvt((src).s7), \
-          cvt((src).s8), cvt((src).s9), cvt((src).sA), cvt((src).sB), \
-          cvt((src).sC), cvt((src).sD), cvt((src).sE), cvt((src).sF))
-
-INLINE OVERLOADABLE long16 convert_long16_sat(uchar16 v) {
-  return CVT16_LANES(long16, convert_long_sat, v);
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(uchar16 v) {
-  return CVT16_LANES(ulong16, convert_ulong_sat, v);
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(uchar16 v) {
-  return CVT16_LANES(int16, convert_int_sat, v);
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(uchar16 v) {
-  return CVT16_LANES(uint16, convert_uint_sat, v);
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(uchar16 v) {
-  return CVT16_LANES(short16, convert_short_sat, v);
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(uchar16 v) {
-  return CVT16_LANES(ushort16, convert_ushort_sat, v);
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(uchar16 v) {
-  return CVT16_LANES(char16, convert_char_sat, v);
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(uchar16 v) {
-  return CVT16_LANES(uchar16, convert_uchar_sat, v);
-}
-#undef CVT16_LANES
-
-// Saturating conversions from float16 to every 16-lane integer vector type.
-// Each function converts lane by lane through the matching scalar
-// convert_<type>_sat overload; CVT16_LANES spells out the 16 lanes in order.
-#define CVT16_LANES(vec_t, cvt, src) \
-  (vec_t)(cvt((src).s0), cvt((src).s1), cvt((src).s2), cvt((src).s3), \
-          cvt((src).s4), cvt((src).s5), cvt((src).s6), cvt((src).s7), \
-          cvt((src).s8), cvt((src).s9), cvt((src).sA), cvt((src).sB), \
-          cvt((src).sC), cvt((src).sD), cvt((src).sE), cvt((src).sF))
-
-INLINE OVERLOADABLE long16 convert_long16_sat(float16 v) {
-  return CVT16_LANES(long16, convert_long_sat, v);
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat(float16 v) {
-  return CVT16_LANES(ulong16, convert_ulong_sat, v);
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat(float16 v) {
-  return CVT16_LANES(int16, convert_int_sat, v);
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat(float16 v) {
-  return CVT16_LANES(uint16, convert_uint_sat, v);
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat(float16 v) {
-  return CVT16_LANES(short16, convert_short_sat, v);
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat(float16 v) {
-  return CVT16_LANES(ushort16, convert_ushort_sat, v);
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat(float16 v) {
-  return CVT16_LANES(char16, convert_char_sat, v);
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat(float16 v) {
-  return CVT16_LANES(uchar16, convert_uchar_sat, v);
-}
-#undef CVT16_LANES
-
-
-// Forward declarations of four float -> float helpers; their definitions are
-// not in this part of the file.
-// NOTE(review): the rndz / rnde / rndu / rndd suffixes presumably stand for
-// round toward zero / to nearest even / up / down -- confirm against the
-// backend that implements them.
-float __gen_ocl_rndz(float x);
-float __gen_ocl_rnde(float x);
-float __gen_ocl_rndu(float x);
-float __gen_ocl_rndd(float x);
-// Converts a long to float, rounding toward zero.
-// The value is first converted with the default rounding, then read back as
-// an integer; if the magnitude grew, the raw bit pattern is stepped down by
-// one, which for a non-zero IEEE-754 float is the neighbour closer to zero.
-INLINE_OVERLOADABLE float __convert_float_rtz(long x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  long back = bits.val;
-  // Did the default conversion move the value away from zero?
-  int grew = (x > 0 && back > x) || (x < 0 && back < x);
-  // Inputs at or above 0x7fffffc000000000 are always stepped back, without
-  // relying on the read-back value.
-  int near_top = x >= 0x7fffffc000000000;
-  if (grew || near_top)
-    bits.raw -= 1;
-  return bits.val;
-}
-// Converts a long to float, rounding toward positive infinity.
-// The default conversion is adjusted by one step of the raw bit pattern when
-// it landed below x.
-INLINE_OVERLOADABLE float __convert_float_rtp(long x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  // Compare in the integer domain; comparing the float with x directly would
-  // convert x to float again.
-  long back = bits.val;
-  // Nothing to fix when the result is not below x, or when x is at or above
-  // 0x7fffffc000000000.
-  if (back >= x || x >= 0x7fffffc000000000)
-    return bits.val;
-  // Positive values grow in magnitude, the others shrink: both move up.
-  if (x > 0)
-    ++bits.raw;
-  else
-    --bits.raw;
-  return bits.val;
-}
-// Converts a long to float, rounding toward negative infinity.
-// The default conversion is adjusted by one step of the raw bit pattern when
-// it landed above x.
-INLINE_OVERLOADABLE float __convert_float_rtn(long x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  long back = bits.val;
-  // Keep the default result when it is not above x and x is below
-  // 0x7fffffc000000000 (inputs from that threshold on are always adjusted).
-  if (back <= x && x < 0x7fffffc000000000)
-    return bits.val;
-  // Positive values shrink in magnitude, the others grow: both move down.
-  if (x > 0)
-    --bits.raw;
-  else
-    ++bits.raw;
-  return bits.val;
-}
-// Converts a ulong to float, rounding toward zero.
-// The default conversion is stepped down by one in its raw bit pattern when
-// it overshot x.
-INLINE_OVERLOADABLE float __convert_float_rtz(ulong x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  ulong back = bits.val;
-  int overshoot = back > x;
-  // Inputs at or above 0xffffff8000000000 are always stepped back.
-  int near_top = x >= 0xffffff8000000000;
-  if (overshoot || near_top)
-    bits.raw -= 1;
-  return bits.val;
-}
-// Converts a ulong to float, rounding toward positive infinity.
-// The default conversion is stepped up by one in its raw bit pattern when it
-// landed below x.
-INLINE_OVERLOADABLE float __convert_float_rtp(ulong x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  // Compare in the integer domain; comparing the float with x directly would
-  // convert x to float again.
-  ulong back = bits.val;
-  // Inputs at or above 0xffffff8000000000 are left as converted.
-  if (x < 0xffffff8000000000 && back < x)
-    ++bits.raw;
-  return bits.val;
-}
-// Converts a ulong to float, rounding toward negative infinity.
-// An unsigned input is never negative, so this is the same as rounding
-// toward zero and simply forwards to that overload.
-INLINE_OVERLOADABLE float __convert_float_rtn(ulong x)
-{
-  float toward_zero = __convert_float_rtz(x);
-  return toward_zero;
-}
-// Converts an int to float, rounding toward zero.
-// The default conversion is stepped down by one in its raw bit pattern when
-// its magnitude came out larger than that of x.
-INLINE_OVERLOADABLE float __convert_float_rtz(int x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  // Read back into a 64-bit integer so the comparison is done on integers.
-  long back = bits.val;
-  int grew_pos = x > 0 && back > x;
-  int grew_neg = x < 0 && back < x;
-  if (grew_pos || grew_neg)
-    bits.raw -= 1;
-  return bits.val;
-}
-// Converts an int to float, rounding toward positive infinity.
-//
-// x: value to convert.
-// Returns the default conversion of x, moved one step toward +infinity in
-// its raw bit pattern when that conversion landed below x.
-INLINE_OVERLOADABLE float __convert_float_rtp(int x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  // Read the float back into a 64-bit integer: for x close to INT_MAX the
-  // default conversion may round up to 2^31, which does not fit in an int,
-  // and an out-of-range float-to-int conversion has no portable result.
-  // The _rtz and _rtn overloads for int use long here for the same reason.
-  long i = u.f;
-  if(i < x) {
-    // Rounded down: positive values grow in magnitude, the others shrink.
-    if(x > 0)
-      u.u += 1;
-    else
-      u.u -= 1;
-  }
-  return u.f;
-}
-// Converts an int to float, rounding toward negative infinity.
-// The default conversion is adjusted by one step of the raw bit pattern when
-// it landed above x.
-INLINE_OVERLOADABLE float __convert_float_rtn(int x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  // 64-bit read-back so the conversion from float cannot overflow.
-  long back = bits.val;
-  if (back <= x)
-    return bits.val;
-  // Rounded up: positive values shrink in magnitude, the others grow.
-  if (x > 0)
-    --bits.raw;
-  else
-    ++bits.raw;
-  return bits.val;
-}
-// Converts a uint to float, rounding toward zero.
-// The default conversion is stepped down by one in its raw bit pattern when
-// it overshot x.
-INLINE_OVERLOADABLE float __convert_float_rtz(uint x)
-{
-  union { uint raw; float val; } bits;
-  bits.val = x;
-  // 64-bit read-back of the converted value.
-  ulong back = bits.val;
-  int overshoot = back > x;
-  if (overshoot)
-    bits.raw -= 1;
-  return bits.val;
-}
-// Converts a uint to float, rounding toward positive infinity.
-//
-// x: value to convert.
-// Returns the default conversion of x, moved one step up in its raw bit
-// pattern when that conversion landed below x.
-INLINE_OVERLOADABLE float __convert_float_rtp(uint x)
-{
-  union {
-    uint u;
-    float f;
-  } u;
-  u.f = x;
-  // Read the float back into a 64-bit integer: for x close to UINT_MAX the
-  // default conversion may round up to 2^32, which does not fit in a uint,
-  // and an out-of-range float-to-integer conversion has no portable result.
-  // The _rtz overload for uint uses ulong here as well.
-  ulong i = u.f;
-  if(i < x)
-    u.u += 1;
-  return u.f;
-}
-// Converts a uint to float, rounding toward negative infinity.
-// An unsigned input is never negative, so this is the same as rounding
-// toward zero and simply forwards to that overload.
-INLINE_OVERLOADABLE float __convert_float_rtn(uint x)
-{
-  float toward_zero = __convert_float_rtz(x);
-  return toward_zero;
-}
-
-// Rounding-mode conversions from long.
-// For integer destinations the rounding mode has nothing to act on, so all
-// four variants (_rte/_rtz/_rtp/_rtn) return the argument through the normal
-// implicit conversion. DEF_CVT_ROUNDINGS emits those four variants.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, long)
-DEF_CVT_ROUNDINGS(ulong, long)
-DEF_CVT_ROUNDINGS(int, long)
-DEF_CVT_ROUNDINGS(uint, long)
-DEF_CVT_ROUNDINGS(short, long)
-DEF_CVT_ROUNDINGS(ushort, long)
-DEF_CVT_ROUNDINGS(char, long)
-DEF_CVT_ROUNDINGS(uchar, long)
-#undef DEF_CVT_ROUNDINGS
-// long -> float: _rte is the plain conversion, the directed modes go through
-// the __convert_float_* helpers.
-INLINE_OVERLOADABLE float convert_float_rte(long x) { return x; }
-INLINE_OVERLOADABLE float convert_float_rtz(long x) { return __convert_float_rtz(x); }
-INLINE_OVERLOADABLE float convert_float_rtp(long x) { return __convert_float_rtp(x); }
-INLINE_OVERLOADABLE float convert_float_rtn(long x) { return __convert_float_rtn(x); }
-// Rounding-mode conversions from ulong.
-// For integer destinations the rounding mode has nothing to act on, so all
-// four variants (_rte/_rtz/_rtp/_rtn) return the argument through the normal
-// implicit conversion. DEF_CVT_ROUNDINGS emits those four variants.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, ulong)
-DEF_CVT_ROUNDINGS(ulong, ulong)
-DEF_CVT_ROUNDINGS(int, ulong)
-DEF_CVT_ROUNDINGS(uint, ulong)
-DEF_CVT_ROUNDINGS(short, ulong)
-DEF_CVT_ROUNDINGS(ushort, ulong)
-DEF_CVT_ROUNDINGS(char, ulong)
-DEF_CVT_ROUNDINGS(uchar, ulong)
-#undef DEF_CVT_ROUNDINGS
-// ulong -> float: _rte is the plain conversion, the directed modes go
-// through the __convert_float_* helpers.
-INLINE_OVERLOADABLE float convert_float_rte(ulong x) { return x; }
-INLINE_OVERLOADABLE float convert_float_rtz(ulong x) { return __convert_float_rtz(x); }
-INLINE_OVERLOADABLE float convert_float_rtp(ulong x) { return __convert_float_rtp(x); }
-INLINE_OVERLOADABLE float convert_float_rtn(ulong x) { return __convert_float_rtn(x); }
-// Rounding-mode conversions from int.
-// For integer destinations the rounding mode has nothing to act on, so all
-// four variants (_rte/_rtz/_rtp/_rtn) return the argument through the normal
-// implicit conversion. DEF_CVT_ROUNDINGS emits those four variants.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, int)
-DEF_CVT_ROUNDINGS(ulong, int)
-DEF_CVT_ROUNDINGS(int, int)
-DEF_CVT_ROUNDINGS(uint, int)
-DEF_CVT_ROUNDINGS(short, int)
-DEF_CVT_ROUNDINGS(ushort, int)
-DEF_CVT_ROUNDINGS(char, int)
-DEF_CVT_ROUNDINGS(uchar, int)
-#undef DEF_CVT_ROUNDINGS
-// int -> float: _rte is the plain conversion, the directed modes go through
-// the __convert_float_* helpers.
-INLINE_OVERLOADABLE float convert_float_rte(int x) { return x; }
-INLINE_OVERLOADABLE float convert_float_rtz(int x) { return __convert_float_rtz(x); }
-INLINE_OVERLOADABLE float convert_float_rtp(int x) { return __convert_float_rtp(x); }
-INLINE_OVERLOADABLE float convert_float_rtn(int x) { return __convert_float_rtn(x); }
-// Rounding-mode conversions from uint.
-// For integer destinations the rounding mode has nothing to act on, so all
-// four variants (_rte/_rtz/_rtp/_rtn) return the argument through the normal
-// implicit conversion. DEF_CVT_ROUNDINGS emits those four variants.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, uint)
-DEF_CVT_ROUNDINGS(ulong, uint)
-DEF_CVT_ROUNDINGS(int, uint)
-DEF_CVT_ROUNDINGS(uint, uint)
-DEF_CVT_ROUNDINGS(short, uint)
-DEF_CVT_ROUNDINGS(ushort, uint)
-DEF_CVT_ROUNDINGS(char, uint)
-DEF_CVT_ROUNDINGS(uchar, uint)
-#undef DEF_CVT_ROUNDINGS
-// uint -> float: _rte is the plain conversion, the directed modes go through
-// the __convert_float_* helpers.
-INLINE_OVERLOADABLE float convert_float_rte(uint x) { return x; }
-INLINE_OVERLOADABLE float convert_float_rtz(uint x) { return __convert_float_rtz(x); }
-INLINE_OVERLOADABLE float convert_float_rtp(uint x) { return __convert_float_rtp(x); }
-INLINE_OVERLOADABLE float convert_float_rtn(uint x) { return __convert_float_rtn(x); }
-// Rounding-mode conversions from short.
-// Every destination, float included, returns the argument through the normal
-// implicit conversion in all four variants (_rte/_rtz/_rtp/_rtn).
-// DEF_CVT_ROUNDINGS emits those four variants for one destination type.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, short)
-DEF_CVT_ROUNDINGS(ulong, short)
-DEF_CVT_ROUNDINGS(int, short)
-DEF_CVT_ROUNDINGS(uint, short)
-DEF_CVT_ROUNDINGS(short, short)
-DEF_CVT_ROUNDINGS(ushort, short)
-DEF_CVT_ROUNDINGS(char, short)
-DEF_CVT_ROUNDINGS(uchar, short)
-DEF_CVT_ROUNDINGS(float, short)
-#undef DEF_CVT_ROUNDINGS
-// Rounding-mode conversions from ushort.
-// Every destination, float included, returns the argument through the normal
-// implicit conversion in all four variants (_rte/_rtz/_rtp/_rtn).
-// DEF_CVT_ROUNDINGS emits those four variants for one destination type.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, ushort)
-DEF_CVT_ROUNDINGS(ulong, ushort)
-DEF_CVT_ROUNDINGS(int, ushort)
-DEF_CVT_ROUNDINGS(uint, ushort)
-DEF_CVT_ROUNDINGS(short, ushort)
-DEF_CVT_ROUNDINGS(ushort, ushort)
-DEF_CVT_ROUNDINGS(char, ushort)
-DEF_CVT_ROUNDINGS(uchar, ushort)
-DEF_CVT_ROUNDINGS(float, ushort)
-#undef DEF_CVT_ROUNDINGS
-// Rounding-mode conversions from char.
-// Every destination, float included, returns the argument through the normal
-// implicit conversion in all four variants (_rte/_rtz/_rtp/_rtn).
-// DEF_CVT_ROUNDINGS emits those four variants for one destination type.
-#define DEF_CVT_ROUNDINGS(dst_t, src_t) \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rte(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtz(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtp(src_t x) { return x; } \
-  INLINE_OVERLOADABLE dst_t convert_##dst_t##_rtn(src_t x) { return x; }
-DEF_CVT_ROUNDINGS(long, char)
-DEF_CVT_ROUNDINGS(ulong, char)
-DEF_CVT_ROUNDINGS(int, char)
-DEF_CVT_ROUNDINGS(uint, char)
-DEF_CVT_ROUNDINGS(short, char)
-DEF_CVT_ROUNDINGS(ushort, char)
-DEF_CVT_ROUNDINGS(char, char)
-DEF_CVT_ROUNDINGS(uchar, char)
-DEF_CVT_ROUNDINGS(float, char)
-#undef DEF_CVT_ROUNDINGS
-INLINE_OVERLOADABLE long convert_long_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE long convert_long_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE long convert_long_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE long convert_long_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ulong convert_ulong_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ulong convert_ulong_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ulong convert_ulong_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ulong convert_ulong_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE int convert_int_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE int convert_int_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE int convert_int_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE int convert_int_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uint convert_uint_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uint convert_uint_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uint convert_uint_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uint convert_uint_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE short convert_short_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE short convert_short_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE short convert_short_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE short convert_short_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ushort convert_ushort_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ushort convert_ushort_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ushort convert_ushort_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE ushort convert_ushort_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE char convert_char_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE char convert_char_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE char convert_char_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE char convert_char_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uchar convert_uchar_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uchar convert_uchar_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uchar convert_uchar_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE uchar convert_uchar_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rte(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rtz(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rtp(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rtn(uchar x)
-{ return x; }
-INLINE_OVERLOADABLE long convert_long_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE long convert_long_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE long convert_long_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE long convert_long_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE int convert_int_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE int convert_int_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE int convert_int_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE int convert_int_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE uint convert_uint_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE uint convert_uint_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE uint convert_uint_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE uint convert_uint_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE short convert_short_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE short convert_short_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE short convert_short_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE short convert_short_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE char convert_char_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE char convert_char_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE char convert_char_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE char convert_char_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_rte(float x)
-{ return __gen_ocl_rnde(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_rtz(float x)
-{ return __gen_ocl_rndz(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_rtp(float x)
-{ return __gen_ocl_rndu(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_rtn(float x)
-{ return __gen_ocl_rndd(x); }
-INLINE_OVERLOADABLE float convert_float_rte(float x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rtz(float x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rtp(float x)
-{ return x; }
-INLINE_OVERLOADABLE float convert_float_rtn(float x)
-{ return x; }
-INLINE OVERLOADABLE long2 convert_long2_rte(long2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(long2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(long2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(long2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(long2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(long2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(long2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(long2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(long2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(long2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(long2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(long2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(long2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(long2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(long2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(long2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(long2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(long2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(long2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(long2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(long2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(long2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(long2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(long2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(long2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(long2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(long2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(long2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(long2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(long2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(long2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(long2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(long2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(long2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(long2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(long2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(ulong2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(ulong2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(ulong2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(ulong2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ulong2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ulong2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ulong2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ulong2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(ulong2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(ulong2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(ulong2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(ulong2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(ulong2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(ulong2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(ulong2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(ulong2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(ulong2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(ulong2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(ulong2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(ulong2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ulong2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ulong2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ulong2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ulong2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(ulong2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(ulong2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(ulong2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(ulong2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ulong2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ulong2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ulong2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ulong2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(ulong2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(ulong2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(ulong2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(ulong2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(int2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(int2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(int2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(int2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(int2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(int2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(int2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(int2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(int2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(int2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(int2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(int2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(int2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(int2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(int2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(int2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(int2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(int2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(int2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(int2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(int2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(int2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(int2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(int2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(int2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(int2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(int2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(int2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(int2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(int2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(int2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(int2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(int2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(int2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(int2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(int2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(uint2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(uint2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(uint2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(uint2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(uint2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(uint2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(uint2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(uint2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(uint2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(uint2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(uint2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(uint2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(uint2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(uint2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(uint2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(uint2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(uint2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(uint2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(uint2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(uint2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(uint2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(uint2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(uint2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(uint2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(uint2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(uint2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(uint2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(uint2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(uint2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(uint2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(uint2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(uint2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(uint2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(uint2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(uint2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(uint2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(short2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(short2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(short2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(short2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(short2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(short2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(short2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(short2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(short2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(short2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(short2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(short2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(short2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(short2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(short2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(short2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(short2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(short2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(short2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(short2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(short2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(short2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(short2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(short2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(short2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(short2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(short2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(short2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(short2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(short2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(short2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(short2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(short2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(short2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(short2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(short2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(ushort2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(ushort2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(ushort2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(ushort2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(ushort2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(ushort2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(ushort2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(ushort2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(ushort2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(ushort2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(ushort2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(ushort2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(ushort2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(ushort2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(ushort2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(ushort2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(ushort2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(ushort2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(ushort2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(ushort2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(ushort2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(ushort2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(ushort2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(ushort2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(ushort2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(ushort2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(ushort2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(ushort2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(ushort2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(ushort2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(ushort2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(ushort2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(ushort2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(ushort2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(ushort2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(ushort2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(char2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(char2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(char2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(char2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(char2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(char2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(char2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(char2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(char2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(char2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(char2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(char2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(char2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(char2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(char2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(char2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(char2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(char2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(char2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(char2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(char2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(char2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(char2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(char2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(char2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(char2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(char2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(char2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(char2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(char2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(char2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(char2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(char2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(char2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(char2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(char2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(uchar2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(uchar2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(uchar2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(uchar2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(uchar2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(uchar2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(uchar2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(uchar2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(uchar2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(uchar2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(uchar2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(uchar2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(uchar2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(uchar2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(uchar2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(uchar2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(uchar2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(uchar2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(uchar2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(uchar2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(uchar2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(uchar2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(uchar2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(uchar2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(uchar2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(uchar2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(uchar2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(uchar2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(uchar2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(uchar2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(uchar2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(uchar2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(uchar2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(uchar2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(uchar2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(uchar2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rte(float2 v) {
-  return (long2)(convert_long_rte(v.s0), convert_long_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtz(float2 v) {
-  return (long2)(convert_long_rtz(v.s0), convert_long_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtp(float2 v) {
-  return (long2)(convert_long_rtp(v.s0), convert_long_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_rtn(float2 v) {
-  return (long2)(convert_long_rtn(v.s0), convert_long_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rte(float2 v) {
-  return (ulong2)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtz(float2 v) {
-  return (ulong2)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtp(float2 v) {
-  return (ulong2)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_rtn(float2 v) {
-  return (ulong2)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rte(float2 v) {
-  return (int2)(convert_int_rte(v.s0), convert_int_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtz(float2 v) {
-  return (int2)(convert_int_rtz(v.s0), convert_int_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtp(float2 v) {
-  return (int2)(convert_int_rtp(v.s0), convert_int_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_rtn(float2 v) {
-  return (int2)(convert_int_rtn(v.s0), convert_int_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rte(float2 v) {
-  return (uint2)(convert_uint_rte(v.s0), convert_uint_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtz(float2 v) {
-  return (uint2)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtp(float2 v) {
-  return (uint2)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_rtn(float2 v) {
-  return (uint2)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rte(float2 v) {
-  return (short2)(convert_short_rte(v.s0), convert_short_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtz(float2 v) {
-  return (short2)(convert_short_rtz(v.s0), convert_short_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtp(float2 v) {
-  return (short2)(convert_short_rtp(v.s0), convert_short_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_rtn(float2 v) {
-  return (short2)(convert_short_rtn(v.s0), convert_short_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rte(float2 v) {
-  return (ushort2)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtz(float2 v) {
-  return (ushort2)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtp(float2 v) {
-  return (ushort2)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_rtn(float2 v) {
-  return (ushort2)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rte(float2 v) {
-  return (char2)(convert_char_rte(v.s0), convert_char_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtz(float2 v) {
-  return (char2)(convert_char_rtz(v.s0), convert_char_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtp(float2 v) {
-  return (char2)(convert_char_rtp(v.s0), convert_char_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_rtn(float2 v) {
-  return (char2)(convert_char_rtn(v.s0), convert_char_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rte(float2 v) {
-  return (uchar2)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtz(float2 v) {
-  return (uchar2)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtp(float2 v) {
-  return (uchar2)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_rtn(float2 v) {
-  return (uchar2)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rte(float2 v) {
-  return (float2)(convert_float_rte(v.s0), convert_float_rte(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtz(float2 v) {
-  return (float2)(convert_float_rtz(v.s0), convert_float_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtp(float2 v) {
-  return (float2)(convert_float_rtp(v.s0), convert_float_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE float2 convert_float2_rtn(float2 v) {
-  return (float2)(convert_float_rtn(v.s0), convert_float_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(long3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(long3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(long3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(long3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(long3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(long3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(long3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(long3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(long3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(long3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(long3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(long3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(long3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(long3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(long3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(long3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(long3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(long3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(long3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(long3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(long3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(long3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(long3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(long3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(long3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(long3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(long3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(long3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(long3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(long3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(long3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(long3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(long3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(long3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(long3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(long3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(ulong3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(ulong3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(ulong3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(ulong3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ulong3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ulong3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ulong3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ulong3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(ulong3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(ulong3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(ulong3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(ulong3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(ulong3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(ulong3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(ulong3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(ulong3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(ulong3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(ulong3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(ulong3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(ulong3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ulong3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ulong3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ulong3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ulong3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(ulong3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(ulong3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(ulong3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(ulong3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ulong3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ulong3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ulong3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ulong3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(ulong3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(ulong3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(ulong3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(ulong3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(int3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(int3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(int3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(int3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(int3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(int3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(int3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(int3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(int3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(int3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(int3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(int3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(int3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(int3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(int3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(int3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(int3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(int3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(int3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(int3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(int3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(int3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(int3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(int3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(int3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(int3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(int3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(int3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(int3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(int3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(int3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(int3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(int3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(int3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(int3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(int3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(uint3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(uint3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(uint3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(uint3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uint3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uint3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uint3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uint3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(uint3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(uint3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(uint3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(uint3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(uint3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(uint3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(uint3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(uint3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(uint3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(uint3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(uint3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(uint3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uint3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uint3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uint3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uint3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(uint3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(uint3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(uint3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(uint3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uint3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uint3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uint3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uint3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(uint3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(uint3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(uint3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(uint3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(short3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(short3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(short3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(short3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(short3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(short3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(short3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(short3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(short3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(short3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(short3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(short3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(short3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(short3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(short3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(short3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(short3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(short3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(short3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(short3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(short3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(short3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(short3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(short3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(short3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(short3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(short3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(short3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(short3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(short3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(short3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(short3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(short3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(short3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(short3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(short3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(ushort3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(ushort3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(ushort3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(ushort3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(ushort3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(ushort3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(ushort3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(ushort3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(ushort3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(ushort3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(ushort3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(ushort3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(ushort3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(ushort3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(ushort3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(ushort3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(ushort3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(ushort3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(ushort3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(ushort3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(ushort3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(ushort3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(ushort3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(ushort3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(ushort3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(ushort3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(ushort3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(ushort3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(ushort3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(ushort3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(ushort3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(ushort3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(ushort3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(ushort3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(ushort3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(ushort3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(char3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(char3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(char3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(char3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(char3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(char3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(char3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(char3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(char3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(char3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(char3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(char3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(char3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(char3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(char3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(char3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(char3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(char3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(char3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(char3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(char3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(char3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(char3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(char3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(char3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(char3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(char3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(char3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(char3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(char3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(char3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(char3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(char3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(char3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(char3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(char3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(uchar3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(uchar3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(uchar3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(uchar3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(uchar3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(uchar3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(uchar3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(uchar3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(uchar3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(uchar3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(uchar3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(uchar3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(uchar3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(uchar3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(uchar3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(uchar3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(uchar3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(uchar3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(uchar3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(uchar3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(uchar3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(uchar3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(uchar3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(uchar3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(uchar3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(uchar3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(uchar3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(uchar3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(uchar3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(uchar3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(uchar3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(uchar3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(uchar3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(uchar3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(uchar3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(uchar3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rte(float3 v) {
-  return (long3)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtz(float3 v) {
-  return (long3)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtp(float3 v) {
-  return (long3)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_rtn(float3 v) {
-  return (long3)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rte(float3 v) {
-  return (ulong3)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtz(float3 v) {
-  return (ulong3)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtp(float3 v) {
-  return (ulong3)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_rtn(float3 v) {
-  return (ulong3)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rte(float3 v) {
-  return (int3)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtz(float3 v) {
-  return (int3)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtp(float3 v) {
-  return (int3)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_rtn(float3 v) {
-  return (int3)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rte(float3 v) {
-  return (uint3)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtz(float3 v) {
-  return (uint3)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtp(float3 v) {
-  return (uint3)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_rtn(float3 v) {
-  return (uint3)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rte(float3 v) {
-  return (short3)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtz(float3 v) {
-  return (short3)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtp(float3 v) {
-  return (short3)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_rtn(float3 v) {
-  return (short3)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rte(float3 v) {
-  return (ushort3)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtz(float3 v) {
-  return (ushort3)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtp(float3 v) {
-  return (ushort3)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_rtn(float3 v) {
-  return (ushort3)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rte(float3 v) {
-  return (char3)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtz(float3 v) {
-  return (char3)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtp(float3 v) {
-  return (char3)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_rtn(float3 v) {
-  return (char3)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rte(float3 v) {
-  return (uchar3)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtz(float3 v) {
-  return (uchar3)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtp(float3 v) {
-  return (uchar3)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_rtn(float3 v) {
-  return (uchar3)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rte(float3 v) {
-  return (float3)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtz(float3 v) {
-  return (float3)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtp(float3 v) {
-  return (float3)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE float3 convert_float3_rtn(float3 v) {
-  return (float3)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(long4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(long4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(long4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(long4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(long4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(long4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(long4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(long4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(long4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(long4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(long4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(long4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(long4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(long4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(long4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(long4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(long4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(long4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(long4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(long4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(long4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(long4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(long4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(long4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(long4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(long4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(long4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(long4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(long4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(long4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(long4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(long4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(long4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(long4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(long4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(long4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(ulong4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(ulong4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(ulong4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(ulong4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ulong4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ulong4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ulong4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ulong4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(ulong4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(ulong4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(ulong4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(ulong4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(ulong4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(ulong4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(ulong4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(ulong4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(ulong4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(ulong4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(ulong4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(ulong4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ulong4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ulong4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ulong4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ulong4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(ulong4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(ulong4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(ulong4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(ulong4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ulong4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ulong4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ulong4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ulong4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(ulong4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(ulong4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(ulong4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(ulong4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(int4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(int4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(int4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(int4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(int4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(int4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(int4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(int4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(int4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(int4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(int4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(int4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(int4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(int4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(int4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(int4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(int4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(int4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(int4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(int4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(int4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(int4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(int4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(int4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(int4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(int4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(int4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(int4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(int4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(int4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(int4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(int4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(int4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(int4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(int4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(int4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(uint4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(uint4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(uint4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(uint4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uint4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uint4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uint4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uint4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(uint4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(uint4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(uint4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(uint4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(uint4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(uint4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(uint4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(uint4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(uint4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(uint4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(uint4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(uint4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uint4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uint4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uint4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uint4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(uint4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(uint4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(uint4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(uint4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uint4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uint4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uint4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uint4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(uint4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(uint4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(uint4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(uint4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(short4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(short4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(short4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(short4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(short4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(short4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(short4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(short4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(short4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(short4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(short4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(short4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(short4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(short4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(short4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(short4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(short4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(short4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(short4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(short4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(short4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(short4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(short4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(short4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(short4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(short4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(short4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(short4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(short4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(short4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(short4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(short4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(short4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(short4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(short4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(short4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(ushort4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(ushort4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(ushort4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(ushort4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(ushort4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(ushort4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(ushort4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(ushort4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(ushort4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(ushort4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(ushort4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(ushort4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(ushort4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(ushort4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(ushort4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(ushort4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(ushort4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(ushort4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(ushort4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(ushort4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(ushort4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(ushort4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(ushort4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(ushort4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(ushort4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(ushort4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(ushort4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(ushort4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(ushort4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(ushort4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(ushort4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(ushort4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(ushort4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(ushort4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(ushort4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(ushort4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(char4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(char4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(char4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(char4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(char4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(char4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(char4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(char4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(char4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(char4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(char4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(char4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(char4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(char4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(char4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(char4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(char4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(char4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(char4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(char4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(char4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(char4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(char4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(char4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(char4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(char4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(char4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(char4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(char4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(char4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(char4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(char4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(char4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(char4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(char4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(char4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(uchar4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(uchar4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(uchar4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(uchar4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(uchar4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(uchar4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(uchar4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(uchar4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(uchar4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(uchar4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(uchar4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(uchar4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(uchar4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(uchar4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(uchar4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(uchar4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(uchar4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(uchar4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(uchar4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(uchar4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(uchar4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(uchar4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(uchar4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(uchar4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(uchar4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(uchar4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(uchar4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(uchar4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(uchar4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(uchar4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(uchar4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(uchar4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(uchar4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(uchar4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(uchar4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(uchar4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rte(float4 v) {
-  return (long4)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtz(float4 v) {
-  return (long4)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtp(float4 v) {
-  return (long4)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_rtn(float4 v) {
-  return (long4)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rte(float4 v) {
-  return (ulong4)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtz(float4 v) {
-  return (ulong4)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtp(float4 v) {
-  return (ulong4)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_rtn(float4 v) {
-  return (ulong4)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rte(float4 v) {
-  return (int4)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtz(float4 v) {
-  return (int4)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtp(float4 v) {
-  return (int4)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_rtn(float4 v) {
-  return (int4)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rte(float4 v) {
-  return (uint4)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtz(float4 v) {
-  return (uint4)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtp(float4 v) {
-  return (uint4)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_rtn(float4 v) {
-  return (uint4)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rte(float4 v) {
-  return (short4)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtz(float4 v) {
-  return (short4)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtp(float4 v) {
-  return (short4)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_rtn(float4 v) {
-  return (short4)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rte(float4 v) {
-  return (ushort4)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtz(float4 v) {
-  return (ushort4)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtp(float4 v) {
-  return (ushort4)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_rtn(float4 v) {
-  return (ushort4)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rte(float4 v) {
-  return (char4)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtz(float4 v) {
-  return (char4)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtp(float4 v) {
-  return (char4)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_rtn(float4 v) {
-  return (char4)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rte(float4 v) {
-  return (uchar4)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtz(float4 v) {
-  return (uchar4)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtp(float4 v) {
-  return (uchar4)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_rtn(float4 v) {
-  return (uchar4)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rte(float4 v) {
-  return (float4)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtz(float4 v) {
-  return (float4)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtp(float4 v) {
-  return (float4)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE float4 convert_float4_rtn(float4 v) {
-  return (float4)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(long8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(long8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(long8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(long8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(long8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(long8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(long8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(long8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(long8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(long8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(long8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(long8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(long8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(long8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(long8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(long8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(long8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(long8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(long8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(long8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(long8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(long8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(long8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(long8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(long8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(long8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(long8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(long8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(long8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(long8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(long8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(long8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(long8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(long8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(long8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(long8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(ulong8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(ulong8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(ulong8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(ulong8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ulong8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ulong8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ulong8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ulong8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(ulong8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(ulong8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(ulong8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(ulong8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(ulong8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(ulong8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(ulong8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(ulong8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(ulong8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(ulong8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(ulong8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(ulong8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ulong8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ulong8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ulong8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ulong8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(ulong8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(ulong8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(ulong8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(ulong8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ulong8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ulong8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ulong8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ulong8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(ulong8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(ulong8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(ulong8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(ulong8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(int8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(int8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(int8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(int8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(int8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(int8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(int8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(int8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(int8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(int8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(int8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(int8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(int8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(int8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(int8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(int8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(int8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(int8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(int8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(int8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(int8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(int8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(int8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(int8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(int8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(int8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(int8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(int8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(int8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(int8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(int8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(int8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(int8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(int8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(int8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(int8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(uint8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(uint8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(uint8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(uint8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uint8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uint8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uint8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uint8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(uint8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(uint8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(uint8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(uint8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(uint8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(uint8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(uint8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(uint8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(uint8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(uint8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(uint8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(uint8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uint8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uint8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uint8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uint8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(uint8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(uint8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(uint8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(uint8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uint8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uint8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uint8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uint8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(uint8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(uint8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(uint8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(uint8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(short8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(short8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(short8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(short8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(short8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(short8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(short8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(short8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(short8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(short8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(short8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(short8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(short8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(short8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(short8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(short8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(short8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(short8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(short8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(short8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(short8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(short8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(short8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(short8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(short8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(short8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(short8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(short8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(short8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(short8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(short8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(short8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(short8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(short8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(short8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(short8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(ushort8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(ushort8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(ushort8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(ushort8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(ushort8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(ushort8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(ushort8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(ushort8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(ushort8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(ushort8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(ushort8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(ushort8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(ushort8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(ushort8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(ushort8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(ushort8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(ushort8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(ushort8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(ushort8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(ushort8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(ushort8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(ushort8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(ushort8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(ushort8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(ushort8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(ushort8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(ushort8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(ushort8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(ushort8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(ushort8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(ushort8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(ushort8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(ushort8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(ushort8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(ushort8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(ushort8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(char8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(char8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(char8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(char8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(char8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(char8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(char8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(char8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(char8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(char8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(char8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(char8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(char8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(char8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(char8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(char8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(char8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(char8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(char8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(char8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(char8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(char8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(char8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(char8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(char8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(char8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(char8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(char8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(char8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(char8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(char8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(char8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(char8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(char8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(char8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(char8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(uchar8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(uchar8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(uchar8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(uchar8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(uchar8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(uchar8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(uchar8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(uchar8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(uchar8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(uchar8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(uchar8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(uchar8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(uchar8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(uchar8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(uchar8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(uchar8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(uchar8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(uchar8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(uchar8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(uchar8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(uchar8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(uchar8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(uchar8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(uchar8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(uchar8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(uchar8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(uchar8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(uchar8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(uchar8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(uchar8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(uchar8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(uchar8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(uchar8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(uchar8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(uchar8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(uchar8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rte(float8 v) {
-  return (long8)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtz(float8 v) {
-  return (long8)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtp(float8 v) {
-  return (long8)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_rtn(float8 v) {
-  return (long8)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rte(float8 v) {
-  return (ulong8)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtz(float8 v) {
-  return (ulong8)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtp(float8 v) {
-  return (ulong8)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_rtn(float8 v) {
-  return (ulong8)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rte(float8 v) {
-  return (int8)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtz(float8 v) {
-  return (int8)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtp(float8 v) {
-  return (int8)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_rtn(float8 v) {
-  return (int8)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rte(float8 v) {
-  return (uint8)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtz(float8 v) {
-  return (uint8)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtp(float8 v) {
-  return (uint8)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_rtn(float8 v) {
-  return (uint8)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rte(float8 v) {
-  return (short8)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtz(float8 v) {
-  return (short8)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtp(float8 v) {
-  return (short8)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_rtn(float8 v) {
-  return (short8)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rte(float8 v) {
-  return (ushort8)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtz(float8 v) {
-  return (ushort8)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtp(float8 v) {
-  return (ushort8)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_rtn(float8 v) {
-  return (ushort8)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rte(float8 v) {
-  return (char8)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtz(float8 v) {
-  return (char8)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtp(float8 v) {
-  return (char8)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_rtn(float8 v) {
-  return (char8)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rte(float8 v) {
-  return (uchar8)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtz(float8 v) {
-  return (uchar8)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtp(float8 v) {
-  return (uchar8)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_rtn(float8 v) {
-  return (uchar8)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rte(float8 v) {
-  return (float8)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtz(float8 v) {
-  return (float8)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtp(float8 v) {
-  return (float8)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE float8 convert_float8_rtn(float8 v) {
-  return (float8)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(long16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(long16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(long16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(long16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(long16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(long16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(long16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(long16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(long16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(long16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(long16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(long16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(long16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(long16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(long16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(long16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(long16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(long16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(long16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(long16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(long16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(long16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(long16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(long16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(long16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(long16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(long16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(long16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(long16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(long16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(long16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(long16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(long16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(long16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(long16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(long16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(ulong16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(ulong16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(ulong16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(ulong16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ulong16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ulong16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ulong16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ulong16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(ulong16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(ulong16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(ulong16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(ulong16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(ulong16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(ulong16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(ulong16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(ulong16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(ulong16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(ulong16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(ulong16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(ulong16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ulong16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ulong16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ulong16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ulong16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(ulong16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(ulong16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(ulong16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(ulong16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ulong16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ulong16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ulong16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ulong16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(ulong16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(ulong16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(ulong16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(ulong16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(int16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(int16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(int16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(int16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(int16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(int16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(int16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(int16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(int16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(int16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(int16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(int16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(int16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(int16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(int16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(int16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(int16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(int16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(int16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(int16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(int16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(int16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(int16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(int16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(int16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(int16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(int16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(int16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(int16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(int16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(int16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(int16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(int16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(int16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(int16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(int16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(uint16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(uint16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(uint16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(uint16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uint16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uint16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uint16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uint16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(uint16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(uint16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(uint16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(uint16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(uint16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(uint16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(uint16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(uint16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(uint16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(uint16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(uint16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(uint16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uint16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uint16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uint16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uint16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(uint16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(uint16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(uint16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(uint16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uint16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uint16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uint16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uint16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(uint16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(uint16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(uint16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(uint16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(short16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(short16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(short16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(short16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(short16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(short16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(short16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(short16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(short16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(short16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(short16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(short16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(short16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(short16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(short16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(short16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(short16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(short16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(short16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(short16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(short16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(short16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(short16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(short16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(short16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(short16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(short16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(short16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(short16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(short16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(short16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(short16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(short16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(short16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(short16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(short16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(ushort16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(ushort16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(ushort16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(ushort16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(ushort16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(ushort16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(ushort16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(ushort16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(ushort16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(ushort16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(ushort16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(ushort16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(ushort16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(ushort16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(ushort16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(ushort16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(ushort16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(ushort16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(ushort16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(ushort16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(ushort16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(ushort16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(ushort16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(ushort16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(ushort16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(ushort16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(ushort16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(ushort16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(ushort16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(ushort16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(ushort16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(ushort16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(ushort16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(ushort16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(ushort16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(ushort16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(char16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(char16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(char16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(char16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(char16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(char16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(char16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(char16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(char16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(char16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(char16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(char16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(char16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(char16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(char16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(char16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(char16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(char16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(char16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(char16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(char16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(char16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(char16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(char16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(char16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(char16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(char16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(char16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(char16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(char16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(char16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(char16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(char16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(char16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(char16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(char16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(uchar16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(uchar16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(uchar16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(uchar16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(uchar16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(uchar16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(uchar16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(uchar16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(uchar16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(uchar16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(uchar16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(uchar16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(uchar16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(uchar16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(uchar16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(uchar16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(uchar16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(uchar16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(uchar16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(uchar16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(uchar16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(uchar16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(uchar16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(uchar16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(uchar16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(uchar16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(uchar16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(uchar16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(uchar16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(uchar16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(uchar16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(uchar16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(uchar16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(uchar16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(uchar16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(uchar16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rte(float16 v) {
-  return (long16)(convert_long_rte(v.s0), convert_long_rte(v.s1), convert_long_rte(v.s2), convert_long_rte(v.s3), convert_long_rte(v.s4), convert_long_rte(v.s5), convert_long_rte(v.s6), convert_long_rte(v.s7), convert_long_rte(v.s8), convert_long_rte(v.s9), convert_long_rte(v.sA), convert_long_rte(v.sB), convert_long_rte(v.sC), convert_long_rte(v.sD), convert_long_rte(v.sE), convert_long_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtz(float16 v) {
-  return (long16)(convert_long_rtz(v.s0), convert_long_rtz(v.s1), convert_long_rtz(v.s2), convert_long_rtz(v.s3), convert_long_rtz(v.s4), convert_long_rtz(v.s5), convert_long_rtz(v.s6), convert_long_rtz(v.s7), convert_long_rtz(v.s8), convert_long_rtz(v.s9), convert_long_rtz(v.sA), convert_long_rtz(v.sB), convert_long_rtz(v.sC), convert_long_rtz(v.sD), convert_long_rtz(v.sE), convert_long_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtp(float16 v) {
-  return (long16)(convert_long_rtp(v.s0), convert_long_rtp(v.s1), convert_long_rtp(v.s2), convert_long_rtp(v.s3), convert_long_rtp(v.s4), convert_long_rtp(v.s5), convert_long_rtp(v.s6), convert_long_rtp(v.s7), convert_long_rtp(v.s8), convert_long_rtp(v.s9), convert_long_rtp(v.sA), convert_long_rtp(v.sB), convert_long_rtp(v.sC), convert_long_rtp(v.sD), convert_long_rtp(v.sE), convert_long_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_rtn(float16 v) {
-  return (long16)(convert_long_rtn(v.s0), convert_long_rtn(v.s1), convert_long_rtn(v.s2), convert_long_rtn(v.s3), convert_long_rtn(v.s4), convert_long_rtn(v.s5), convert_long_rtn(v.s6), convert_long_rtn(v.s7), convert_long_rtn(v.s8), convert_long_rtn(v.s9), convert_long_rtn(v.sA), convert_long_rtn(v.sB), convert_long_rtn(v.sC), convert_long_rtn(v.sD), convert_long_rtn(v.sE), convert_long_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rte(float16 v) {
-  return (ulong16)(convert_ulong_rte(v.s0), convert_ulong_rte(v.s1), convert_ulong_rte(v.s2), convert_ulong_rte(v.s3), convert_ulong_rte(v.s4), convert_ulong_rte(v.s5), convert_ulong_rte(v.s6), convert_ulong_rte(v.s7), convert_ulong_rte(v.s8), convert_ulong_rte(v.s9), convert_ulong_rte(v.sA), convert_ulong_rte(v.sB), convert_ulong_rte(v.sC), convert_ulong_rte(v.sD), convert_ulong_rte(v.sE), convert_ulong_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtz(float16 v) {
-  return (ulong16)(convert_ulong_rtz(v.s0), convert_ulong_rtz(v.s1), convert_ulong_rtz(v.s2), convert_ulong_rtz(v.s3), convert_ulong_rtz(v.s4), convert_ulong_rtz(v.s5), convert_ulong_rtz(v.s6), convert_ulong_rtz(v.s7), convert_ulong_rtz(v.s8), convert_ulong_rtz(v.s9), convert_ulong_rtz(v.sA), convert_ulong_rtz(v.sB), convert_ulong_rtz(v.sC), convert_ulong_rtz(v.sD), convert_ulong_rtz(v.sE), convert_ulong_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtp(float16 v) {
-  return (ulong16)(convert_ulong_rtp(v.s0), convert_ulong_rtp(v.s1), convert_ulong_rtp(v.s2), convert_ulong_rtp(v.s3), convert_ulong_rtp(v.s4), convert_ulong_rtp(v.s5), convert_ulong_rtp(v.s6), convert_ulong_rtp(v.s7), convert_ulong_rtp(v.s8), convert_ulong_rtp(v.s9), convert_ulong_rtp(v.sA), convert_ulong_rtp(v.sB), convert_ulong_rtp(v.sC), convert_ulong_rtp(v.sD), convert_ulong_rtp(v.sE), convert_ulong_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_rtn(float16 v) {
-  return (ulong16)(convert_ulong_rtn(v.s0), convert_ulong_rtn(v.s1), convert_ulong_rtn(v.s2), convert_ulong_rtn(v.s3), convert_ulong_rtn(v.s4), convert_ulong_rtn(v.s5), convert_ulong_rtn(v.s6), convert_ulong_rtn(v.s7), convert_ulong_rtn(v.s8), convert_ulong_rtn(v.s9), convert_ulong_rtn(v.sA), convert_ulong_rtn(v.sB), convert_ulong_rtn(v.sC), convert_ulong_rtn(v.sD), convert_ulong_rtn(v.sE), convert_ulong_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rte(float16 v) {
-  return (int16)(convert_int_rte(v.s0), convert_int_rte(v.s1), convert_int_rte(v.s2), convert_int_rte(v.s3), convert_int_rte(v.s4), convert_int_rte(v.s5), convert_int_rte(v.s6), convert_int_rte(v.s7), convert_int_rte(v.s8), convert_int_rte(v.s9), convert_int_rte(v.sA), convert_int_rte(v.sB), convert_int_rte(v.sC), convert_int_rte(v.sD), convert_int_rte(v.sE), convert_int_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtz(float16 v) {
-  return (int16)(convert_int_rtz(v.s0), convert_int_rtz(v.s1), convert_int_rtz(v.s2), convert_int_rtz(v.s3), convert_int_rtz(v.s4), convert_int_rtz(v.s5), convert_int_rtz(v.s6), convert_int_rtz(v.s7), convert_int_rtz(v.s8), convert_int_rtz(v.s9), convert_int_rtz(v.sA), convert_int_rtz(v.sB), convert_int_rtz(v.sC), convert_int_rtz(v.sD), convert_int_rtz(v.sE), convert_int_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtp(float16 v) {
-  return (int16)(convert_int_rtp(v.s0), convert_int_rtp(v.s1), convert_int_rtp(v.s2), convert_int_rtp(v.s3), convert_int_rtp(v.s4), convert_int_rtp(v.s5), convert_int_rtp(v.s6), convert_int_rtp(v.s7), convert_int_rtp(v.s8), convert_int_rtp(v.s9), convert_int_rtp(v.sA), convert_int_rtp(v.sB), convert_int_rtp(v.sC), convert_int_rtp(v.sD), convert_int_rtp(v.sE), convert_int_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_rtn(float16 v) {
-  return (int16)(convert_int_rtn(v.s0), convert_int_rtn(v.s1), convert_int_rtn(v.s2), convert_int_rtn(v.s3), convert_int_rtn(v.s4), convert_int_rtn(v.s5), convert_int_rtn(v.s6), convert_int_rtn(v.s7), convert_int_rtn(v.s8), convert_int_rtn(v.s9), convert_int_rtn(v.sA), convert_int_rtn(v.sB), convert_int_rtn(v.sC), convert_int_rtn(v.sD), convert_int_rtn(v.sE), convert_int_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rte(float16 v) {
-  return (uint16)(convert_uint_rte(v.s0), convert_uint_rte(v.s1), convert_uint_rte(v.s2), convert_uint_rte(v.s3), convert_uint_rte(v.s4), convert_uint_rte(v.s5), convert_uint_rte(v.s6), convert_uint_rte(v.s7), convert_uint_rte(v.s8), convert_uint_rte(v.s9), convert_uint_rte(v.sA), convert_uint_rte(v.sB), convert_uint_rte(v.sC), convert_uint_rte(v.sD), convert_uint_rte(v.sE), convert_uint_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtz(float16 v) {
-  return (uint16)(convert_uint_rtz(v.s0), convert_uint_rtz(v.s1), convert_uint_rtz(v.s2), convert_uint_rtz(v.s3), convert_uint_rtz(v.s4), convert_uint_rtz(v.s5), convert_uint_rtz(v.s6), convert_uint_rtz(v.s7), convert_uint_rtz(v.s8), convert_uint_rtz(v.s9), convert_uint_rtz(v.sA), convert_uint_rtz(v.sB), convert_uint_rtz(v.sC), convert_uint_rtz(v.sD), convert_uint_rtz(v.sE), convert_uint_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtp(float16 v) {
-  return (uint16)(convert_uint_rtp(v.s0), convert_uint_rtp(v.s1), convert_uint_rtp(v.s2), convert_uint_rtp(v.s3), convert_uint_rtp(v.s4), convert_uint_rtp(v.s5), convert_uint_rtp(v.s6), convert_uint_rtp(v.s7), convert_uint_rtp(v.s8), convert_uint_rtp(v.s9), convert_uint_rtp(v.sA), convert_uint_rtp(v.sB), convert_uint_rtp(v.sC), convert_uint_rtp(v.sD), convert_uint_rtp(v.sE), convert_uint_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_rtn(float16 v) {
-  return (uint16)(convert_uint_rtn(v.s0), convert_uint_rtn(v.s1), convert_uint_rtn(v.s2), convert_uint_rtn(v.s3), convert_uint_rtn(v.s4), convert_uint_rtn(v.s5), convert_uint_rtn(v.s6), convert_uint_rtn(v.s7), convert_uint_rtn(v.s8), convert_uint_rtn(v.s9), convert_uint_rtn(v.sA), convert_uint_rtn(v.sB), convert_uint_rtn(v.sC), convert_uint_rtn(v.sD), convert_uint_rtn(v.sE), convert_uint_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rte(float16 v) {
-  return (short16)(convert_short_rte(v.s0), convert_short_rte(v.s1), convert_short_rte(v.s2), convert_short_rte(v.s3), convert_short_rte(v.s4), convert_short_rte(v.s5), convert_short_rte(v.s6), convert_short_rte(v.s7), convert_short_rte(v.s8), convert_short_rte(v.s9), convert_short_rte(v.sA), convert_short_rte(v.sB), convert_short_rte(v.sC), convert_short_rte(v.sD), convert_short_rte(v.sE), convert_short_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtz(float16 v) {
-  return (short16)(convert_short_rtz(v.s0), convert_short_rtz(v.s1), convert_short_rtz(v.s2), convert_short_rtz(v.s3), convert_short_rtz(v.s4), convert_short_rtz(v.s5), convert_short_rtz(v.s6), convert_short_rtz(v.s7), convert_short_rtz(v.s8), convert_short_rtz(v.s9), convert_short_rtz(v.sA), convert_short_rtz(v.sB), convert_short_rtz(v.sC), convert_short_rtz(v.sD), convert_short_rtz(v.sE), convert_short_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtp(float16 v) {
-  return (short16)(convert_short_rtp(v.s0), convert_short_rtp(v.s1), convert_short_rtp(v.s2), convert_short_rtp(v.s3), convert_short_rtp(v.s4), convert_short_rtp(v.s5), convert_short_rtp(v.s6), convert_short_rtp(v.s7), convert_short_rtp(v.s8), convert_short_rtp(v.s9), convert_short_rtp(v.sA), convert_short_rtp(v.sB), convert_short_rtp(v.sC), convert_short_rtp(v.sD), convert_short_rtp(v.sE), convert_short_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_rtn(float16 v) {
-  return (short16)(convert_short_rtn(v.s0), convert_short_rtn(v.s1), convert_short_rtn(v.s2), convert_short_rtn(v.s3), convert_short_rtn(v.s4), convert_short_rtn(v.s5), convert_short_rtn(v.s6), convert_short_rtn(v.s7), convert_short_rtn(v.s8), convert_short_rtn(v.s9), convert_short_rtn(v.sA), convert_short_rtn(v.sB), convert_short_rtn(v.sC), convert_short_rtn(v.sD), convert_short_rtn(v.sE), convert_short_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rte(float16 v) {
-  return (ushort16)(convert_ushort_rte(v.s0), convert_ushort_rte(v.s1), convert_ushort_rte(v.s2), convert_ushort_rte(v.s3), convert_ushort_rte(v.s4), convert_ushort_rte(v.s5), convert_ushort_rte(v.s6), convert_ushort_rte(v.s7), convert_ushort_rte(v.s8), convert_ushort_rte(v.s9), convert_ushort_rte(v.sA), convert_ushort_rte(v.sB), convert_ushort_rte(v.sC), convert_ushort_rte(v.sD), convert_ushort_rte(v.sE), convert_ushort_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtz(float16 v) {
-  return (ushort16)(convert_ushort_rtz(v.s0), convert_ushort_rtz(v.s1), convert_ushort_rtz(v.s2), convert_ushort_rtz(v.s3), convert_ushort_rtz(v.s4), convert_ushort_rtz(v.s5), convert_ushort_rtz(v.s6), convert_ushort_rtz(v.s7), convert_ushort_rtz(v.s8), convert_ushort_rtz(v.s9), convert_ushort_rtz(v.sA), convert_ushort_rtz(v.sB), convert_ushort_rtz(v.sC), convert_ushort_rtz(v.sD), convert_ushort_rtz(v.sE), convert_ushort_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtp(float16 v) {
-  return (ushort16)(convert_ushort_rtp(v.s0), convert_ushort_rtp(v.s1), convert_ushort_rtp(v.s2), convert_ushort_rtp(v.s3), convert_ushort_rtp(v.s4), convert_ushort_rtp(v.s5), convert_ushort_rtp(v.s6), convert_ushort_rtp(v.s7), convert_ushort_rtp(v.s8), convert_ushort_rtp(v.s9), convert_ushort_rtp(v.sA), convert_ushort_rtp(v.sB), convert_ushort_rtp(v.sC), convert_ushort_rtp(v.sD), convert_ushort_rtp(v.sE), convert_ushort_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_rtn(float16 v) {
-  return (ushort16)(convert_ushort_rtn(v.s0), convert_ushort_rtn(v.s1), convert_ushort_rtn(v.s2), convert_ushort_rtn(v.s3), convert_ushort_rtn(v.s4), convert_ushort_rtn(v.s5), convert_ushort_rtn(v.s6), convert_ushort_rtn(v.s7), convert_ushort_rtn(v.s8), convert_ushort_rtn(v.s9), convert_ushort_rtn(v.sA), convert_ushort_rtn(v.sB), convert_ushort_rtn(v.sC), convert_ushort_rtn(v.sD), convert_ushort_rtn(v.sE), convert_ushort_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rte(float16 v) {
-  return (char16)(convert_char_rte(v.s0), convert_char_rte(v.s1), convert_char_rte(v.s2), convert_char_rte(v.s3), convert_char_rte(v.s4), convert_char_rte(v.s5), convert_char_rte(v.s6), convert_char_rte(v.s7), convert_char_rte(v.s8), convert_char_rte(v.s9), convert_char_rte(v.sA), convert_char_rte(v.sB), convert_char_rte(v.sC), convert_char_rte(v.sD), convert_char_rte(v.sE), convert_char_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtz(float16 v) {
-  return (char16)(convert_char_rtz(v.s0), convert_char_rtz(v.s1), convert_char_rtz(v.s2), convert_char_rtz(v.s3), convert_char_rtz(v.s4), convert_char_rtz(v.s5), convert_char_rtz(v.s6), convert_char_rtz(v.s7), convert_char_rtz(v.s8), convert_char_rtz(v.s9), convert_char_rtz(v.sA), convert_char_rtz(v.sB), convert_char_rtz(v.sC), convert_char_rtz(v.sD), convert_char_rtz(v.sE), convert_char_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtp(float16 v) {
-  return (char16)(convert_char_rtp(v.s0), convert_char_rtp(v.s1), convert_char_rtp(v.s2), convert_char_rtp(v.s3), convert_char_rtp(v.s4), convert_char_rtp(v.s5), convert_char_rtp(v.s6), convert_char_rtp(v.s7), convert_char_rtp(v.s8), convert_char_rtp(v.s9), convert_char_rtp(v.sA), convert_char_rtp(v.sB), convert_char_rtp(v.sC), convert_char_rtp(v.sD), convert_char_rtp(v.sE), convert_char_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_rtn(float16 v) {
-  return (char16)(convert_char_rtn(v.s0), convert_char_rtn(v.s1), convert_char_rtn(v.s2), convert_char_rtn(v.s3), convert_char_rtn(v.s4), convert_char_rtn(v.s5), convert_char_rtn(v.s6), convert_char_rtn(v.s7), convert_char_rtn(v.s8), convert_char_rtn(v.s9), convert_char_rtn(v.sA), convert_char_rtn(v.sB), convert_char_rtn(v.sC), convert_char_rtn(v.sD), convert_char_rtn(v.sE), convert_char_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rte(float16 v) {
-  return (uchar16)(convert_uchar_rte(v.s0), convert_uchar_rte(v.s1), convert_uchar_rte(v.s2), convert_uchar_rte(v.s3), convert_uchar_rte(v.s4), convert_uchar_rte(v.s5), convert_uchar_rte(v.s6), convert_uchar_rte(v.s7), convert_uchar_rte(v.s8), convert_uchar_rte(v.s9), convert_uchar_rte(v.sA), convert_uchar_rte(v.sB), convert_uchar_rte(v.sC), convert_uchar_rte(v.sD), convert_uchar_rte(v.sE), convert_uchar_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtz(float16 v) {
-  return (uchar16)(convert_uchar_rtz(v.s0), convert_uchar_rtz(v.s1), convert_uchar_rtz(v.s2), convert_uchar_rtz(v.s3), convert_uchar_rtz(v.s4), convert_uchar_rtz(v.s5), convert_uchar_rtz(v.s6), convert_uchar_rtz(v.s7), convert_uchar_rtz(v.s8), convert_uchar_rtz(v.s9), convert_uchar_rtz(v.sA), convert_uchar_rtz(v.sB), convert_uchar_rtz(v.sC), convert_uchar_rtz(v.sD), convert_uchar_rtz(v.sE), convert_uchar_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtp(float16 v) {
-  return (uchar16)(convert_uchar_rtp(v.s0), convert_uchar_rtp(v.s1), convert_uchar_rtp(v.s2), convert_uchar_rtp(v.s3), convert_uchar_rtp(v.s4), convert_uchar_rtp(v.s5), convert_uchar_rtp(v.s6), convert_uchar_rtp(v.s7), convert_uchar_rtp(v.s8), convert_uchar_rtp(v.s9), convert_uchar_rtp(v.sA), convert_uchar_rtp(v.sB), convert_uchar_rtp(v.sC), convert_uchar_rtp(v.sD), convert_uchar_rtp(v.sE), convert_uchar_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_rtn(float16 v) {
-  return (uchar16)(convert_uchar_rtn(v.s0), convert_uchar_rtn(v.s1), convert_uchar_rtn(v.s2), convert_uchar_rtn(v.s3), convert_uchar_rtn(v.s4), convert_uchar_rtn(v.s5), convert_uchar_rtn(v.s6), convert_uchar_rtn(v.s7), convert_uchar_rtn(v.s8), convert_uchar_rtn(v.s9), convert_uchar_rtn(v.sA), convert_uchar_rtn(v.sB), convert_uchar_rtn(v.sC), convert_uchar_rtn(v.sD), convert_uchar_rtn(v.sE), convert_uchar_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rte(float16 v) {
-  return (float16)(convert_float_rte(v.s0), convert_float_rte(v.s1), convert_float_rte(v.s2), convert_float_rte(v.s3), convert_float_rte(v.s4), convert_float_rte(v.s5), convert_float_rte(v.s6), convert_float_rte(v.s7), convert_float_rte(v.s8), convert_float_rte(v.s9), convert_float_rte(v.sA), convert_float_rte(v.sB), convert_float_rte(v.sC), convert_float_rte(v.sD), convert_float_rte(v.sE), convert_float_rte(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtz(float16 v) {
-  return (float16)(convert_float_rtz(v.s0), convert_float_rtz(v.s1), convert_float_rtz(v.s2), convert_float_rtz(v.s3), convert_float_rtz(v.s4), convert_float_rtz(v.s5), convert_float_rtz(v.s6), convert_float_rtz(v.s7), convert_float_rtz(v.s8), convert_float_rtz(v.s9), convert_float_rtz(v.sA), convert_float_rtz(v.sB), convert_float_rtz(v.sC), convert_float_rtz(v.sD), convert_float_rtz(v.sE), convert_float_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtp(float16 v) {
-  return (float16)(convert_float_rtp(v.s0), convert_float_rtp(v.s1), convert_float_rtp(v.s2), convert_float_rtp(v.s3), convert_float_rtp(v.s4), convert_float_rtp(v.s5), convert_float_rtp(v.s6), convert_float_rtp(v.s7), convert_float_rtp(v.s8), convert_float_rtp(v.s9), convert_float_rtp(v.sA), convert_float_rtp(v.sB), convert_float_rtp(v.sC), convert_float_rtp(v.sD), convert_float_rtp(v.sE), convert_float_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE float16 convert_float16_rtn(float16 v) {
-  return (float16)(convert_float_rtn(v.s0), convert_float_rtn(v.s1), convert_float_rtn(v.s2), convert_float_rtn(v.s3), convert_float_rtn(v.s4), convert_float_rtn(v.s5), convert_float_rtn(v.s6), convert_float_rtn(v.s7), convert_float_rtn(v.s8), convert_float_rtn(v.s9), convert_float_rtn(v.sA), convert_float_rtn(v.sB), convert_float_rtn(v.sC), convert_float_rtn(v.sD), convert_float_rtn(v.sE), convert_float_rtn(v.sF));
-}
-
-INLINE_OVERLOADABLE long convert_long_sat_rte(long x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(long x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(long x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(long x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(long x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(long x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(long x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(long x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(long x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(long x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(long x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(long x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(long x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(long x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(long x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(long x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(long x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(long x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(long x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(long x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(long x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(long x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(long x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(long x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(long x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(long x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(long x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(long x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(long x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(long x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(long x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(long x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(ulong x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(ulong x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(ulong x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(ulong x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ulong x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ulong x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ulong x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ulong x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(ulong x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(ulong x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(ulong x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(ulong x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(ulong x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ulong x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ulong x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ulong x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(ulong x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(ulong x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(ulong x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(ulong x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ulong x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ulong x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ulong x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ulong x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(ulong x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(ulong x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(ulong x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(ulong x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ulong x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ulong x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ulong x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ulong x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(int x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(int x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(int x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(int x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(int x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(int x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(int x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(int x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(int x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(int x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(int x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(int x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(int x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(int x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(int x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(int x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(int x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(int x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(int x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(int x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(int x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(int x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(int x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(int x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(int x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(int x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(int x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(int x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(int x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(int x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(int x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(int x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(uint x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(uint x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(uint x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(uint x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uint x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uint x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uint x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uint x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(uint x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(uint x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(uint x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(uint x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(uint x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uint x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uint x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uint x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(uint x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(uint x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(uint x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(uint x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uint x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uint x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uint x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uint x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(uint x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(uint x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(uint x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(uint x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uint x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uint x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uint x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uint x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(short x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(short x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(short x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(short x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(short x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(short x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(short x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(short x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(short x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(short x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(short x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(short x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(short x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(short x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(short x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(short x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(short x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(short x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(short x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(short x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(short x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(short x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(short x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(short x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(short x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(short x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(short x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(short x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(short x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(short x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(short x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(short x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(ushort x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(ushort x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(ushort x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(ushort x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(ushort x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(ushort x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(ushort x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(ushort x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(ushort x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(ushort x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(ushort x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(ushort x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(ushort x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(ushort x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(ushort x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(ushort x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(ushort x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(ushort x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(ushort x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(ushort x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(ushort x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(ushort x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(ushort x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(ushort x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(ushort x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(ushort x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(ushort x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(ushort x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(ushort x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(ushort x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(ushort x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(ushort x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(char x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(char x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(char x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(char x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(char x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(char x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(char x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(char x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(char x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(char x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(char x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(char x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(char x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(char x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(char x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(char x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(char x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(char x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(char x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(char x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(char x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(char x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(char x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(char x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(char x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(char x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(char x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(char x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(char x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(char x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(char x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(char x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(uchar x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(uchar x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(uchar x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(uchar x)
-{ return convert_long_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(uchar x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(uchar x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(uchar x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(uchar x)
-{ return convert_ulong_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(uchar x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(uchar x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(uchar x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(uchar x)
-{ return convert_int_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(uchar x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(uchar x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(uchar x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(uchar x)
-{ return convert_uint_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(uchar x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(uchar x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(uchar x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(uchar x)
-{ return convert_short_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(uchar x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(uchar x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(uchar x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(uchar x)
-{ return convert_ushort_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(uchar x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(uchar x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(uchar x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(uchar x)
-{ return convert_char_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(uchar x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(uchar x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(uchar x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(uchar x)
-{ return convert_uchar_sat(x); }
-INLINE_OVERLOADABLE long convert_long_sat_rte(float x)
-{ return convert_long_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE long convert_long_sat_rtz(float x)
-{ return convert_long_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE long convert_long_sat_rtp(float x)
-{ return convert_long_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE long convert_long_sat_rtn(float x)
-{ return convert_long_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rte(float x)
-{ return convert_ulong_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtz(float x)
-{ return convert_ulong_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtp(float x)
-{ return convert_ulong_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE ulong convert_ulong_sat_rtn(float x)
-{ return convert_ulong_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE int convert_int_sat_rte(float x)
-{ return convert_int_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE int convert_int_sat_rtz(float x)
-{ return convert_int_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE int convert_int_sat_rtp(float x)
-{ return convert_int_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE int convert_int_sat_rtn(float x)
-{ return convert_int_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rte(float x)
-{ return convert_uint_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtz(float x)
-{ return convert_uint_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtp(float x)
-{ return convert_uint_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE uint convert_uint_sat_rtn(float x)
-{ return convert_uint_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE short convert_short_sat_rte(float x)
-{ return convert_short_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE short convert_short_sat_rtz(float x)
-{ return convert_short_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE short convert_short_sat_rtp(float x)
-{ return convert_short_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE short convert_short_sat_rtn(float x)
-{ return convert_short_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rte(float x)
-{ return convert_ushort_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtz(float x)
-{ return convert_ushort_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtp(float x)
-{ return convert_ushort_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE ushort convert_ushort_sat_rtn(float x)
-{ return convert_ushort_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE char convert_char_sat_rte(float x)
-{ return convert_char_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE char convert_char_sat_rtz(float x)
-{ return convert_char_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE char convert_char_sat_rtp(float x)
-{ return convert_char_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE char convert_char_sat_rtn(float x)
-{ return convert_char_sat(__gen_ocl_rndd(x)); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rte(float x)
-{ return convert_uchar_sat(__gen_ocl_rnde(x)); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtz(float x)
-{ return convert_uchar_sat(__gen_ocl_rndz(x)); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtp(float x)
-{ return convert_uchar_sat(__gen_ocl_rndu(x)); }
-INLINE_OVERLOADABLE uchar convert_uchar_sat_rtn(float x)
-{ return convert_uchar_sat(__gen_ocl_rndd(x)); }
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(long2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(long2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(long2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(long2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(long2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(long2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(long2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(long2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(long2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(long2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(long2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(long2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(long2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(long2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(long2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(long2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(long2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(long2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(long2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(long2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(long2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(long2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(long2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(long2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(long2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(long2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(long2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(long2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(long2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(long2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(long2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(long2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(ulong2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ulong2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ulong2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ulong2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ulong2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ulong2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ulong2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ulong2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(ulong2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ulong2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ulong2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ulong2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ulong2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ulong2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ulong2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ulong2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(ulong2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ulong2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ulong2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ulong2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ulong2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ulong2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ulong2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ulong2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(ulong2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ulong2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ulong2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ulong2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ulong2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ulong2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ulong2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ulong2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(int2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(int2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(int2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(int2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(int2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(int2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(int2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(int2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(int2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(int2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(int2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(int2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(int2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(int2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(int2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(int2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(int2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(int2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(int2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(int2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(int2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(int2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(int2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(int2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(int2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(int2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(int2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(int2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(int2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(int2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(int2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(int2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(uint2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uint2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uint2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uint2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uint2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uint2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uint2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uint2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(uint2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uint2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uint2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uint2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uint2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uint2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uint2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uint2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(uint2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uint2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uint2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uint2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uint2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uint2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uint2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uint2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(uint2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uint2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uint2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uint2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uint2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uint2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uint2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uint2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(short2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(short2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(short2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(short2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(short2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(short2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(short2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(short2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(short2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(short2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(short2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(short2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(short2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(short2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(short2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(short2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(short2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(short2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(short2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(short2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(short2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(short2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(short2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(short2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(short2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(short2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(short2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(short2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(short2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(short2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(short2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(short2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(ushort2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(ushort2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(ushort2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(ushort2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(ushort2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(ushort2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(ushort2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(ushort2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(ushort2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(ushort2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(ushort2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(ushort2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(ushort2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(ushort2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(ushort2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(ushort2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(ushort2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(ushort2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(ushort2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(ushort2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(ushort2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(ushort2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(ushort2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(ushort2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(ushort2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(ushort2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(ushort2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(ushort2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(ushort2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(ushort2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(ushort2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(ushort2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(char2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(char2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(char2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(char2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(char2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(char2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(char2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(char2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(char2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(char2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(char2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(char2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(char2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(char2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(char2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(char2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(char2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(char2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(char2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(char2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(char2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(char2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(char2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(char2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(char2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(char2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(char2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(char2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(char2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(char2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(char2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(char2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(uchar2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(uchar2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(uchar2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(uchar2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(uchar2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(uchar2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(uchar2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(uchar2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(uchar2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(uchar2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(uchar2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(uchar2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(uchar2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(uchar2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(uchar2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(uchar2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(uchar2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(uchar2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(uchar2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(uchar2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(uchar2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(uchar2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(uchar2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(uchar2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(uchar2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(uchar2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(uchar2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(uchar2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(uchar2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(uchar2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(uchar2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(uchar2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rte(float2 v) {
-  return (long2)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtz(float2 v) {
-  return (long2)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtp(float2 v) {
-  return (long2)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE long2 convert_long2_sat_rtn(float2 v) {
-  return (long2)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rte(float2 v) {
-  return (ulong2)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtz(float2 v) {
-  return (ulong2)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtp(float2 v) {
-  return (ulong2)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ulong2 convert_ulong2_sat_rtn(float2 v) {
-  return (ulong2)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rte(float2 v) {
-  return (int2)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtz(float2 v) {
-  return (int2)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtp(float2 v) {
-  return (int2)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE int2 convert_int2_sat_rtn(float2 v) {
-  return (int2)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rte(float2 v) {
-  return (uint2)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtz(float2 v) {
-  return (uint2)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtp(float2 v) {
-  return (uint2)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uint2 convert_uint2_sat_rtn(float2 v) {
-  return (uint2)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rte(float2 v) {
-  return (short2)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtz(float2 v) {
-  return (short2)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtp(float2 v) {
-  return (short2)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE short2 convert_short2_sat_rtn(float2 v) {
-  return (short2)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rte(float2 v) {
-  return (ushort2)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtz(float2 v) {
-  return (ushort2)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtp(float2 v) {
-  return (ushort2)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE ushort2 convert_ushort2_sat_rtn(float2 v) {
-  return (ushort2)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rte(float2 v) {
-  return (char2)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtz(float2 v) {
-  return (char2)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtp(float2 v) {
-  return (char2)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE char2 convert_char2_sat_rtn(float2 v) {
-  return (char2)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rte(float2 v) {
-  return (uchar2)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtz(float2 v) {
-  return (uchar2)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtp(float2 v) {
-  return (uchar2)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1));
-}
-
-INLINE OVERLOADABLE uchar2 convert_uchar2_sat_rtn(float2 v) {
-  return (uchar2)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(long3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(long3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(long3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(long3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(long3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(long3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(long3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(long3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(long3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(long3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(long3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(long3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(long3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(long3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(long3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(long3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(long3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(long3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(long3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(long3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(long3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(long3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(long3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(long3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(long3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(long3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(long3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(long3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(long3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(long3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(long3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(long3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(ulong3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ulong3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ulong3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ulong3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ulong3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ulong3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ulong3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ulong3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(ulong3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ulong3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ulong3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ulong3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ulong3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ulong3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ulong3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ulong3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(ulong3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ulong3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ulong3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ulong3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ulong3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ulong3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ulong3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ulong3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(ulong3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ulong3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ulong3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ulong3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ulong3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ulong3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ulong3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ulong3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(int3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(int3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(int3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(int3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(int3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(int3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(int3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(int3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(int3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(int3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(int3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(int3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(int3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(int3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(int3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(int3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(int3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(int3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(int3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(int3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(int3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(int3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(int3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(int3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(int3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(int3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(int3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(int3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(int3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(int3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(int3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(int3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(uint3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uint3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uint3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uint3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uint3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uint3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uint3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uint3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(uint3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uint3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uint3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uint3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uint3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uint3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uint3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uint3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(uint3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uint3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uint3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uint3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uint3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uint3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uint3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uint3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(uint3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uint3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uint3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uint3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uint3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uint3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uint3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uint3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(short3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(short3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(short3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(short3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(short3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(short3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(short3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(short3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(short3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(short3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(short3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(short3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(short3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(short3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(short3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(short3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(short3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(short3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(short3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(short3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(short3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(short3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(short3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(short3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(short3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(short3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(short3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(short3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(short3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(short3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(short3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(short3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(ushort3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(ushort3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(ushort3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(ushort3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(ushort3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(ushort3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(ushort3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(ushort3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(ushort3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(ushort3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(ushort3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(ushort3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(ushort3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(ushort3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(ushort3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(ushort3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(ushort3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(ushort3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(ushort3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(ushort3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(ushort3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(ushort3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(ushort3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(ushort3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(ushort3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(ushort3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(ushort3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(ushort3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(ushort3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(ushort3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(ushort3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(ushort3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(char3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(char3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(char3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(char3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(char3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(char3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(char3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(char3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(char3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(char3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(char3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(char3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(char3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(char3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(char3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(char3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(char3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(char3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(char3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(char3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(char3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(char3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(char3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(char3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(char3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(char3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(char3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(char3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(char3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(char3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(char3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(char3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(uchar3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(uchar3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(uchar3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(uchar3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(uchar3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(uchar3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(uchar3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(uchar3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(uchar3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(uchar3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(uchar3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(uchar3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(uchar3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(uchar3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(uchar3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(uchar3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(uchar3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(uchar3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(uchar3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(uchar3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(uchar3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(uchar3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(uchar3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(uchar3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(uchar3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(uchar3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(uchar3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(uchar3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(uchar3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(uchar3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(uchar3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(uchar3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rte(float3 v) {
-  return (long3)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtz(float3 v) {
-  return (long3)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtp(float3 v) {
-  return (long3)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE long3 convert_long3_sat_rtn(float3 v) {
-  return (long3)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rte(float3 v) {
-  return (ulong3)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtz(float3 v) {
-  return (ulong3)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtp(float3 v) {
-  return (ulong3)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ulong3 convert_ulong3_sat_rtn(float3 v) {
-  return (ulong3)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rte(float3 v) {
-  return (int3)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtz(float3 v) {
-  return (int3)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtp(float3 v) {
-  return (int3)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE int3 convert_int3_sat_rtn(float3 v) {
-  return (int3)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rte(float3 v) {
-  return (uint3)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtz(float3 v) {
-  return (uint3)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtp(float3 v) {
-  return (uint3)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uint3 convert_uint3_sat_rtn(float3 v) {
-  return (uint3)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rte(float3 v) {
-  return (short3)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtz(float3 v) {
-  return (short3)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtp(float3 v) {
-  return (short3)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE short3 convert_short3_sat_rtn(float3 v) {
-  return (short3)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rte(float3 v) {
-  return (ushort3)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtz(float3 v) {
-  return (ushort3)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtp(float3 v) {
-  return (ushort3)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE ushort3 convert_ushort3_sat_rtn(float3 v) {
-  return (ushort3)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rte(float3 v) {
-  return (char3)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtz(float3 v) {
-  return (char3)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtp(float3 v) {
-  return (char3)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE char3 convert_char3_sat_rtn(float3 v) {
-  return (char3)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rte(float3 v) {
-  return (uchar3)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtz(float3 v) {
-  return (uchar3)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtp(float3 v) {
-  return (uchar3)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2));
-}
-
-INLINE OVERLOADABLE uchar3 convert_uchar3_sat_rtn(float3 v) {
-  return (uchar3)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(long4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(long4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(long4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(long4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(long4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(long4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(long4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(long4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(long4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(long4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(long4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(long4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(long4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(long4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(long4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(long4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(long4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(long4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(long4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(long4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(long4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(long4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(long4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(long4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(long4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(long4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(long4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(long4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(long4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(long4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(long4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(long4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(ulong4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ulong4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ulong4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ulong4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ulong4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ulong4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ulong4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ulong4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(ulong4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ulong4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ulong4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ulong4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ulong4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ulong4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ulong4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ulong4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(ulong4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ulong4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ulong4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ulong4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ulong4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ulong4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ulong4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ulong4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(ulong4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ulong4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ulong4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ulong4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ulong4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ulong4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ulong4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ulong4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(int4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(int4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(int4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(int4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(int4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(int4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(int4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(int4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(int4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(int4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(int4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(int4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(int4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(int4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(int4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(int4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(int4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(int4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(int4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(int4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(int4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(int4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(int4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(int4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(int4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(int4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(int4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(int4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(int4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(int4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(int4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(int4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(uint4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uint4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uint4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uint4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uint4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uint4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uint4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uint4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(uint4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uint4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uint4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uint4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uint4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uint4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uint4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uint4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(uint4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uint4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uint4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uint4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uint4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uint4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uint4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uint4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(uint4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uint4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uint4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uint4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uint4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uint4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uint4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uint4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(short4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(short4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(short4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(short4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(short4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(short4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(short4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(short4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(short4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(short4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(short4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(short4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(short4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(short4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(short4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(short4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(short4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(short4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(short4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(short4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(short4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(short4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(short4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(short4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(short4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(short4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(short4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(short4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(short4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(short4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(short4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(short4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(ushort4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(ushort4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(ushort4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(ushort4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(ushort4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(ushort4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(ushort4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(ushort4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(ushort4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(ushort4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(ushort4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(ushort4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(ushort4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(ushort4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(ushort4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(ushort4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(ushort4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(ushort4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(ushort4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(ushort4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(ushort4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(ushort4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(ushort4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(ushort4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(ushort4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(ushort4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(ushort4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(ushort4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(ushort4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(ushort4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(ushort4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(ushort4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(char4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(char4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(char4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(char4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(char4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(char4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(char4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(char4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(char4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(char4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(char4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(char4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(char4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(char4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(char4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(char4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(char4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(char4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(char4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(char4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(char4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(char4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(char4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(char4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(char4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(char4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(char4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(char4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(char4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(char4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(char4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(char4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(uchar4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(uchar4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(uchar4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(uchar4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(uchar4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(uchar4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(uchar4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(uchar4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(uchar4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(uchar4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(uchar4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(uchar4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(uchar4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(uchar4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(uchar4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(uchar4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(uchar4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(uchar4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(uchar4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(uchar4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(uchar4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(uchar4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(uchar4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(uchar4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(uchar4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(uchar4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(uchar4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(uchar4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(uchar4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(uchar4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(uchar4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(uchar4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rte(float4 v) {
-  return (long4)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtz(float4 v) {
-  return (long4)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtp(float4 v) {
-  return (long4)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE long4 convert_long4_sat_rtn(float4 v) {
-  return (long4)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rte(float4 v) {
-  return (ulong4)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtz(float4 v) {
-  return (ulong4)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtp(float4 v) {
-  return (ulong4)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ulong4 convert_ulong4_sat_rtn(float4 v) {
-  return (ulong4)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rte(float4 v) {
-  return (int4)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtz(float4 v) {
-  return (int4)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtp(float4 v) {
-  return (int4)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE int4 convert_int4_sat_rtn(float4 v) {
-  return (int4)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rte(float4 v) {
-  return (uint4)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtz(float4 v) {
-  return (uint4)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtp(float4 v) {
-  return (uint4)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uint4 convert_uint4_sat_rtn(float4 v) {
-  return (uint4)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rte(float4 v) {
-  return (short4)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtz(float4 v) {
-  return (short4)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtp(float4 v) {
-  return (short4)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE short4 convert_short4_sat_rtn(float4 v) {
-  return (short4)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rte(float4 v) {
-  return (ushort4)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtz(float4 v) {
-  return (ushort4)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtp(float4 v) {
-  return (ushort4)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE ushort4 convert_ushort4_sat_rtn(float4 v) {
-  return (ushort4)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rte(float4 v) {
-  return (char4)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtz(float4 v) {
-  return (char4)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtp(float4 v) {
-  return (char4)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE char4 convert_char4_sat_rtn(float4 v) {
-  return (char4)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rte(float4 v) {
-  return (uchar4)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtz(float4 v) {
-  return (uchar4)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtp(float4 v) {
-  return (uchar4)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3));
-}
-
-INLINE OVERLOADABLE uchar4 convert_uchar4_sat_rtn(float4 v) {
-  return (uchar4)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(long8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(long8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(long8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(long8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(long8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(long8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(long8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(long8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(long8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(long8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(long8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(long8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(long8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(long8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(long8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(long8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(long8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(long8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(long8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(long8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(long8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(long8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(long8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(long8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(long8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(long8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(long8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(long8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(long8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(long8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(long8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(long8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(ulong8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ulong8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ulong8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ulong8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ulong8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ulong8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ulong8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ulong8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(ulong8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ulong8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ulong8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ulong8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ulong8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ulong8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ulong8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ulong8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(ulong8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ulong8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ulong8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ulong8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ulong8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ulong8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ulong8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ulong8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(ulong8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ulong8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ulong8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ulong8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ulong8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ulong8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ulong8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ulong8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(int8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(int8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(int8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(int8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(int8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(int8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(int8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(int8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(int8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(int8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(int8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(int8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(int8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(int8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(int8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(int8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(int8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(int8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(int8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(int8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(int8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(int8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(int8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(int8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(int8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(int8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(int8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(int8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(int8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(int8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(int8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(int8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(uint8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uint8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uint8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uint8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uint8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uint8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uint8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uint8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(uint8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uint8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uint8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uint8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uint8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uint8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uint8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uint8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(uint8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uint8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uint8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uint8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uint8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uint8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uint8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uint8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(uint8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uint8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uint8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uint8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uint8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uint8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uint8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uint8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(short8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(short8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(short8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(short8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(short8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(short8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(short8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(short8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(short8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(short8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(short8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(short8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(short8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(short8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(short8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(short8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(short8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(short8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(short8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(short8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(short8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(short8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(short8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(short8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(short8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(short8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(short8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(short8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(short8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(short8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(short8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(short8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(ushort8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(ushort8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(ushort8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(ushort8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(ushort8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(ushort8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(ushort8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(ushort8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(ushort8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(ushort8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(ushort8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(ushort8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(ushort8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(ushort8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(ushort8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(ushort8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(ushort8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(ushort8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(ushort8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(ushort8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(ushort8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(ushort8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(ushort8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(ushort8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(ushort8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(ushort8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(ushort8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(ushort8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(ushort8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(ushort8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(ushort8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(ushort8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(char8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(char8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(char8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(char8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(char8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(char8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(char8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(char8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(char8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(char8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(char8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(char8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(char8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(char8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(char8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(char8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(char8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(char8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(char8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(char8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(char8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(char8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(char8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(char8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(char8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(char8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(char8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(char8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(char8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(char8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(char8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(char8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(uchar8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(uchar8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(uchar8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(uchar8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(uchar8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(uchar8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(uchar8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(uchar8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(uchar8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(uchar8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(uchar8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(uchar8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(uchar8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(uchar8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(uchar8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(uchar8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(uchar8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(uchar8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(uchar8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(uchar8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(uchar8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(uchar8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(uchar8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(uchar8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(uchar8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(uchar8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(uchar8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(uchar8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(uchar8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(uchar8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(uchar8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(uchar8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rte(float8 v) {
-  return (long8)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtz(float8 v) {
-  return (long8)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtp(float8 v) {
-  return (long8)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE long8 convert_long8_sat_rtn(float8 v) {
-  return (long8)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rte(float8 v) {
-  return (ulong8)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtz(float8 v) {
-  return (ulong8)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtp(float8 v) {
-  return (ulong8)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ulong8 convert_ulong8_sat_rtn(float8 v) {
-  return (ulong8)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rte(float8 v) {
-  return (int8)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtz(float8 v) {
-  return (int8)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtp(float8 v) {
-  return (int8)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE int8 convert_int8_sat_rtn(float8 v) {
-  return (int8)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rte(float8 v) {
-  return (uint8)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtz(float8 v) {
-  return (uint8)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtp(float8 v) {
-  return (uint8)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uint8 convert_uint8_sat_rtn(float8 v) {
-  return (uint8)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rte(float8 v) {
-  return (short8)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtz(float8 v) {
-  return (short8)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtp(float8 v) {
-  return (short8)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE short8 convert_short8_sat_rtn(float8 v) {
-  return (short8)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rte(float8 v) {
-  return (ushort8)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtz(float8 v) {
-  return (ushort8)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtp(float8 v) {
-  return (ushort8)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE ushort8 convert_ushort8_sat_rtn(float8 v) {
-  return (ushort8)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rte(float8 v) {
-  return (char8)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtz(float8 v) {
-  return (char8)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtp(float8 v) {
-  return (char8)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE char8 convert_char8_sat_rtn(float8 v) {
-  return (char8)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rte(float8 v) {
-  return (uchar8)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtz(float8 v) {
-  return (uchar8)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtp(float8 v) {
-  return (uchar8)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7));
-}
-
-INLINE OVERLOADABLE uchar8 convert_uchar8_sat_rtn(float8 v) {
-  return (uchar8)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(long16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(long16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(long16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(long16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(long16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(long16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(long16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(long16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(long16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(long16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(long16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(long16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(long16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(long16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(long16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(long16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(long16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(long16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(long16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(long16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(long16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(long16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(long16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(long16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(long16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(long16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(long16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(long16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(long16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(long16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(long16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(long16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(ulong16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ulong16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ulong16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ulong16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ulong16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ulong16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ulong16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ulong16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(ulong16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ulong16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ulong16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ulong16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ulong16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ulong16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ulong16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ulong16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(ulong16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ulong16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ulong16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ulong16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ulong16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ulong16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ulong16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ulong16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(ulong16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ulong16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ulong16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ulong16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ulong16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ulong16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ulong16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ulong16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(int16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(int16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(int16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(int16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(int16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(int16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(int16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(int16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(int16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(int16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(int16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(int16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(int16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(int16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(int16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(int16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(int16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(int16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(int16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(int16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(int16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(int16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(int16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(int16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(int16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(int16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(int16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(int16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(int16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(int16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(int16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(int16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(uint16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uint16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uint16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uint16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uint16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uint16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uint16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uint16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(uint16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uint16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uint16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uint16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uint16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uint16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uint16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uint16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(uint16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uint16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uint16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uint16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uint16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uint16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uint16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uint16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(uint16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uint16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uint16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uint16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uint16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uint16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uint16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uint16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(short16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(short16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(short16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(short16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(short16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(short16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(short16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(short16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(short16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(short16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(short16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(short16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(short16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(short16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(short16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(short16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(short16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(short16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(short16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(short16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(short16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(short16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(short16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(short16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(short16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(short16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(short16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(short16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(short16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(short16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(short16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(short16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(ushort16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(ushort16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(ushort16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(ushort16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(ushort16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(ushort16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(ushort16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(ushort16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(ushort16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(ushort16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(ushort16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(ushort16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(ushort16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(ushort16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(ushort16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(ushort16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(ushort16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(ushort16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(ushort16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(ushort16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(ushort16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(ushort16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(ushort16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(ushort16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(ushort16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(ushort16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(ushort16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(ushort16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(ushort16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(ushort16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(ushort16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(ushort16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(char16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(char16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(char16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(char16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(char16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(char16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(char16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(char16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(char16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(char16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(char16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(char16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(char16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(char16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(char16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(char16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(char16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(char16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(char16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(char16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(char16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(char16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(char16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(char16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(char16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(char16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(char16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(char16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(char16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(char16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(char16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(char16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(uchar16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(uchar16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(uchar16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(uchar16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(uchar16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(uchar16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(uchar16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(uchar16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(uchar16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(uchar16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(uchar16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(uchar16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(uchar16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(uchar16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(uchar16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(uchar16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(uchar16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(uchar16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(uchar16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(uchar16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(uchar16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(uchar16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(uchar16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(uchar16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(uchar16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(uchar16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(uchar16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(uchar16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(uchar16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(uchar16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(uchar16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(uchar16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rte(float16 v) {
-  return (long16)(convert_long_sat_rte(v.s0), convert_long_sat_rte(v.s1), convert_long_sat_rte(v.s2), convert_long_sat_rte(v.s3), convert_long_sat_rte(v.s4), convert_long_sat_rte(v.s5), convert_long_sat_rte(v.s6), convert_long_sat_rte(v.s7), convert_long_sat_rte(v.s8), convert_long_sat_rte(v.s9), convert_long_sat_rte(v.sA), convert_long_sat_rte(v.sB), convert_long_sat_rte(v.sC), convert_long_sat_rte(v.sD), convert_long_sat_rte(v.sE), convert_long_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtz(float16 v) {
-  return (long16)(convert_long_sat_rtz(v.s0), convert_long_sat_rtz(v.s1), convert_long_sat_rtz(v.s2), convert_long_sat_rtz(v.s3), convert_long_sat_rtz(v.s4), convert_long_sat_rtz(v.s5), convert_long_sat_rtz(v.s6), convert_long_sat_rtz(v.s7), convert_long_sat_rtz(v.s8), convert_long_sat_rtz(v.s9), convert_long_sat_rtz(v.sA), convert_long_sat_rtz(v.sB), convert_long_sat_rtz(v.sC), convert_long_sat_rtz(v.sD), convert_long_sat_rtz(v.sE), convert_long_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtp(float16 v) {
-  return (long16)(convert_long_sat_rtp(v.s0), convert_long_sat_rtp(v.s1), convert_long_sat_rtp(v.s2), convert_long_sat_rtp(v.s3), convert_long_sat_rtp(v.s4), convert_long_sat_rtp(v.s5), convert_long_sat_rtp(v.s6), convert_long_sat_rtp(v.s7), convert_long_sat_rtp(v.s8), convert_long_sat_rtp(v.s9), convert_long_sat_rtp(v.sA), convert_long_sat_rtp(v.sB), convert_long_sat_rtp(v.sC), convert_long_sat_rtp(v.sD), convert_long_sat_rtp(v.sE), convert_long_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE long16 convert_long16_sat_rtn(float16 v) {
-  return (long16)(convert_long_sat_rtn(v.s0), convert_long_sat_rtn(v.s1), convert_long_sat_rtn(v.s2), convert_long_sat_rtn(v.s3), convert_long_sat_rtn(v.s4), convert_long_sat_rtn(v.s5), convert_long_sat_rtn(v.s6), convert_long_sat_rtn(v.s7), convert_long_sat_rtn(v.s8), convert_long_sat_rtn(v.s9), convert_long_sat_rtn(v.sA), convert_long_sat_rtn(v.sB), convert_long_sat_rtn(v.sC), convert_long_sat_rtn(v.sD), convert_long_sat_rtn(v.sE), convert_long_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rte(float16 v) {
-  return (ulong16)(convert_ulong_sat_rte(v.s0), convert_ulong_sat_rte(v.s1), convert_ulong_sat_rte(v.s2), convert_ulong_sat_rte(v.s3), convert_ulong_sat_rte(v.s4), convert_ulong_sat_rte(v.s5), convert_ulong_sat_rte(v.s6), convert_ulong_sat_rte(v.s7), convert_ulong_sat_rte(v.s8), convert_ulong_sat_rte(v.s9), convert_ulong_sat_rte(v.sA), convert_ulong_sat_rte(v.sB), convert_ulong_sat_rte(v.sC), convert_ulong_sat_rte(v.sD), convert_ulong_sat_rte(v.sE), convert_ulong_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtz(float16 v) {
-  return (ulong16)(convert_ulong_sat_rtz(v.s0), convert_ulong_sat_rtz(v.s1), convert_ulong_sat_rtz(v.s2), convert_ulong_sat_rtz(v.s3), convert_ulong_sat_rtz(v.s4), convert_ulong_sat_rtz(v.s5), convert_ulong_sat_rtz(v.s6), convert_ulong_sat_rtz(v.s7), convert_ulong_sat_rtz(v.s8), convert_ulong_sat_rtz(v.s9), convert_ulong_sat_rtz(v.sA), convert_ulong_sat_rtz(v.sB), convert_ulong_sat_rtz(v.sC), convert_ulong_sat_rtz(v.sD), convert_ulong_sat_rtz(v.sE), convert_ulong_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtp(float16 v) {
-  return (ulong16)(convert_ulong_sat_rtp(v.s0), convert_ulong_sat_rtp(v.s1), convert_ulong_sat_rtp(v.s2), convert_ulong_sat_rtp(v.s3), convert_ulong_sat_rtp(v.s4), convert_ulong_sat_rtp(v.s5), convert_ulong_sat_rtp(v.s6), convert_ulong_sat_rtp(v.s7), convert_ulong_sat_rtp(v.s8), convert_ulong_sat_rtp(v.s9), convert_ulong_sat_rtp(v.sA), convert_ulong_sat_rtp(v.sB), convert_ulong_sat_rtp(v.sC), convert_ulong_sat_rtp(v.sD), convert_ulong_sat_rtp(v.sE), convert_ulong_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE ulong16 convert_ulong16_sat_rtn(float16 v) {
-  return (ulong16)(convert_ulong_sat_rtn(v.s0), convert_ulong_sat_rtn(v.s1), convert_ulong_sat_rtn(v.s2), convert_ulong_sat_rtn(v.s3), convert_ulong_sat_rtn(v.s4), convert_ulong_sat_rtn(v.s5), convert_ulong_sat_rtn(v.s6), convert_ulong_sat_rtn(v.s7), convert_ulong_sat_rtn(v.s8), convert_ulong_sat_rtn(v.s9), convert_ulong_sat_rtn(v.sA), convert_ulong_sat_rtn(v.sB), convert_ulong_sat_rtn(v.sC), convert_ulong_sat_rtn(v.sD), convert_ulong_sat_rtn(v.sE), convert_ulong_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rte(float16 v) {
-  return (int16)(convert_int_sat_rte(v.s0), convert_int_sat_rte(v.s1), convert_int_sat_rte(v.s2), convert_int_sat_rte(v.s3), convert_int_sat_rte(v.s4), convert_int_sat_rte(v.s5), convert_int_sat_rte(v.s6), convert_int_sat_rte(v.s7), convert_int_sat_rte(v.s8), convert_int_sat_rte(v.s9), convert_int_sat_rte(v.sA), convert_int_sat_rte(v.sB), convert_int_sat_rte(v.sC), convert_int_sat_rte(v.sD), convert_int_sat_rte(v.sE), convert_int_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtz(float16 v) {
-  return (int16)(convert_int_sat_rtz(v.s0), convert_int_sat_rtz(v.s1), convert_int_sat_rtz(v.s2), convert_int_sat_rtz(v.s3), convert_int_sat_rtz(v.s4), convert_int_sat_rtz(v.s5), convert_int_sat_rtz(v.s6), convert_int_sat_rtz(v.s7), convert_int_sat_rtz(v.s8), convert_int_sat_rtz(v.s9), convert_int_sat_rtz(v.sA), convert_int_sat_rtz(v.sB), convert_int_sat_rtz(v.sC), convert_int_sat_rtz(v.sD), convert_int_sat_rtz(v.sE), convert_int_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtp(float16 v) {
-  return (int16)(convert_int_sat_rtp(v.s0), convert_int_sat_rtp(v.s1), convert_int_sat_rtp(v.s2), convert_int_sat_rtp(v.s3), convert_int_sat_rtp(v.s4), convert_int_sat_rtp(v.s5), convert_int_sat_rtp(v.s6), convert_int_sat_rtp(v.s7), convert_int_sat_rtp(v.s8), convert_int_sat_rtp(v.s9), convert_int_sat_rtp(v.sA), convert_int_sat_rtp(v.sB), convert_int_sat_rtp(v.sC), convert_int_sat_rtp(v.sD), convert_int_sat_rtp(v.sE), convert_int_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE int16 convert_int16_sat_rtn(float16 v) {
-  return (int16)(convert_int_sat_rtn(v.s0), convert_int_sat_rtn(v.s1), convert_int_sat_rtn(v.s2), convert_int_sat_rtn(v.s3), convert_int_sat_rtn(v.s4), convert_int_sat_rtn(v.s5), convert_int_sat_rtn(v.s6), convert_int_sat_rtn(v.s7), convert_int_sat_rtn(v.s8), convert_int_sat_rtn(v.s9), convert_int_sat_rtn(v.sA), convert_int_sat_rtn(v.sB), convert_int_sat_rtn(v.sC), convert_int_sat_rtn(v.sD), convert_int_sat_rtn(v.sE), convert_int_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rte(float16 v) {
-  return (uint16)(convert_uint_sat_rte(v.s0), convert_uint_sat_rte(v.s1), convert_uint_sat_rte(v.s2), convert_uint_sat_rte(v.s3), convert_uint_sat_rte(v.s4), convert_uint_sat_rte(v.s5), convert_uint_sat_rte(v.s6), convert_uint_sat_rte(v.s7), convert_uint_sat_rte(v.s8), convert_uint_sat_rte(v.s9), convert_uint_sat_rte(v.sA), convert_uint_sat_rte(v.sB), convert_uint_sat_rte(v.sC), convert_uint_sat_rte(v.sD), convert_uint_sat_rte(v.sE), convert_uint_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtz(float16 v) {
-  return (uint16)(convert_uint_sat_rtz(v.s0), convert_uint_sat_rtz(v.s1), convert_uint_sat_rtz(v.s2), convert_uint_sat_rtz(v.s3), convert_uint_sat_rtz(v.s4), convert_uint_sat_rtz(v.s5), convert_uint_sat_rtz(v.s6), convert_uint_sat_rtz(v.s7), convert_uint_sat_rtz(v.s8), convert_uint_sat_rtz(v.s9), convert_uint_sat_rtz(v.sA), convert_uint_sat_rtz(v.sB), convert_uint_sat_rtz(v.sC), convert_uint_sat_rtz(v.sD), convert_uint_sat_rtz(v.sE), convert_uint_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtp(float16 v) {
-  return (uint16)(convert_uint_sat_rtp(v.s0), convert_uint_sat_rtp(v.s1), convert_uint_sat_rtp(v.s2), convert_uint_sat_rtp(v.s3), convert_uint_sat_rtp(v.s4), convert_uint_sat_rtp(v.s5), convert_uint_sat_rtp(v.s6), convert_uint_sat_rtp(v.s7), convert_uint_sat_rtp(v.s8), convert_uint_sat_rtp(v.s9), convert_uint_sat_rtp(v.sA), convert_uint_sat_rtp(v.sB), convert_uint_sat_rtp(v.sC), convert_uint_sat_rtp(v.sD), convert_uint_sat_rtp(v.sE), convert_uint_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uint16 convert_uint16_sat_rtn(float16 v) {
-  return (uint16)(convert_uint_sat_rtn(v.s0), convert_uint_sat_rtn(v.s1), convert_uint_sat_rtn(v.s2), convert_uint_sat_rtn(v.s3), convert_uint_sat_rtn(v.s4), convert_uint_sat_rtn(v.s5), convert_uint_sat_rtn(v.s6), convert_uint_sat_rtn(v.s7), convert_uint_sat_rtn(v.s8), convert_uint_sat_rtn(v.s9), convert_uint_sat_rtn(v.sA), convert_uint_sat_rtn(v.sB), convert_uint_sat_rtn(v.sC), convert_uint_sat_rtn(v.sD), convert_uint_sat_rtn(v.sE), convert_uint_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rte(float16 v) {
-  return (short16)(convert_short_sat_rte(v.s0), convert_short_sat_rte(v.s1), convert_short_sat_rte(v.s2), convert_short_sat_rte(v.s3), convert_short_sat_rte(v.s4), convert_short_sat_rte(v.s5), convert_short_sat_rte(v.s6), convert_short_sat_rte(v.s7), convert_short_sat_rte(v.s8), convert_short_sat_rte(v.s9), convert_short_sat_rte(v.sA), convert_short_sat_rte(v.sB), convert_short_sat_rte(v.sC), convert_short_sat_rte(v.sD), convert_short_sat_rte(v.sE), convert_short_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtz(float16 v) {
-  return (short16)(convert_short_sat_rtz(v.s0), convert_short_sat_rtz(v.s1), convert_short_sat_rtz(v.s2), convert_short_sat_rtz(v.s3), convert_short_sat_rtz(v.s4), convert_short_sat_rtz(v.s5), convert_short_sat_rtz(v.s6), convert_short_sat_rtz(v.s7), convert_short_sat_rtz(v.s8), convert_short_sat_rtz(v.s9), convert_short_sat_rtz(v.sA), convert_short_sat_rtz(v.sB), convert_short_sat_rtz(v.sC), convert_short_sat_rtz(v.sD), convert_short_sat_rtz(v.sE), convert_short_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtp(float16 v) {
-  return (short16)(convert_short_sat_rtp(v.s0), convert_short_sat_rtp(v.s1), convert_short_sat_rtp(v.s2), convert_short_sat_rtp(v.s3), convert_short_sat_rtp(v.s4), convert_short_sat_rtp(v.s5), convert_short_sat_rtp(v.s6), convert_short_sat_rtp(v.s7), convert_short_sat_rtp(v.s8), convert_short_sat_rtp(v.s9), convert_short_sat_rtp(v.sA), convert_short_sat_rtp(v.sB), convert_short_sat_rtp(v.sC), convert_short_sat_rtp(v.sD), convert_short_sat_rtp(v.sE), convert_short_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE short16 convert_short16_sat_rtn(float16 v) {
-  return (short16)(convert_short_sat_rtn(v.s0), convert_short_sat_rtn(v.s1), convert_short_sat_rtn(v.s2), convert_short_sat_rtn(v.s3), convert_short_sat_rtn(v.s4), convert_short_sat_rtn(v.s5), convert_short_sat_rtn(v.s6), convert_short_sat_rtn(v.s7), convert_short_sat_rtn(v.s8), convert_short_sat_rtn(v.s9), convert_short_sat_rtn(v.sA), convert_short_sat_rtn(v.sB), convert_short_sat_rtn(v.sC), convert_short_sat_rtn(v.sD), convert_short_sat_rtn(v.sE), convert_short_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rte(float16 v) {
-  return (ushort16)(convert_ushort_sat_rte(v.s0), convert_ushort_sat_rte(v.s1), convert_ushort_sat_rte(v.s2), convert_ushort_sat_rte(v.s3), convert_ushort_sat_rte(v.s4), convert_ushort_sat_rte(v.s5), convert_ushort_sat_rte(v.s6), convert_ushort_sat_rte(v.s7), convert_ushort_sat_rte(v.s8), convert_ushort_sat_rte(v.s9), convert_ushort_sat_rte(v.sA), convert_ushort_sat_rte(v.sB), convert_ushort_sat_rte(v.sC), convert_ushort_sat_rte(v.sD), convert_ushort_sat_rte(v.sE), convert_ushort_sat_rte [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtz(float16 v) {
-  return (ushort16)(convert_ushort_sat_rtz(v.s0), convert_ushort_sat_rtz(v.s1), convert_ushort_sat_rtz(v.s2), convert_ushort_sat_rtz(v.s3), convert_ushort_sat_rtz(v.s4), convert_ushort_sat_rtz(v.s5), convert_ushort_sat_rtz(v.s6), convert_ushort_sat_rtz(v.s7), convert_ushort_sat_rtz(v.s8), convert_ushort_sat_rtz(v.s9), convert_ushort_sat_rtz(v.sA), convert_ushort_sat_rtz(v.sB), convert_ushort_sat_rtz(v.sC), convert_ushort_sat_rtz(v.sD), convert_ushort_sat_rtz(v.sE), convert_ushort_sat_rtz [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtp(float16 v) {
-  return (ushort16)(convert_ushort_sat_rtp(v.s0), convert_ushort_sat_rtp(v.s1), convert_ushort_sat_rtp(v.s2), convert_ushort_sat_rtp(v.s3), convert_ushort_sat_rtp(v.s4), convert_ushort_sat_rtp(v.s5), convert_ushort_sat_rtp(v.s6), convert_ushort_sat_rtp(v.s7), convert_ushort_sat_rtp(v.s8), convert_ushort_sat_rtp(v.s9), convert_ushort_sat_rtp(v.sA), convert_ushort_sat_rtp(v.sB), convert_ushort_sat_rtp(v.sC), convert_ushort_sat_rtp(v.sD), convert_ushort_sat_rtp(v.sE), convert_ushort_sat_rtp [...]
-}
-
-INLINE OVERLOADABLE ushort16 convert_ushort16_sat_rtn(float16 v) {
-  return (ushort16)(convert_ushort_sat_rtn(v.s0), convert_ushort_sat_rtn(v.s1), convert_ushort_sat_rtn(v.s2), convert_ushort_sat_rtn(v.s3), convert_ushort_sat_rtn(v.s4), convert_ushort_sat_rtn(v.s5), convert_ushort_sat_rtn(v.s6), convert_ushort_sat_rtn(v.s7), convert_ushort_sat_rtn(v.s8), convert_ushort_sat_rtn(v.s9), convert_ushort_sat_rtn(v.sA), convert_ushort_sat_rtn(v.sB), convert_ushort_sat_rtn(v.sC), convert_ushort_sat_rtn(v.sD), convert_ushort_sat_rtn(v.sE), convert_ushort_sat_rtn [...]
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rte(float16 v) {
-  return (char16)(convert_char_sat_rte(v.s0), convert_char_sat_rte(v.s1), convert_char_sat_rte(v.s2), convert_char_sat_rte(v.s3), convert_char_sat_rte(v.s4), convert_char_sat_rte(v.s5), convert_char_sat_rte(v.s6), convert_char_sat_rte(v.s7), convert_char_sat_rte(v.s8), convert_char_sat_rte(v.s9), convert_char_sat_rte(v.sA), convert_char_sat_rte(v.sB), convert_char_sat_rte(v.sC), convert_char_sat_rte(v.sD), convert_char_sat_rte(v.sE), convert_char_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtz(float16 v) {
-  return (char16)(convert_char_sat_rtz(v.s0), convert_char_sat_rtz(v.s1), convert_char_sat_rtz(v.s2), convert_char_sat_rtz(v.s3), convert_char_sat_rtz(v.s4), convert_char_sat_rtz(v.s5), convert_char_sat_rtz(v.s6), convert_char_sat_rtz(v.s7), convert_char_sat_rtz(v.s8), convert_char_sat_rtz(v.s9), convert_char_sat_rtz(v.sA), convert_char_sat_rtz(v.sB), convert_char_sat_rtz(v.sC), convert_char_sat_rtz(v.sD), convert_char_sat_rtz(v.sE), convert_char_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtp(float16 v) {
-  return (char16)(convert_char_sat_rtp(v.s0), convert_char_sat_rtp(v.s1), convert_char_sat_rtp(v.s2), convert_char_sat_rtp(v.s3), convert_char_sat_rtp(v.s4), convert_char_sat_rtp(v.s5), convert_char_sat_rtp(v.s6), convert_char_sat_rtp(v.s7), convert_char_sat_rtp(v.s8), convert_char_sat_rtp(v.s9), convert_char_sat_rtp(v.sA), convert_char_sat_rtp(v.sB), convert_char_sat_rtp(v.sC), convert_char_sat_rtp(v.sD), convert_char_sat_rtp(v.sE), convert_char_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE char16 convert_char16_sat_rtn(float16 v) {
-  return (char16)(convert_char_sat_rtn(v.s0), convert_char_sat_rtn(v.s1), convert_char_sat_rtn(v.s2), convert_char_sat_rtn(v.s3), convert_char_sat_rtn(v.s4), convert_char_sat_rtn(v.s5), convert_char_sat_rtn(v.s6), convert_char_sat_rtn(v.s7), convert_char_sat_rtn(v.s8), convert_char_sat_rtn(v.s9), convert_char_sat_rtn(v.sA), convert_char_sat_rtn(v.sB), convert_char_sat_rtn(v.sC), convert_char_sat_rtn(v.sD), convert_char_sat_rtn(v.sE), convert_char_sat_rtn(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rte(float16 v) {
-  return (uchar16)(convert_uchar_sat_rte(v.s0), convert_uchar_sat_rte(v.s1), convert_uchar_sat_rte(v.s2), convert_uchar_sat_rte(v.s3), convert_uchar_sat_rte(v.s4), convert_uchar_sat_rte(v.s5), convert_uchar_sat_rte(v.s6), convert_uchar_sat_rte(v.s7), convert_uchar_sat_rte(v.s8), convert_uchar_sat_rte(v.s9), convert_uchar_sat_rte(v.sA), convert_uchar_sat_rte(v.sB), convert_uchar_sat_rte(v.sC), convert_uchar_sat_rte(v.sD), convert_uchar_sat_rte(v.sE), convert_uchar_sat_rte(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtz(float16 v) {
-  return (uchar16)(convert_uchar_sat_rtz(v.s0), convert_uchar_sat_rtz(v.s1), convert_uchar_sat_rtz(v.s2), convert_uchar_sat_rtz(v.s3), convert_uchar_sat_rtz(v.s4), convert_uchar_sat_rtz(v.s5), convert_uchar_sat_rtz(v.s6), convert_uchar_sat_rtz(v.s7), convert_uchar_sat_rtz(v.s8), convert_uchar_sat_rtz(v.s9), convert_uchar_sat_rtz(v.sA), convert_uchar_sat_rtz(v.sB), convert_uchar_sat_rtz(v.sC), convert_uchar_sat_rtz(v.sD), convert_uchar_sat_rtz(v.sE), convert_uchar_sat_rtz(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtp(float16 v) {
-  return (uchar16)(convert_uchar_sat_rtp(v.s0), convert_uchar_sat_rtp(v.s1), convert_uchar_sat_rtp(v.s2), convert_uchar_sat_rtp(v.s3), convert_uchar_sat_rtp(v.s4), convert_uchar_sat_rtp(v.s5), convert_uchar_sat_rtp(v.s6), convert_uchar_sat_rtp(v.s7), convert_uchar_sat_rtp(v.s8), convert_uchar_sat_rtp(v.s9), convert_uchar_sat_rtp(v.sA), convert_uchar_sat_rtp(v.sB), convert_uchar_sat_rtp(v.sC), convert_uchar_sat_rtp(v.sD), convert_uchar_sat_rtp(v.sE), convert_uchar_sat_rtp(v.sF));
-}
-
-INLINE OVERLOADABLE uchar16 convert_uchar16_sat_rtn(float16 v) {
-  return (uchar16)(convert_uchar_sat_rtn(v.s0), convert_uchar_sat_rtn(v.s1), convert_uchar_sat_rtn(v.s2), convert_uchar_sat_rtn(v.s3), convert_uchar_sat_rtn(v.s4), convert_uchar_sat_rtn(v.s5), convert_uchar_sat_rtn(v.s6), convert_uchar_sat_rtn(v.s7), convert_uchar_sat_rtn(v.s8), convert_uchar_sat_rtn(v.s9), convert_uchar_sat_rtn(v.sA), convert_uchar_sat_rtn(v.sB), convert_uchar_sat_rtn(v.sC), convert_uchar_sat_rtn(v.sD), convert_uchar_sat_rtn(v.sE), convert_uchar_sat_rtn(v.sF));
-}
-
diff --git a/backend/src/ocl_stdlib.tmpl.h b/backend/src/ocl_stdlib.tmpl.h
deleted file mode 100755
index f648a8c..0000000
--- a/backend/src/ocl_stdlib.tmpl.h
+++ /dev/null
@@ -1,5160 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
- */
-
-#ifndef __GEN_OCL_STDLIB_H__
-#define __GEN_OCL_STDLIB_H__
-
-#define INLINE inline __attribute__((always_inline))
-#define OVERLOADABLE __attribute__((overloadable))
-#define PURE __attribute__((pure))
-#define CONST __attribute__((const))
-#define INLINE_OVERLOADABLE inline __attribute__((overloadable,always_inline))
-// FIXME, clang's opencl FE doesn't support static.
-#define static
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL built-in scalar data types
-/////////////////////////////////////////////////////////////////////////////
-typedef unsigned char uchar;
-typedef unsigned short ushort;
-typedef unsigned int uint;
-typedef unsigned long ulong;
-typedef __typeof__(sizeof(int)) size_t;
-typedef __typeof__((int *)0-(int *)0) ptrdiff_t;
-typedef signed int intptr_t;
-typedef unsigned int uintptr_t;
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL address space
-/////////////////////////////////////////////////////////////////////////////
-// These are built-ins in LLVM 3.3.
-#if 100*__clang_major__ + __clang_minor__ <= 302
-#define __private __attribute__((address_space(0)))
-#define __global __attribute__((address_space(1)))
-#define __constant __attribute__((address_space(2)))
-#define __local __attribute__((address_space(3)))
-#define global __global
-#define local __local
-#define constant __constant
-#define private __private
-#endif
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL built-in vector data types
-/////////////////////////////////////////////////////////////////////////////
-#define DEF(type) typedef type type##2 __attribute__((ext_vector_type(2)));\
-                  typedef type type##3 __attribute__((ext_vector_type(3)));\
-                  typedef type type##4 __attribute__((ext_vector_type(4)));\
-                  typedef type type##8 __attribute__((ext_vector_type(8)));\
-                  typedef type type##16 __attribute__((ext_vector_type(16)));
-DEF(char);
-DEF(uchar);
-DEF(short);
-DEF(ushort);
-DEF(int);
-DEF(uint);
-DEF(long);
-DEF(ulong);
-DEF(float);
-DEF(double);
-#undef DEF
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL other built-in data types
-/////////////////////////////////////////////////////////////////////////////
-// FIXME:
-// This is a transitional hack to bypass the LLVM 3.3 built-in types.
-// See the Khronos SPIR specification for handling of these types.
-#define __texture __attribute__((address_space(4)))
-struct _image1d_t;
-typedef __texture struct _image1d_t* __image1d_t;
-struct _image1d_buffer_t;
-typedef __texture struct _image1d_buffer_t* __image1d_buffer_t;
-struct _image1d_array_t;
-typedef __texture struct _image1d_array_t* __image1d_array_t;
-struct _image2d_t;
-typedef __texture struct _image2d_t* __image2d_t;
-struct _image2d_array_t;
-typedef __texture struct _image2d_array_t* __image2d_array_t;
-struct _image3d_t;
-typedef __texture struct _image3d_t* __image3d_t;
-typedef const ushort __sampler_t;
-typedef size_t __event_t;
-#define image1d_t __image1d_t
-#define image1d_buffer_t __image1d_buffer_t
-#define image1d_array_t __image1d_array_t
-#define image2d_t __image2d_t
-#define image2d_array_t __image2d_array_t
-#define image3d_t __image3d_t
-#define sampler_t __sampler_t
-#define event_t __event_t
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL conversions & type casting
-/////////////////////////////////////////////////////////////////////////////
-
-// ##BEGIN_AS##
-
-// ##END_AS##
-
-// ##BEGIN_CONVERT##
-
-// ##END_CONVERT##
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL preprocessor directives & macros
-/////////////////////////////////////////////////////////////////////////////
-#define __OPENCL_VERSION__ 120
-#define __CL_VERSION_1_0__ 100
-#define __CL_VERSION_1_1__ 110
-#define __CL_VERSION_1_2__ 120
-
-#define __ENDIAN_LITTLE__ 1
-#define __IMAGE_SUPPORT__ 1
-#define __kernel_exec(X, TYPE) __kernel __attribute__((work_group_size_hint(X,1,1))) \
-                                        __attribute__((vec_type_hint(TYPE)))
-#define kernel_exec(X, TYPE) __kernel_exec(X, TYPE)
-#define cl_khr_global_int32_base_atomics
-#define cl_khr_global_int32_extended_atomics
-#define cl_khr_local_int32_base_atomics
-#define cl_khr_local_int32_extended_atomics
-#define cl_khr_byte_addressable_store
-#define cl_khr_icd
-#define cl_khr_gl_sharing
-
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL floating-point macros and pragmas
-/////////////////////////////////////////////////////////////////////////////
-#define FLT_DIG 6
-#define FLT_MANT_DIG 24
-#define FLT_MAX_10_EXP +38
-#define FLT_MAX_EXP +128
-#define FLT_MIN_10_EXP -37
-#define FLT_MIN_EXP -125
-#define FLT_RADIX 2
-#define FLT_ONE 1.0000000000e+00         /* 0x3F800000 */
-#define FLT_MAX 0x1.fffffep127f
-#define FLT_MIN 0x1.0p-126f
-#define FLT_EPSILON 0x1.0p-23f
-
-#define MAXFLOAT     3.40282347e38F
-INLINE_OVERLOADABLE float __ocl_inff(void) {
-  union { uint u; float f; } u;
-  u.u = 0x7F800000;
-  return u.f;
-}
-INLINE_OVERLOADABLE float __ocl_nanf(void) {
-  union { uint u; float f; } u;
-  u.u = 0x7F800001;
-  return u.f;
-}
-typedef union
-{
-  float value;
-  uint  word;
-} float_shape_type;
-
-/* Get a 32 bit int from a float.  */
-#ifndef GEN_OCL_GET_FLOAT_WORD
-# define GEN_OCL_GET_FLOAT_WORD(i,d)  \
-do {                                  \
-  float_shape_type gf_u;              \
-  gf_u.value = (d);                   \
-  (i) = gf_u.word;                    \
-} while (0)
-#endif
-/* Set a float from a 32 bit int.  */
-#ifndef GEN_OCL_SET_FLOAT_WORD
-# define GEN_OCL_SET_FLOAT_WORD(d,i)  \
-do {                                  \
-  float_shape_type sf_u;              \
-  sf_u.word = (i);                    \
-  (d) = sf_u.value;                   \
-} while (0)
-#endif
-
-INLINE_OVERLOADABLE int __ocl_finitef (float x){
-  unsigned ix;
-  GEN_OCL_GET_FLOAT_WORD (ix, x);
-  return (ix & 0x7fffffff) < 0x7f800000;
-}
-
-#define HUGE_VALF    (__ocl_inff())
-#define INFINITY     (__ocl_inff())
-#define NAN          (__ocl_nanf())
-#define M_E_F        2.718281828459045F
-#define M_LOG2E_F    1.4426950408889634F
-#define M_LOG10E_F   0.43429448190325176F
-#define M_LN2_F      0.6931471805599453F
-#define M_LN10_F     2.302585092994046F
-#define M_PI_F       3.141592653589793F
-#define M_PI_2_F     1.5707963267948966F
-#define M_PI_4_F     0.7853981633974483F
-#define M_1_PI_F     0.3183098861837907F
-#define M_2_PI_F     0.6366197723675814F
-#define M_2_SQRTPI_F 1.1283791670955126F
-#define M_SQRT2_F    1.4142135623730951F
-#define M_SQRT1_2_F  0.7071067811865476F
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL integer built-in macros
-/////////////////////////////////////////////////////////////////////////////
-/* Limits of the integer types. CHAR_MAX/CHAR_MIN alias the signed-char
- * limits defined further down; the *_MIN values are written as
- * (-MAX - 1) so the literal itself never exceeds the positive range. */
-#define CHAR_BIT    8
-#define CHAR_MAX    SCHAR_MAX
-#define CHAR_MIN    SCHAR_MIN
-#define INT_MAX     2147483647
-#define INT_MIN     (-2147483647 - 1)
-#define LONG_MAX    0x7fffffffffffffffL
-#define LONG_MIN    (-0x7fffffffffffffffL - 1)
-#define SCHAR_MAX   127
-#define SCHAR_MIN   (-127 - 1)
-#define SHRT_MAX    32767
-#define SHRT_MIN    (-32767 - 1)
-#define UCHAR_MAX   255
-#define USHRT_MAX   65535
-#define UINT_MAX    0xffffffff
-#define ULONG_MAX   0xffffffffffffffffUL
-/////////////////////////////////////////////////////////////////////////////
-// OpenCL relational built-in functions
-/////////////////////////////////////////////////////////////////////////////
-
-/* Scalar float comparisons: each returns the int result of the
- * corresponding C comparison operator (1 when it holds, 0 otherwise). */
-int INLINE_OVERLOADABLE isequal(float x, float y) {
-  return x == y;
-}
-int INLINE_OVERLOADABLE isnotequal(float x, float y) {
-  return x != y;
-}
-int INLINE_OVERLOADABLE isgreater(float x, float y) {
-  return x > y;
-}
-int INLINE_OVERLOADABLE isgreaterequal(float x, float y) {
-  return x >= y;
-}
-int INLINE_OVERLOADABLE isless(float x, float y) {
-  return x < y;
-}
-int INLINE_OVERLOADABLE islessequal(float x, float y) {
-  return x <= y;
-}
-/* True when x and y compare strictly unequal in either direction. */
-int INLINE_OVERLOADABLE islessgreater(float x, float y) {
-  return (x > y) || (x < y);
-}
-
-/* SDEF(TYPE): for a signed TYPE, declares the ocl_sadd_sat/ocl_ssub_sat
- * helpers (defined elsewhere) and defines add_sat/sub_sat as thin
- * wrappers around them. Instantiated for char and short only. */
-#define SDEF(TYPE)                                                              \
-OVERLOADABLE TYPE ocl_sadd_sat(TYPE x, TYPE y);                          \
-OVERLOADABLE TYPE ocl_ssub_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_sadd_sat(x, y); } \
-INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_ssub_sat(x, y); }
-SDEF(char);
-SDEF(short);
-#undef SDEF
-/* 32-bit signed saturating add/sub. add_sat forwards to ocl_sadd_sat. */
-OVERLOADABLE int ocl_sadd_sat(int x, int y);
-INLINE_OVERLOADABLE int add_sat(int x, int y) { return ocl_sadd_sat(x, y); }
-OVERLOADABLE int ocl_ssub_sat(int x, int y);
-/* sub_sat special-cases y == 0x80000000 (INT_MIN): x - INT_MIN is formed as
- * sat(sat(0x7fffffff + x) + 1) using the saturating add instead of
- * ocl_ssub_sat. NOTE(review): presumably because the subtract helper cannot
- * handle negating INT_MIN -- confirm against the backend. */
-INLINE_OVERLOADABLE int sub_sat(int x, int y) {
-  return (y == 0x80000000u) ? (ocl_sadd_sat(ocl_sadd_sat(0x7fffffff, x), 1)) : ocl_ssub_sat(x, y);
-}
-OVERLOADABLE long ocl_sadd_sat(long x, long y);
-/* 64-bit signed saturating add. When bit 31 of word i[1] differs between
- * the operands the plain sum is returned; otherwise the work is handed to
- * ocl_sadd_sat. (Assumes i[1] holds the upper 32 bits, i.e. a
- * little-endian layout -- TODO confirm.) */
-INLINE_OVERLOADABLE long add_sat(long x, long y) {
-  union {long l; uint i[2];} lhs, rhs;
-  lhs.l = x;
-  rhs.l = y;
-  uint sign_differs = (lhs.i[1] ^ rhs.i[1]) & 0x80000000u;
-  return sign_differs ? (x + y) : ocl_sadd_sat(x, y);
-}
-OVERLOADABLE long ocl_ssub_sat(long x, long y);
-/* 64-bit signed saturating subtract. When bit 31 of word i[1] is the same
- * in both operands the plain difference is returned; otherwise the work is
- * handed to ocl_ssub_sat. (Assumes i[1] holds the upper 32 bits, i.e. a
- * little-endian layout -- TODO confirm.) */
-INLINE_OVERLOADABLE long sub_sat(long x, long y) {
-  union {long l; uint i[2];} lhs, rhs;
-  lhs.l = x;
-  rhs.l = y;
-  uint sign_differs = (lhs.i[1] ^ rhs.i[1]) & 0x80000000u;
-  return sign_differs ? ocl_ssub_sat(x, y) : (x - y);
-}
-/* UDEF(TYPE): for an unsigned TYPE, declares the ocl_uadd_sat/ocl_usub_sat
- * helpers (defined elsewhere) and defines add_sat/sub_sat as thin
- * wrappers around them. Instantiated for uchar, ushort, uint and ulong. */
-#define UDEF(TYPE)                                                              \
-OVERLOADABLE TYPE ocl_uadd_sat(TYPE x, TYPE y);                          \
-OVERLOADABLE TYPE ocl_usub_sat(TYPE x, TYPE y);                          \
-INLINE_OVERLOADABLE TYPE add_sat(TYPE x, TYPE y) { return ocl_uadd_sat(x, y); } \
-INLINE_OVERLOADABLE TYPE sub_sat(TYPE x, TYPE y) { return ocl_usub_sat(x, y); }
-UDEF(uchar);
-UDEF(ushort);
-UDEF(uint);
-UDEF(ulong);
-#undef UDEF
-
-/* Returns 1 when x is neither infinite nor NaN, 0 otherwise. */
-INLINE_OVERLOADABLE int isfinite(float x) {
-  union { uint u; float f; } bits;
-  bits.f = x;
-  /* sign bit removed; finite values sit below the all-ones exponent */
-  uint magnitude = bits.u & 0x7FFFFFFF;
-  return magnitude < 0x7F800000;
-}
-/* Returns 1 when x is +infinity or -infinity, 0 otherwise. */
-INLINE_OVERLOADABLE int isinf(float x) {
-  union { uint u; float f; } bits;
-  bits.f = x;
-  /* sign bit removed; infinity is exactly exponent all ones, mantissa zero */
-  uint magnitude = bits.u & 0x7FFFFFFF;
-  return magnitude == 0x7F800000;
-}
-/* Returns 1 when x does not compare equal to itself (NaN), 0 otherwise. */
-INLINE_OVERLOADABLE int isnan(float x) {
-  int differs_from_self = (x != x);
-  return differs_from_self;
-}
-/* Returns 1 when |x|'s bit pattern lies in [0x00800000, 0x7F800000),
- * i.e. x is not zero, subnormal, infinite or NaN. */
-INLINE_OVERLOADABLE int isnormal(float x) {
-  union { uint u; float f; } bits;
-  bits.f = x;
-  uint magnitude = bits.u & 0x7FFFFFFF;
-  return (magnitude >= 0x800000) && (magnitude < 0x7F800000);
-}
-/* True when both x and y compare equal to themselves. */
-INLINE_OVERLOADABLE int isordered(float x, float y) {
-  return isequal(x, x) && isequal(y, y);
-}
-/* True when x or y is NaN. */
-INLINE_OVERLOADABLE int isunordered(float x, float y) {
-  return isnan(x) || isnan(y);
-}
-/* Returns bit 31 of x's bit pattern (1 when the sign bit is set). */
-INLINE_OVERLOADABLE int signbit(float x) {
-  union { uint u; float f; } bits;
-  bits.f = x;
-  uint sign = bits.u >> 31;
-  return sign;
-}
-
-/* any(): non-zero when the value (scalar) or at least one component
- * (vector) is negative. DECn generates the overload for an n-component
- * type; DEC(n) instantiates it for charN, shortN, intN and longN. */
-#define DEC1(type) INLINE_OVERLOADABLE int any(type a) { return a<0; }
-#define DEC2(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0; }
-#define DEC3(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0; }
-#define DEC4(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0; }
-#define DEC8(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0; }
-#define DEC16(type) INLINE_OVERLOADABLE int any(type a) { return a.s0<0 || a.s1<0 || a.s2<0 || a.s3<0 || a.s4<0 || a.s5<0 || a.s6<0 || a.s7<0 || a.s8<0 || a.s9<0 || a.sA<0 || a.sB<0 || a.sC<0 || a.sD<0 || a.sE<0 || a.sF<0; }
-DEC1(char);
-DEC1(short);
-DEC1(int);
-DEC1(long);
-#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
-DEC(2);
-DEC(3);
-DEC(4);
-DEC(8);
-DEC(16);
-/* the generator macros are reused for all() below, so drop them here */
-#undef DEC
-#undef DEC1
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-/* all(): non-zero when the value (scalar) or every component (vector) is
- * negative. DECn generates the overload for an n-component type; DEC(n)
- * instantiates it for charN, shortN, intN and longN. */
-#define DEC1(type) INLINE_OVERLOADABLE int all(type a) { return a<0; }
-#define DEC2(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0; }
-#define DEC3(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0; }
-#define DEC4(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0; }
-#define DEC8(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0; }
-#define DEC16(type) INLINE_OVERLOADABLE int all(type a) { return a.s0<0 && a.s1<0 && a.s2<0 && a.s3<0 && a.s4<0 && a.s5<0 && a.s6<0 && a.s7<0 && a.s8<0 && a.s9<0 && a.sA<0 && a.sB<0 && a.sC<0 && a.sD<0 && a.sE<0 && a.sF<0; }
-DEC1(char);
-DEC1(short);
-DEC1(int);
-DEC1(long);
-#define DEC(n) DEC##n(char##n); DEC##n(short##n); DEC##n(int##n); DEC##n(long##n);
-DEC(2);
-DEC(3);
-DEC(4);
-DEC(8);
-DEC(16);
-#undef DEC
-#undef DEC1
-#undef DEC2
-#undef DEC3
-#undef DEC4
-#undef DEC8
-#undef DEC16
-
-/* bitselect(a, b, c): each result bit comes from b where the matching bit
- * of c is 1 and from a where it is 0. */
-#define DEF(type)                                                \
-INLINE_OVERLOADABLE type bitselect(type a, type b, type c) {     \
-  return (b & c) | (a & ~c);                                     \
-}
-DEF(char)
-DEF(uchar)
-DEF(short)
-DEF(ushort)
-DEF(int)
-DEF(uint)
-DEF(long)
-DEF(ulong)
-#undef DEF
-/* float overload: select on the raw bit patterns via the int overload */
-INLINE_OVERLOADABLE float bitselect(float a, float b, float c) {
-  int a_bits = as_int(a);
-  int b_bits = as_int(b);
-  int c_bits = as_int(c);
-  return as_float(bitselect(a_bits, b_bits, c_bits));
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Integer built-in functions
-/////////////////////////////////////////////////////////////////////////////
-/* Backend bit-scan helpers (defined elsewhere). The clz overloads below use
- * __gen_ocl_fbh(x) as the leading-zero count of a non-zero 32-bit value. */
-PURE CONST uint __gen_ocl_fbh(uint);
-PURE CONST uint __gen_ocl_fbl(uint);
-
-/* clz for 8-bit signed: negative -> 0, zero -> 8, otherwise the 32-bit
- * count minus the 24 extra leading bits introduced by widening. */
-INLINE_OVERLOADABLE char clz(char x) {
-  if (x < 0)
-    return 0;
-  return (x == 0) ? 8 : __gen_ocl_fbh(x) - 24;
-}
-
-/* clz for 8-bit unsigned: zero -> 8, otherwise 32-bit count minus 24. */
-INLINE_OVERLOADABLE uchar clz(uchar x) {
-  return (x == 0) ? 8 : __gen_ocl_fbh(x) - 24;
-}
-
-/* clz for 16-bit signed: negative -> 0, zero -> 16, otherwise the 32-bit
- * count minus the 16 extra leading bits introduced by widening. */
-INLINE_OVERLOADABLE short clz(short x) {
-  if (x < 0)
-    return 0;
-  return (x == 0) ? 16 : __gen_ocl_fbh(x) - 16;
-}
-
-/* clz for 16-bit unsigned: zero -> 16, otherwise 32-bit count minus 16. */
-INLINE_OVERLOADABLE ushort clz(ushort x) {
-  return (x == 0) ? 16 : __gen_ocl_fbh(x) - 16;
-}
-
-/* clz for 32-bit signed: negative -> 0, zero -> 32, otherwise the count
- * reported by __gen_ocl_fbh. */
-INLINE_OVERLOADABLE int clz(int x) {
-  if (x < 0)
-    return 0;
-  return (x == 0) ? 32 : __gen_ocl_fbh(x);
-}
-
-/* clz for 32-bit unsigned: zero -> 32, otherwise __gen_ocl_fbh(x). */
-INLINE_OVERLOADABLE uint clz(uint x) {
-  return (x == 0) ? 32 : __gen_ocl_fbh(x);
-}
-
-/* clz for 64-bit signed, built from the 32-bit overload applied to the two
- * halves. (Treats i[1] as the upper half -- assumes little-endian layout,
- * TODO confirm.) */
-INLINE_OVERLOADABLE long clz(long x) {
-  union { int i[2]; long x; } halves;
-  halves.x = x;
-  if (halves.i[1] & 0x80000000u)
-    return 0;                      /* sign bit set: no leading zeros */
-  if (halves.i[1] == 0 && halves.i[0] == 0)
-    return 64;                     /* all bits clear */
-  uint count = clz(halves.i[1]);
-  /* upper half empty: continue counting into the lower half */
-  return (count == 32) ? count + clz(halves.i[0]) : count;
-}
-
-/* clz for 64-bit unsigned, built from the 32-bit overload applied to the two
- * halves. (Treats i[1] as the upper half -- assumes little-endian layout,
- * TODO confirm.) */
-INLINE_OVERLOADABLE ulong clz(ulong x) {
-  if (x == 0)
-    return 64;
-  union { uint i[2]; ulong x; } halves;
-  halves.x = x;
-  uint count = clz(halves.i[1]);
-  /* upper half empty: continue counting into the lower half */
-  return (count == 32) ? count + clz(halves.i[0]) : count;
-}
-
-/* Backend helpers returning the high half of a 32x32 / 64x64 product
- * (defined elsewhere). */
-OVERLOADABLE int __gen_ocl_mul_hi(int x, int y);
-OVERLOADABLE uint __gen_ocl_mul_hi(uint x, uint y);
-OVERLOADABLE long __gen_ocl_mul_hi(long x, long y);
-OVERLOADABLE ulong __gen_ocl_mul_hi(ulong x, ulong y);
-/* mul_hi: upper half of the full-width product x * y.
- * 8- and 16-bit overloads compute the product in a wider type and shift;
- * 32- and 64-bit overloads forward to the backend helper. */
-INLINE_OVERLOADABLE char mul_hi(char x, char y) { return (x * y) >> 8; }
-INLINE_OVERLOADABLE uchar mul_hi(uchar x, uchar y) { return (x * y) >> 8; }
-INLINE_OVERLOADABLE short mul_hi(short x, short y) { return (x * y) >> 16; }
-/* ushort operands promote to signed int, and 0xFFFF * 0xFFFF does not fit
- * in int; multiply as uint so the product never overflows a signed type. */
-INLINE_OVERLOADABLE ushort mul_hi(ushort x, ushort y) { return ((uint)x * (uint)y) >> 16; }
-INLINE_OVERLOADABLE int mul_hi(int x, int y) { return __gen_ocl_mul_hi(x, y); }
-INLINE_OVERLOADABLE uint mul_hi(uint x, uint y) { return __gen_ocl_mul_hi(x, y); }
-INLINE_OVERLOADABLE long mul_hi(long x, long y) {
-  return __gen_ocl_mul_hi(x, y);
-}
-INLINE_OVERLOADABLE ulong mul_hi(ulong x, ulong y) {
-  return __gen_ocl_mul_hi(x, y);
-}
-
-/* mad_hi(a, b, c) = mul_hi(a, b) + c, generated for every integer type. */
-#define DEF(type) INLINE_OVERLOADABLE type mad_hi(type a, type b, type c) { return mul_hi(a, b) + c; }
-DEF(char)
-DEF(uchar)
-DEF(short)
-DEF(ushort)
-DEF(int)
-DEF(uint)
-DEF(long)
-DEF(ulong)
-#undef DEF
-
-/* mul24 (signed): shift each operand left then right by 8 so only its low
- * 24 bits (sign-extended) take part, then multiply. */
-INLINE_OVERLOADABLE int mul24(int a, int b) {
-  int a24 = (a << 8) >> 8;
-  int b24 = (b << 8) >> 8;
-  return a24 * b24;
-}
-/* mul24 (unsigned): multiply the low 24 bits of each operand. */
-INLINE_OVERLOADABLE uint mul24(uint a, uint b) {
-  uint a24 = a & 0xFFFFFF;
-  uint b24 = b & 0xFFFFFF;
-  return a24 * b24;
-}
-
-/* mad24(a, b, c) = mul24(a, b) + c */
-INLINE_OVERLOADABLE int mad24(int a, int b, int c) {
-  return mul24(a, b) + c;
-}
-INLINE_OVERLOADABLE uint mad24(uint a, uint b, uint c) {
-  return mul24(a, b) + c;
-}
-
-/* mad_sat for char: a * b + c evaluated in int, clamped to [-128, 127]. */
-INLINE_OVERLOADABLE char mad_sat(char a, char b, char c) {
-  int wide = (int)a * (int)b + (int)c;
-  if (wide > 127)
-    return 127;
-  if (wide < -128)
-    return -128;
-  return wide;
-}
-
-/* mad_sat for uchar: a * b + c evaluated in uint, clamped to 255. */
-INLINE_OVERLOADABLE uchar mad_sat(uchar a, uchar b, uchar c) {
-  uint wide = (uint)a * (uint)b + (uint)c;
-  return (wide > 255) ? 255 : wide;
-}
-
-/* mad_sat for short: a * b + c evaluated in int, clamped to [-32768, 32767]. */
-INLINE_OVERLOADABLE short mad_sat(short a, short b, short c) {
-  int wide = (int)a * (int)b + (int)c;
-  if (wide > 32767)
-    return 32767;
-  if (wide < -32768)
-    return -32768;
-  return wide;
-}
-
-/* mad_sat for ushort: a * b + c evaluated in uint, clamped to 65535. */
-INLINE_OVERLOADABLE ushort mad_sat(ushort a, ushort b, ushort c) {
-  uint wide = (uint)a * (uint)b + (uint)c;
-  return (wide > 65535) ? 65535 : wide;
-}
-
-/* mad_sat for int: a * b + c evaluated in long, clamped to the int range. */
-INLINE_OVERLOADABLE int mad_sat(int a, int b, int c) {
-  long wide = (long)a * (long)b + (long)c;
-  if (wide > 0x7FFFFFFF)
-    return (int)0x7FFFFFFF;
-  if (wide < -0x7FFFFFFF-1)
-    return (int)(-0x7FFFFFFF-1);
-  return (int)wide;
-}
-
-/* mad_sat for uint: a * b + c evaluated in ulong, clamped to 0xFFFFFFFF. */
-INLINE_OVERLOADABLE uint mad_sat(uint a, uint b, uint c) {
-  ulong wide = (ulong)a * (ulong)b + (ulong)c;
-  return (wide > 0xFFFFFFFFu) ? 0xFFFFFFFFu : (uint)wide;
-}
-
-/* 64-bit mad_sat: no wider type is available here, so both overloads
- * forward to the backend helper (defined elsewhere). */
-OVERLOADABLE long __gen_ocl_mad_sat(long a, long b, long c);
-OVERLOADABLE ulong __gen_ocl_mad_sat(ulong a, ulong b, ulong c);
-
-INLINE_OVERLOADABLE long mad_sat(long a, long b, long c) {
-  long saturated = __gen_ocl_mad_sat(a, b, c);
-  return saturated;
-}
-
-INLINE_OVERLOADABLE ulong mad_sat(ulong a, ulong b, ulong c) {
-  ulong saturated = __gen_ocl_mad_sat(a, b, c);
-  return saturated;
-}
-
-/* __rotate_left(x, y): rotate x left by y bits. Unsigned overloads combine a
- * left shift by y with a right shift by (width - y); signed overloads cast
- * to the unsigned type of the same width and reuse that overload.
- * NOTE(review): for y == 0 the 32/64-bit overloads shift right by the full
- * width; this presumably relies on OpenCL masking the shift count -- confirm. */
-INLINE_OVERLOADABLE uchar __rotate_left(uchar x, uchar y) { return (x << y) | (x >> (8 - y)); }
-INLINE_OVERLOADABLE char __rotate_left(char x, char y) { return __rotate_left((uchar)x, (uchar)y); }
-INLINE_OVERLOADABLE ushort __rotate_left(ushort x, ushort y) { return (x << y) | (x >> (16 - y)); }
-INLINE_OVERLOADABLE short __rotate_left(short x, short y) { return __rotate_left((ushort)x, (ushort)y); }
-INLINE_OVERLOADABLE uint __rotate_left(uint x, uint y) { return (x << y) | (x >> (32 - y)); }
-INLINE_OVERLOADABLE int __rotate_left(int x, int y) { return __rotate_left((uint)x, (uint)y); }
-INLINE_OVERLOADABLE ulong __rotate_left(ulong x, ulong y) { return (x << y) | (x >> (64 - y)); }
-INLINE_OVERLOADABLE long __rotate_left(long x, long y) { return __rotate_left((ulong)x, (ulong)y); }
-/* rotate(x, y): public entry point; the count is first reduced with the
- * mask m (type width - 1) and then passed to __rotate_left. */
-#define DEF(type, m) INLINE_OVERLOADABLE type rotate(type x, type y) { return __rotate_left(x, (type)(y & m)); }
-DEF(char, 7)
-DEF(uchar, 7)
-DEF(short, 15)
-DEF(ushort, 15)
-DEF(int, 31)
-DEF(uint, 31)
-DEF(long, 63)
-DEF(ulong, 63)
-#undef DEF
-
-/* Backend helpers (defined elsewhere) that combine a high and a low part;
- * both parts are passed already widened to the result width. */
-OVERLOADABLE short __gen_ocl_upsample(short hi, short lo);
-OVERLOADABLE int __gen_ocl_upsample(int hi, int lo);
-OVERLOADABLE long __gen_ocl_upsample(long hi, long lo);
-/* upsample(hi, lo): widen both halves to the signed type of twice the
- * width and forward to __gen_ocl_upsample; the unsigned overloads reuse the
- * signed helper and convert the result on return. */
-INLINE_OVERLOADABLE short upsample(char hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
-INLINE_OVERLOADABLE ushort upsample(uchar hi, uchar lo) { return __gen_ocl_upsample((short)hi, (short)lo); }
-INLINE_OVERLOADABLE int upsample(short hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
-INLINE_OVERLOADABLE uint upsample(ushort hi, ushort lo) { return __gen_ocl_upsample((int)hi, (int)lo); }
-INLINE_OVERLOADABLE long upsample(int hi, uint lo) {
-  return __gen_ocl_upsample((long)hi, (long)lo);
-}
-INLINE_OVERLOADABLE ulong upsample(uint hi, uint lo) {
-  return __gen_ocl_upsample((long)hi, (long)lo);
-}
-
-/* Backend helpers for the 32-bit unsigned halving adds (defined elsewhere). */
-OVERLOADABLE uint __gen_ocl_hadd(uint x, uint y);
-OVERLOADABLE uint __gen_ocl_rhadd(uint x, uint y);
-/* 8- and 16-bit hadd/rhadd: the operands promote to int, so the sum cannot
- * overflow and a plain shift suffices. DEC instantiates whatever DEF
- * currently expands to for char, uchar, short and ushort. */
-#define DEC DEF(char); DEF(uchar); DEF(short); DEF(ushort)
-/* hadd: (x + y) >> 1 */
-#define DEF(type) INLINE_OVERLOADABLE type hadd(type x, type y) { return (x + y) >> 1; }
-DEC
-#undef DEF
-/* rhadd: (x + y + 1) >> 1 */
-#define DEF(type) INLINE_OVERLOADABLE type rhadd(type x, type y) { return (x + y + 1) >> 1; }
-DEC
-#undef DEF
-#undef DEC
-/* hadd(int): (x + y) >> 1 without intermediate overflow.
- * When exactly one operand is negative the sum cannot overflow, so it is
- * computed directly. Testing the signs (rather than x > 0 / y > 0) also
- * routes a zero paired with a negative value through the direct path;
- * previously such a pair went to the uint helper, e.g. hadd(0, -1).
- * Operands of equal sign are forwarded to __gen_ocl_hadd on their unsigned
- * bit patterns. NOTE(review): assumes the helper averages without losing
- * the carry -- confirm against the backend. */
-INLINE_OVERLOADABLE int hadd(int x, int y) {
-  return ((x < 0) != (y < 0)) ?
-         ((x + y) >> 1) :
-         __gen_ocl_hadd((uint)x, (uint)y);
-}
-/* hadd(uint): forwarded to the backend helper. */
-INLINE_OVERLOADABLE uint hadd(uint x, uint y) { return __gen_ocl_hadd(x, y); }
-/* rhadd(int): (x + y + 1) >> 1 without intermediate overflow.
- * When exactly one operand is negative, x + y + 1 cannot overflow, so it is
- * computed directly. Testing the signs (rather than x > 0 / y > 0) also
- * routes a zero paired with a negative value through the direct path;
- * previously such a pair went to the uint helper, e.g. rhadd(0, -1).
- * Operands of equal sign are forwarded to __gen_ocl_rhadd on their unsigned
- * bit patterns. NOTE(review): assumes the helper averages without losing
- * the carry -- confirm against the backend. */
-INLINE_OVERLOADABLE int rhadd(int x, int y) {
-  return ((x < 0) != (y < 0)) ?
-         ((x + y + 1) >> 1) :
-         __gen_ocl_rhadd((uint)x, (uint)y);
-}
-/* rhadd(uint): forwarded to the backend helper. */
-INLINE_OVERLOADABLE uint rhadd(uint x, uint y) { return __gen_ocl_rhadd(x, y); }
-/* Backend helpers for the 64-bit unsigned halving adds (defined elsewhere). */
-OVERLOADABLE ulong __gen_ocl_hadd(ulong x, ulong y);
-OVERLOADABLE ulong __gen_ocl_rhadd(ulong x, ulong y);
-/* hadd(long): (x + y) >> 1 without intermediate overflow.
- * When exactly one operand is negative the sum cannot overflow, so it is
- * computed directly. Testing the signs (rather than x > 0 / y > 0) also
- * routes a zero paired with a negative value through the direct path;
- * previously such a pair went to the ulong helper, e.g. hadd(0, -1).
- * Operands of equal sign are forwarded to __gen_ocl_hadd on their unsigned
- * bit patterns. NOTE(review): assumes the helper averages without losing
- * the carry -- confirm against the backend. */
-INLINE_OVERLOADABLE long hadd(long x, long y) {
-  return ((x < 0) != (y < 0)) ?
-         ((x + y) >> 1) :
-         __gen_ocl_hadd((ulong)x, (ulong)y);
-}
-/* hadd(ulong): forwarded to the backend helper. */
-INLINE_OVERLOADABLE ulong hadd(ulong x, ulong y) {
-  return __gen_ocl_hadd(x, y);
-}
-/* rhadd(long): (x + y + 1) >> 1 without intermediate overflow.
- * When exactly one operand is negative, x + y + 1 cannot overflow, so it is
- * computed directly. Testing the signs (rather than x > 0 / y > 0) also
- * routes a zero paired with a negative value through the direct path;
- * previously such a pair went to the ulong helper, e.g. rhadd(0, -1).
- * Operands of equal sign are forwarded to __gen_ocl_rhadd on their unsigned
- * bit patterns. NOTE(review): assumes the helper averages without losing
- * the carry -- confirm against the backend. */
-INLINE_OVERLOADABLE long rhadd(long x, long y) {
-  return ((x < 0) != (y < 0)) ?
-         ((x + y + 1) >> 1) :
-         __gen_ocl_rhadd((ulong)x, (ulong)y);
-}
-/* rhadd(ulong): forwarded to the backend helper. */
-INLINE_OVERLOADABLE ulong rhadd(ulong x, ulong y) {
-  return __gen_ocl_rhadd(x, y);
-}
-
-/* Backend absolute-value helper for int (defined elsewhere). */
-int __gen_ocl_abs(int x);
-/* abs for int/short/char: forward to __gen_ocl_abs (argument widened to
- * int) and return the result as the unsigned type of the same width. */
-#define DEC(TYPE) INLINE_OVERLOADABLE u##TYPE abs(TYPE x) { return (u##TYPE) __gen_ocl_abs(x); }
-DEC(int)
-DEC(short)
-DEC(char)
-#undef DEC
-/* abs for long: computed inline, result returned as ulong. */
-INLINE_OVERLOADABLE ulong abs(long x) { return x < 0 ? -x : x; }
-/* For unsigned types, do nothing. */
-#define DEC(TYPE) INLINE_OVERLOADABLE TYPE abs(TYPE x) { return x; }
-DEC(uint)
-DEC(ushort)
-DEC(uchar)
-DEC(ulong)
-#undef DEC
-
-/* abs_diff for the char and short types. */
-/* Both operands are promoted to int first, so the subtraction cannot
- * overflow; the absolute difference is then narrowed to UTYPE. */
-#define DEC(TYPE, UTYPE) INLINE_OVERLOADABLE UTYPE abs_diff(TYPE x, TYPE y) \
-                         { return (UTYPE) (abs((int)x - (int)y)); }
-DEC(char, uchar)
-DEC(uchar, uchar)
-DEC(short, ushort)
-DEC(ushort, ushort)
-#undef DEC
-
-/* abs_diff for uint: subtract the smaller operand from the larger one, so
- * the unsigned subtraction never wraps. */
-INLINE_OVERLOADABLE uint abs_diff (uint x, uint y) {
-    if (y > x)
-        return y - x;
-    return x - y;
-}
-
-/* abs_diff for int, result as uint.
- * If the operands lie strictly on opposite sides of zero the difference may
- * not fit in int, so the two magnitudes are added as unsigned values.
- * Otherwise (same sign, or either is zero) x - y cannot overflow. */
-INLINE_OVERLOADABLE uint abs_diff (int x, int y) {
-    int opposite_signs = (x < 0 || y < 0) && (x > 0 || y > 0);
-    if (opposite_signs)
-        return (abs(x) + abs(y));
-
-    return abs(x - y);
-}
-
-/* abs_diff for long, result as ulong.
- * If the operands lie strictly on opposite sides of zero the two magnitudes
- * are added as unsigned values; otherwise x - y cannot overflow. */
-INLINE_OVERLOADABLE ulong abs_diff (long x, long y) {
-  int opposite_signs = (x < 0 || y < 0) && (x > 0 || y > 0);
-  if (opposite_signs)
-    return abs(x) + abs(y);
-  return abs(x - y);
-}
-/* abs_diff for ulong: subtract the smaller operand from the larger one. */
-INLINE_OVERLOADABLE ulong abs_diff (ulong x, ulong y) {
-  if (y > x)
-    return y - x;
-  return x - y;
-}
-
-
-/////////////////////////////////////////////////////////////////////////////
-// SIMD level function
-/////////////////////////////////////////////////////////////////////////////
-/* Backend-provided SIMD-level predicates (defined elsewhere).
- * NOTE(review): presumably reduce a per-lane condition with any/all
- * semantics across the SIMD lanes -- confirm against the backend. */
-short __gen_ocl_simd_any(short);
-short __gen_ocl_simd_all(short);
-
-
-/////////////////////////////////////////////////////////////////////////////
-// Work Items functions (see 6.11.1 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-
-/* get_work_dim(): thin wrapper returning the value supplied by the backend
- * helper __gen_ocl_get_work_dim (defined elsewhere). */
-PURE CONST uint __gen_ocl_get_work_dim(void);
-INLINE uint get_work_dim(void) {
-  return __gen_ocl_get_work_dim();
-}
-
-/* DECL_INTERNAL_WORK_ITEM_FN(NAME): declares the three per-dimension backend
- * helpers __gen_ocl_<NAME>0, __gen_ocl_<NAME>1 and __gen_ocl_<NAME>2
- * (defined elsewhere), one for each of dimensions 0, 1 and 2. */
-#define DECL_INTERNAL_WORK_ITEM_FN(NAME) \
-PURE CONST unsigned int __gen_ocl_##NAME##0(void); \
-PURE CONST unsigned int __gen_ocl_##NAME##1(void); \
-PURE CONST unsigned int __gen_ocl_##NAME##2(void);
-DECL_INTERNAL_WORK_ITEM_FN(get_group_id)
-DECL_INTERNAL_WORK_ITEM_FN(get_local_id)
-DECL_INTERNAL_WORK_ITEM_FN(get_local_size)
-DECL_INTERNAL_WORK_ITEM_FN(get_global_size)
-DECL_INTERNAL_WORK_ITEM_FN(get_global_offset)
-DECL_INTERNAL_WORK_ITEM_FN(get_num_groups)
-#undef DECL_INTERNAL_WORK_ITEM_FN
-
-/* DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET): defines the public work-item
- * query NAME(dim). For dim 0, 1 or 2 it dispatches to the matching
- * __gen_ocl_<NAME><dim> helper; for any other dim it returns OTHER_RET
- * (0 for the id/offset queries, 1 for the size/count queries below). */
-#define DECL_PUBLIC_WORK_ITEM_FN(NAME, OTHER_RET)    \
-INLINE unsigned NAME(unsigned int dim) {             \
-  if (dim == 0) return __gen_ocl_##NAME##0();        \
-  else if (dim == 1) return __gen_ocl_##NAME##1();   \
-  else if (dim == 2) return __gen_ocl_##NAME##2();   \
-  else return OTHER_RET;                             \
-}
-
-DECL_PUBLIC_WORK_ITEM_FN(get_group_id, 0)
-DECL_PUBLIC_WORK_ITEM_FN(get_local_id, 0)
-DECL_PUBLIC_WORK_ITEM_FN(get_local_size, 1)
-DECL_PUBLIC_WORK_ITEM_FN(get_global_size, 1)
-DECL_PUBLIC_WORK_ITEM_FN(get_global_offset, 0)
-DECL_PUBLIC_WORK_ITEM_FN(get_num_groups, 1)
-#undef DECL_PUBLIC_WORK_ITEM_FN
-
-/* get_global_id(dim): derived from the other work-item queries as
- * local_id + local_size * group_id + global_offset. */
-INLINE uint get_global_id(uint dim) {
-  uint group_base = get_local_size(dim) * get_group_id(dim);
-  return get_local_id(dim) + group_base + get_global_offset(dim);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Math Functions (see 6.11.2 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-/* Backend-provided float primitives (defined elsewhere). The rnd* names
- * presumably denote round toward zero / to even / up / down -- confirm
- * against the backend. */
-PURE CONST float __gen_ocl_fabs(float x);
-PURE CONST float __gen_ocl_sin(float x);
-PURE CONST float __gen_ocl_cos(float x);
-PURE CONST float __gen_ocl_sqrt(float x);
-PURE CONST float __gen_ocl_rsqrt(float x);
-PURE CONST float __gen_ocl_log(float x);
-PURE CONST float __gen_ocl_exp(float x);
-PURE CONST float __gen_ocl_pow(float x, float y);
-PURE CONST float __gen_ocl_rcp(float x);
-PURE CONST float __gen_ocl_rndz(float x);
-PURE CONST float __gen_ocl_rnde(float x);
-PURE CONST float __gen_ocl_rndu(float x);
-PURE CONST float __gen_ocl_rndd(float x);
-/* Internal floor: implemented directly by the __gen_ocl_rndd primitive. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_floor(float x) { return __gen_ocl_rndd(x); }
-/* Returns a float with the magnitude bits of x and the sign bit of y. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_copysign(float x, float y) {
-  union { unsigned u; float f; } mag, sgn;
-  mag.f = x;
-  sgn.f = y;
-  unsigned magnitude_bits = mag.u & 0x7fffffff;
-  unsigned sign_bit = sgn.u & 0x80000000u;
-  mag.u = magnitude_bits | sign_bit;
-  return mag.f;
-}
-
-/* Natural logarithm of a float (software implementation; see the notice
- * inside for its origin).
- * Special cases visible in the code: +-0 -> -inf, negative x -> NaN,
- * positive subnormal x -> -inf (subnormals are not scaled up here),
- * +inf or NaN -> x + x. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_log(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  union { unsigned int i; float f; } u;
-  const float
-  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =    3.355443200e+07, /* 0x4c000000 */
-  Lg1 = 6.6666668653e-01, /* 3F2AAAAB */
-  Lg2 = 4.0000000596e-01, /* 3ECCCCCD */
-  Lg3 = 2.8571429849e-01, /* 3E924925 */
-  Lg4 = 2.2222198546e-01, /* 3E638E29 */
-  Lg5 = 1.8183572590e-01, /* 3E3A3325 */
-  Lg6 = 1.5313838422e-01, /* 3E1CD04F */
-  Lg7 = 1.4798198640e-01; /* 3E178897 */
-
-  const float zero   =  0.0;
-  float hfsq,f,s,z,R,w,t1,t2,dk;
-  int k,ix,i,j;
-
-  /* ix = raw bit pattern of x, interpreted as a signed int */
-  u.f = x;  ix = u.i;
-  k=0;
-  if (ix < 0x00800000) {      /* x < 2**-126  */
-      if ((ix&0x7fffffff)==0)
-    return -two25/zero;   /* log(+-0)=-inf */
-      if (ix<0) return (x-x)/zero;  /* log(-#) = NaN */
-      return -INFINITY;  /* Gen does not support subnormal number now */
-      //k -= 25; x *= two25; /* subnormal number, scale up x */
-      //u.f = x;  ix = u.i;
-  }
-  if (ix >= 0x7f800000) return x+x;
-  /* k = unbiased exponent; ix keeps only the mantissa bits */
-  k += (ix>>23)-127;
-  ix &= 0x007fffff;
-  i = (ix+(0x95f64<<3))&0x800000;
-  u.i = ix|(i^0x3f800000); x = u.f;
-  k += (i>>23);
-  f = x-(float)1.0;
-  if((0x007fffff&(15+ix))<16) { /* |f| < 2**-20 */
-      if(f==zero) {
-        if(k==0) return zero;
-        else {
-          dk=(float)k; return dk*ln2_hi+dk*ln2_lo;
-        }
-      }
-      R = f*f*((float)0.5-(float)0.33333333333333333*f);
-      if(k==0)
-        return f-R;
-      else {
-        dk=(float)k;  return dk*ln2_hi-((R-dk*ln2_lo)-f);
-      }
-  }
-  /* general case: polynomial in s = f/(2+f), split into even/odd terms */
-  s = f/((float)2.0+f);
-  dk = (float)k;
-  z = s*s;
-  i = ix-(0x6147a<<3);
-  w = z*z;
-  j = (0x6b851<<3)-ix;
-  t1= w*(Lg2+w*(Lg4+w*Lg6));
-  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
-  i |= j;
-  R = t2+t1;
-  if(i>0) {
-      hfsq=(float)0.5*f*f;
-      if(k==0) return f-(hfsq-s*(hfsq+R)); else
-         return dk*ln2_hi-((hfsq-(s*(hfsq+R)+dk*ln2_lo))-f);
-  } else {
-      if(k==0) return f-s*(f-R); else
-         return dk*ln2_hi-((s*(f-R)-dk*ln2_lo)-f);
-  }
-}
-
-/* Base-10 logarithm of a float, computed from the exponent k and
- * __gen_ocl_internal_log of the rescaled mantissa.
- * Special cases visible in the code: +-0 -> -inf, negative x -> NAN,
- * positive subnormal x -> -inf (subnormals are not scaled up here),
- * +inf or NaN -> x + x. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_log10(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  union {float f; unsigned i; }u;
-  const float
-  zero       = 0.0,
-  two25      =  3.3554432000e+07, /* 0x4c000000 */
-  ivln10     =  4.3429449201e-01, /* 0x3ede5bd9 */
-  log10_2hi  =  3.0102920532e-01, /* 0x3e9a2080 */
-  log10_2lo  =  7.9034151668e-07; /* 0x355427db */
-
-  float y,z;
-  int i,k,hx;
-
-  /* hx = raw bit pattern of x, interpreted as a signed int */
-  u.f = x; hx = u.i;
-  k=0;
-  if (hx < 0x00800000) {                  /* x < 2**-126  */
-    if ((hx&0x7fffffff)==0)
-      return -two25/zero;             /* log(+-0)=-inf */
-    if (hx<0) return NAN;        /* log(-#) = NaN */
-    return -INFINITY;      /* Gen does not support subnormal now */
-    //k -= 25; x *= two25; /* subnormal number, scale up x */
-    //u.f = x; hx = u.i;
-  }
-  if (hx >= 0x7f800000) return x+x;
-  k += (hx>>23)-127;
-  /* i = 1 when k is negative, else 0; it is folded into both the exponent
-   * written back into hx and the integer part y */
-  i  = ((unsigned)k&0x80000000)>>31;
-  hx = (hx&0x007fffff)|((0x7f-i)<<23);
-  y  = (float)(k+i);
-  u.i = hx; x = u.f;
-  z  = y*log10_2lo + ivln10*__gen_ocl_internal_log(x);
-  return  z+y*log10_2hi;
-}
-
-/* Base-2 logarithm of a float (software implementation; see the notice
- * inside for its origin).
- * Special cases: +-0 -> -inf, negative x -> NaN, positive subnormal
- * x -> -inf (subnormals are not scaled up), +inf or NaN -> x + x. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_log2(float x) {
-/*
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- *  adapted for log2 by Ulrich Drepper <drepper at cygnus.com>
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  const float zero   =  0.0,
-  ln2 = 0.69314718055994530942,
-  two25 =    3.355443200e+07, /** 0x4c000000 */
-  Lg1 = 6.6666668653e-01, /** 3F2AAAAB */
-  Lg2 = 4.0000000596e-01, /** 3ECCCCCD */
-  Lg3 = 2.8571429849e-01, /** 3E924925 */
-  Lg4 = 2.2222198546e-01, /** 3E638E29 */
-  Lg5 = 1.8183572590e-01, /** 3E3A3325 */
-  Lg6 = 1.5313838422e-01, /** 3E1CD04F */
-  Lg7 = 1.4798198640e-01; /** 3E178897 */
-
-  float hfsq,f,s,z,R,w,t1,t2,dk;
-  int k,ix,i,j;
-
-  /* ix = raw bit pattern of x, interpreted as a signed int */
-  union {float f; int i; }u;
-  u.f = x; ix = u.i;
-
-  k=0;
-  if (ix < 0x00800000) {           /** x < 2**-126  */
-      if ((ix&0x7fffffff)==0)
-      return -two25/(x-x);        /** log(+-0)=-inf */
-
-      if (ix<0) return (x-x)/(x-x);    /** log(-#) = NaN */
-      /* Positive subnormal input is not scaled up: return -inf directly.
-       * (The scaling statements that used to follow this return were
-       * unreachable and have been removed.) */
-      return -INFINITY;
-  }
-
-  if (ix >= 0x7f800000) return x+x;
-
-  /* k = unbiased exponent; ix keeps only the mantissa bits */
-  k += (ix>>23)-127;
-  ix &= 0x007fffff;
-  i = (ix+(0x95f64<<3))&0x800000;
-
-  u.i = ix|(i^0x3f800000); x = u.f;    /** normalize x or x/2 */
-  k += (i>>23);
-  dk = (float)k;
-  f = x-(float)1.0;
-
-  if((0x007fffff&(15+ix))<16) {    /** |f| < 2**-20 */
-      if(f==zero) return dk;
-
-      R = f*f*((float)0.5-(float)0.33333333333333333*f);
-      return dk-(R-f)/ln2;
-  }
-
-  /* general case: polynomial in s = f/(2+f), split into even/odd terms */
-  s = f/((float)2.0+f);
-  z = s*s;
-  i = ix-(0x6147a<<3);
-  w = z*z;
-  j = (0x6b851<<3)-ix;
-  t1= w*(Lg2+w*(Lg4+w*Lg6));
-  t2= z*(Lg1+w*(Lg3+w*(Lg5+w*Lg7)));
-  i |= j;
-  R = t2+t1;
-
-  if(i>0) {
-      hfsq=(float)0.5*f*f;
-      return dk-((hfsq-(s*(hfsq+R)))-f)/ln2;
-  } else {
-      return dk-((s*(f-R))-f)/ln2;
-  }
-}
-
-/* Scales x by 2**n by adjusting the exponent field of its bit pattern.
- * Returns x unchanged for +-0, x + x for NaN/Inf, a signed underflow value
- * (tiny*tiny) when the result is too small and a signed overflow value
- * (huge*huge) when it is too large. */
-INLINE float __gen_ocl_scalbnf (float x, int n){
-  /* copy from fdlibm */
-  float two25 = 3.355443200e+07,	/* 0x4c000000 */
-  twom25 = 2.9802322388e-08,	        /* 0x33000000 */
-  huge = 1.0e+30,
-  tiny = 1.0e-30;
-  int k,ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  k = (ix&0x7f800000)>>23; /* extract exponent */
-  if (k==0) {	/* 0 or subnormal x */
-    if ((ix&0x7fffffff)==0) return x; /* +-0 */
-    /* scale the subnormal up by 2**25 and compensate in k */
-    x *= two25;
-    GEN_OCL_GET_FLOAT_WORD(ix,x);
-    k = ((ix&0x7f800000)>>23) - 25;
-  }
-  if (k==0xff) return x+x;	/* NaN or Inf */
-  if (n< -50000)
-    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
-  if (n> 50000 || k+n > 0xfe)
-    return huge*__gen_ocl_internal_copysign(huge,x); /* overflow  */
-  /* Now k and n are bounded we know that k = k+n does not overflow. */
-  k = k+n;
-  if (k > 0) { /* normal result */
-    GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
-    return x;
-  }
-  if (k <= -25)
-    return tiny*__gen_ocl_internal_copysign(tiny,x);	/*underflow*/
-  /* build the value with exponent k+25, then multiply by 2**-25 */
-  k += 25;				/* subnormal result */
-  GEN_OCL_SET_FLOAT_WORD(x,(ix&0x807fffff)|(k<<23));
-  return x*twom25;
-}
-
-
-
-/* Coefficient table multiplied against q[] in the final accumulation of
- * __kernel_rem_pio2f. Judging by the name and values, the entries are
- * successive pieces of pi/2 in decreasing magnitude -- confirm against
- * fdlibm. The hex comments give each entry's float bit pattern. */
-__constant const float PIo2[] = {
-  1.5703125000e+00, /* 0x3fc90000 */
-  4.5776367188e-04, /* 0x39f00000 */
-  2.5987625122e-05, /* 0x37da0000 */
-  7.5437128544e-08, /* 0x33a20000 */
-  6.0026650317e-11, /* 0x2e840000 */
-  7.3896444519e-13, /* 0x2b500000 */
-  5.3845816694e-15, /* 0x27c20000 */
-  5.6378512969e-18, /* 0x22d00000 */
-  8.3009228831e-20, /* 0x1fc40000 */
-  3.2756352257e-22, /* 0x1bc60000 */
-  6.3331015649e-25, /* 0x17440000 */
-};
-
-/* Payne-Hanek style argument reduction kernel (copied from fdlibm, per the
- * comment below).
- *   x     - input array; elements x[0..nx-1] are read
- *   y     - output array; y[0] (prec 0), y[0..1] (prec 1, 2) or y[0..2]
- *           (prec 3) receive the reduced value
- *   e0    - exponent used to pick the starting position in ipio2 and the
- *           scaling q0
- *   nx    - number of elements in x
- *   prec  - precision selector; indexes init_jk and drives the final switch
- *   ipio2 - integer table read as ipio2[j]; presumably the 8-bit chunks of
- *           2/pi -- confirm against the caller
- * Returns n & 7, the low three bits of the computed integer part.
- * NOTE(review): init_jk has three entries while the final switch also has a
- * case 3; prec == 3 would read past init_jk -- confirm callers never pass 3. */
-INLINE int __kernel_rem_pio2f(float *x, float *y, int e0, int nx, int prec, const __constant int *ipio2)
-{
-  /* copied from fdlibm */
-const float
-zero   = 0.0,
-one    = 1.0,
-two8   =  2.5600000000e+02, /* 0x43800000 */
-twon8  =  3.9062500000e-03; /* 0x3b800000 */
-
-  int init_jk[3]; /* initial value for jk */
-  int jz,jx,jv,jp,jk,carry,n,iq[20],i,j,k,m,q0,ih;
-  float z,fw,f[20],fq[20],q[20];
-  init_jk[0] = 4; init_jk[1] = 7; init_jk[2] = 9;
-    /* initialize jk*/
-  jk = init_jk[prec];
-  jp = jk;
-
-    /* determine jx,jv,q0, note that 3>q0 */
-  jx =  nx-1;
-  jv = (e0-3)/8; if(jv<0) jv=0;
-  q0 =  e0-8*(jv+1);
-
-    /* set up f[0] to f[jx+jk] where f[jx+jk] = ipio2[jv+jk] */
-  j = jv-jx; m = jx+jk;
-  for(i=0;i<=m;i++,j++) f[i] = (j<0)? zero : (float) ipio2[j];
-
-    /* compute q[0],q[1],...q[jk] */
-  for (i=0;i<=jk;i++) {
-      /* note: only the fw accumulation is the inner loop body; q[i] = fw runs once per i */
-      for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j]; q[i] = fw;
-  }
-
-  jz = jk;
-recompute:
-    /* distill q[] into iq[] reversingly */
-  for(i=0,j=jz,z=q[jz];j>0;i++,j--) {
-      fw    =  (float)((int)(twon8* z));
-      iq[i] =  (int)(z-two8*fw);
-      z     =  q[j-1]+fw;
-  }
-
-    /* compute n */
-  z  = __gen_ocl_scalbnf(z,q0);   /* actual value of z */
-  z -= (float)8.0*__gen_ocl_internal_floor(z*(float)0.125); /* trim off integer >= 8 */
-  n  = (int) z;
-  z -= (float)n;
-  ih = 0;
-  if(q0>0) {  /* need iq[jz-1] to determine n */
-      i  = (iq[jz-1]>>(8-q0)); n += i;
-      iq[jz-1] -= i<<(8-q0);
-      ih = iq[jz-1]>>(7-q0);
-  }
-  else if(q0==0) ih = iq[jz-1]>>8;
-  else if(z>=(float)0.5) ih=2;
-
-  if(ih>0) {  /* q > 0.5 */
-      n += 1; carry = 0;
-      for(i=0;i<jz ;i++) {  /* compute 1-q */
-    j = iq[i];
-    if(carry==0) {
-        if(j!=0) {
-      carry = 1; iq[i] = 0x100- j;
-        }
-    } else  iq[i] = 0xff - j;
-      }
-      if(q0>0) {    /* rare case: chance is 1 in 12 */
-          switch(q0) {
-          case 1:
-           iq[jz-1] &= 0x7f; break;
-        case 2:
-           iq[jz-1] &= 0x3f; break;
-          }
-      }
-      if(ih==2) {
-    z = one - z;
-    if(carry!=0) z -= __gen_ocl_scalbnf(one,q0);
-      }
-  }
-
-    /* check if recomputation is needed */
-  if(z==zero) {
-      j = 0;
-      for (i=jz-1;i>=jk;i--) j |= iq[i];
-      if(j==0) { /* need recomputation */
-    for(k=1;iq[jk-k]==0;k++);   /* k = no. of terms needed */
-
-    for(i=jz+1;i<=jz+k;i++) {   /* add q[jz+1] to q[jz+k] */
-        f[jx+i] = (float) ipio2[jv+i];
-        for(j=0,fw=0.0;j<=jx;j++) fw += x[j]*f[jx+i-j];
-        q[i] = fw;
-    }
-    jz += k;
-    goto recompute;
-      }
-  }
-
-    /* chop off zero terms */
-  if(z==(float)0.0) {
-      jz -= 1; q0 -= 8;
-      while(iq[jz]==0) { jz--; q0-=8;}
-  } else { /* break z into 8-bit if necessary */
-      z = __gen_ocl_scalbnf(z,-q0);
-      if(z>=two8) {
-    fw = (float)((int)(twon8*z));
-    iq[jz] = (int)(z-two8*fw);
-    jz += 1; q0 += 8;
-    iq[jz] = (int) fw;
-      } else iq[jz] = (int) z ;
-  }
-
-    /* convert integer "bit" chunk to floating-point value */
-  fw = __gen_ocl_scalbnf(one,q0);
-  for(i=jz;i>=0;i--) {
-      q[i] = fw*(float)iq[i]; fw*=twon8;
-  }
-
-    /* compute PIo2[0,...,jp]*q[jz,...,0] */
-  for(i=jz;i>=0;i--) {
-      for(fw=0.0,k=0;k<=jp&&k<=jz-i;k++) fw += PIo2[k]*q[i+k];
-      fq[jz-i] = fw;
-  }
-
-    /* compress fq[] into y[] */
-    /* ih != 0 means the result was reflected above, so the sign is flipped */
-  switch(prec) {
-      case 0:
-    fw = 0.0;
-    for (i=jz;i>=0;i--) fw += fq[i];
-    y[0] = (ih==0)? fw: -fw;
-    break;
-      case 1:
-      case 2:
-    fw = 0.0;
-    for (i=jz;i>=0;i--) fw += fq[i];
-    y[0] = (ih==0)? fw: -fw;
-    fw = fq[0]-fw;
-    for (i=1;i<=jz;i++) fw += fq[i];
-    y[1] = (ih==0)? fw: -fw;
-    break;
-      case 3: /* painful */
-    for (i=jz;i>0;i--) {
-        fw      = fq[i-1]+fq[i];
-        fq[i]  += fq[i-1]-fw;
-        fq[i-1] = fw;
-    }
-    for (i=jz;i>1;i--) {
-        fw      = fq[i-1]+fq[i];
-        fq[i]  += fq[i-1]-fw;
-        fq[i-1] = fw;
-    }
-    for (fw=0.0,i=jz;i>=2;i--) fw += fq[i];
-    if(ih==0) {
-        y[0] =  fq[0]; y[1] =  fq[1]; y[2] =  fw;
-    } else {
-        y[0] = -fq[0]; y[1] = -fq[1]; y[2] = -fw;
-    }
-  }
-  return n&7;
-
-}
-/* 32-entry table compared in __ieee754_rem_pio2f against |x|'s float word
- * with the low 8 bits masked off (entry n-1 for multiple n).
- * NOTE(review): the values look like the float bit patterns of n*pi/2 with
- * the low 8 bits cleared -- confirm against fdlibm. */
-__constant const int npio2_hw[32] = {
-0x3fc90f00, 0x40490f00, 0x4096cb00, 0x40c90f00, 0x40fb5300, 0x4116cb00,
-0x412fed00, 0x41490f00, 0x41623100, 0x417b5300, 0x418a3a00, 0x4196cb00,
-0x41a35c00, 0x41afed00, 0x41bc7e00, 0x41c90f00, 0x41d5a000, 0x41e23100,
-0x41eec200, 0x41fb5300, 0x4203f200, 0x420a3a00, 0x42108300, 0x4216cb00,
-0x421d1400, 0x42235c00, 0x4229a500, 0x422fed00, 0x42363600, 0x423c7e00,
-0x4242c700, 0x42490f00
-};
-
-/* 22 rows of 9 entries, each entry an 8-bit value stored in an int.
- * NOTE(review): judging by the name, these are the successive 8-bit chunks
- * of the binary expansion of 2/pi, presumably passed as the ipio2 argument
- * of __kernel_rem_pio2f -- confirm against the caller. */
-__constant const int two_over_pi[22*9] = {
-0xA2, 0xF9, 0x83, 0x6E, 0x4E, 0x44, 0x15, 0x29, 0xFC,
-0x27, 0x57, 0xD1, 0xF5, 0x34, 0xDD, 0xC0, 0xDB, 0x62,
-0x95, 0x99, 0x3C, 0x43, 0x90, 0x41, 0xFE, 0x51, 0x63,
-0xAB, 0xDE, 0xBB, 0xC5, 0x61, 0xB7, 0x24, 0x6E, 0x3A,
-0x42, 0x4D, 0xD2, 0xE0, 0x06, 0x49, 0x2E, 0xEA, 0x09,
-0xD1, 0x92, 0x1C, 0xFE, 0x1D, 0xEB, 0x1C, 0xB1, 0x29,
-0xA7, 0x3E, 0xE8, 0x82, 0x35, 0xF5, 0x2E, 0xBB, 0x44,
-0x84, 0xE9, 0x9C, 0x70, 0x26, 0xB4, 0x5F, 0x7E, 0x41,
-0x39, 0x91, 0xD6, 0x39, 0x83, 0x53, 0x39, 0xF4, 0x9C,
-0x84, 0x5F, 0x8B, 0xBD, 0xF9, 0x28, 0x3B, 0x1F, 0xF8,
-0x97, 0xFF, 0xDE, 0x05, 0x98, 0x0F, 0xEF, 0x2F, 0x11,
-0x8B, 0x5A, 0x0A, 0x6D, 0x1F, 0x6D, 0x36, 0x7E, 0xCF,
-0x27, 0xCB, 0x09, 0xB7, 0x4F, 0x46, 0x3F, 0x66, 0x9E,
-0x5F, 0xEA, 0x2D, 0x75, 0x27, 0xBA, 0xC7, 0xEB, 0xE5,
-0xF1, 0x7B, 0x3D, 0x07, 0x39, 0xF7, 0x8A, 0x52, 0x92,
-0xEA, 0x6B, 0xFB, 0x5F, 0xB1, 0x1F, 0x8D, 0x5D, 0x08,
-0x56, 0x03, 0x30, 0x46, 0xFC, 0x7B, 0x6B, 0xAB, 0xF0,
-0xCF, 0xBC, 0x20, 0x9A, 0xF4, 0x36, 0x1D, 0xA9, 0xE3,
-0x91, 0x61, 0x5E, 0xE6, 0x1B, 0x08, 0x65, 0x99, 0x85,
-0x5F, 0x14, 0xA0, 0x68, 0x40, 0x8D, 0xFF, 0xD8, 0x80,
-0x4D, 0x73, 0x27, 0x31, 0x06, 0x06, 0x15, 0x56, 0xCA,
-0x73, 0xA8, 0xC9, 0x60, 0xE2, 0x7B, 0xC0, 0x8C, 0x6B,
-};
-
-
-
-INLINE int __ieee754_rem_pio2f(float x, float *y) {
-  /* copied from fdlibm
-   *
-   * Argument reduction modulo pi/2.
-   *   x - the argument to reduce
-   *   y - output: y[0] receives the head and y[1] the tail of the remainder
-   * Returns the (signed) number n of pi/2 steps removed from x.  On the
-   * large-argument path n is whatever __kernel_rem_pio2f reports, negated
-   * for negative x.  For Inf/NaN both outputs are set to x-x and 0 is
-   * returned.
-   *
-   * Paths, selected on the bit pattern of |x|:
-   *   1. |x| ~<= pi/4        : nothing to do, y[0] = x, y[1] = 0.
-   *   2. |x| < 3pi/4         : subtract/add pi/2 once (n = +-1).
-   *   3. |x| ~<= 2^7*(pi/2)  : subtract n*pi/2 using up to three rounds of
-   *                            progressively finer pieces of pi/2.
-   *   4. anything larger     : split |x| into pieces scaled by 2^8 (tx[])
-   *                            and defer to __kernel_rem_pio2f with the
-   *                            two_over_pi table.
-   */
-  float z,w,t,r,fn;
-  float tx[3];
-
-const float half_value = 5.0000000e-1;
-const float zero =  0.0000000000;
-const float two8 =  2.5600000000e+02;
-const float invpio2 =  6.3661980629e-01;
-const float pio2_1  =  1.5707855225e+00;
-const float pio2_1t =  1.0804334124e-05;
-const float pio2_2  =  1.0804273188e-05;
-const float pio2_2t =  6.0770999344e-11;
-const float pio2_3  =  6.0770943833e-11;
-const float pio2_3t =  6.1232342629e-17;
-  int e0,i,j,nx,n,ix,hx;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff; /* bit pattern of |x| */
-  if(ix<=0x3f490fd8)   /* |x| ~<= pi/4 , no need for reduction */
-      {y[0] = x; y[1] = 0; return 0;}
-  if(ix<0x4016cbe4) {  /* |x| < 3pi/4, special case with n=+-1 */
-      if(hx>0) {
-    z = x - pio2_1;
-    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
-        y[0] = z - pio2_1t;
-        y[1] = (z-y[0])-pio2_1t;
-    } else {    /* near pi/2, use 24+24+24 bit pi */
-        z -= pio2_2;
-        y[0] = z - pio2_2t;
-        y[1] = (z-y[0])-pio2_2t;
-    }
-    return 1;
-      } else {  /* negative x */
-    z = x + pio2_1;
-    if((ix&0xfffffff0)!=0x3fc90fd0) { /* 24+24 bit pi OK */
-        y[0] = z + pio2_1t;
-        y[1] = (z-y[0])+pio2_1t;
-    } else {    /* near pi/2, use 24+24+24 bit pi */
-        z += pio2_2;
-        y[0] = z + pio2_2t;
-        y[1] = (z-y[0])+pio2_2t;
-    }
-    return -1;
-      }
-  }
-  if(ix<=0x43490f80) { /* |x| ~<= 2^7*(pi/2), medium size */
-      t  = __gen_ocl_fabs(x);
-      n  = (int) (t*invpio2+half_value); /* nearest multiple of pi/2 */
-      fn = (float)n;
-      r  = t-fn*pio2_1;
-      w  = fn*pio2_1t;  /* 1st round good to 40 bit */
-      if(n<32&&(ix&0xffffff00)!=npio2_hw[n-1]) {
-    y[0] = r-w; /* quick check no cancellation */
-      } else {
-          uint high;
-          j  = ix>>23; /* biased exponent of |x| */
-          y[0] = r-w;
-    GEN_OCL_GET_FLOAT_WORD(high,y[0]);
-          i = j-((high>>23)&0xff); /* exponent drop caused by the subtraction */
-          if(i>8) {  /* 2nd iteration needed, good to 57 */
-        t  = r;
-        w  = fn*pio2_2;
-        r  = t-w;
-        w  = fn*pio2_2t-((t-r)-w);
-        y[0] = r-w;
-        GEN_OCL_GET_FLOAT_WORD(high,y[0]);
-        i = j-((high>>23)&0xff);
-        if(i>25)  { /* 3rd iteration need, 74 bits acc */
-          t  = r; /* will cover all possible cases */
-          w  = fn*pio2_3;
-          r  = t-w;
-          w  = fn*pio2_3t-((t-r)-w);
-          y[0] = r-w;
-        }
-    }
-      }
-      y[1] = (r-y[0])-w;
-      if(hx<0)  {y[0] = -y[0]; y[1] = -y[1]; return -n;}
-      else   return n;
-  }
-    /*
-     * all other (large) arguments
-     */
-  if(ix>=0x7f800000) {    /* x is inf or NaN */
-      y[0]=y[1]=x-x; return 0;
-  }
-    /* set z = scalbn(|x|,ilogb(x)-7) */
-  e0  = (ix>>23)-134;   /* e0 = ilogb(z)-7; */
-  GEN_OCL_SET_FLOAT_WORD(z, ix - ((int)(e0<<23)));
-  for(i=0;i<2;i++) {
-    tx[i] = (float)((int)(z));
-    z     = (z-tx[i])*two8;
-  }
-  tx[2] = z;
-  nx = 3;
-  while(tx[nx-1]==zero) nx--; /* skip zero term */
-  n  =  __kernel_rem_pio2f(tx,y,e0,nx,2,two_over_pi);
-  if(hx<0) {y[0] = -y[0]; y[1] = -y[1]; return -n;}
-  return n;
-}
-
-INLINE_OVERLOADABLE float __kernel_sinf(float x, float y, int iy)
-{
-  /* copied from fdlibm
-   *
-   * Polynomial kernel for sin on an already reduced argument.
-   *   x  - head of the argument
-   *   y  - tail of the argument (only read when iy != 0)
-   *   iy - 0 selects the short form x + v*(S1 + z*r), which ignores y
-   * For |x| < 2**-27 with (int)x == 0 the argument itself is returned.
-   */
-const float
-half_value =  5.0000000000e-01,/* 0x3f000000 */
-S1  = -1.6666667163e-01, /* 0xbe2aaaab */
-S2  =  8.3333337680e-03, /* 0x3c088889 */
-S3  = -1.9841270114e-04, /* 0xb9500d01 */
-S4  =  2.7557314297e-06, /* 0x3638ef1b */
-S5  = -2.5050759689e-08, /* 0xb2d72f34 */
-S6  =  1.5896910177e-10; /* 0x2f2ec9d3 */
-  float z,r,v;
-  int ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;     /* high word of x */
-  if(ix<0x32000000)     /* |x| < 2**-27 */
-     {if((int)x==0) return x;}    /* generate inexact */
-  z =  x*x;
-  v =  z*x; /* x^3 */
-  r =  S2+z*(S3+z*(S4+z*(S5+z*S6)));
-  if(iy==0) return x+v*(S1+z*r);
-  else      return x-((z*(half_value*y-v*r)-y)-v*S1);
-}
-
-INLINE  float __kernel_cosf(float x, float y)
-{
-  /* copied from fdlibm
-   *
-   * Polynomial kernel for cos; x is the head and y the tail of the
-   * argument.  Returns 1 for |x| < 2**-27.  A negative x is folded to
-   * -x (with y negated as well).  When |x| > pi/4 the result is
-   * computed as -__kernel_sinf(x - pi/2, y, 1), with pi/2 subtracted
-   * in three pieces (pio2_hi, pio2_mid, pio2_low).
-   */
-  const float
-  one =  1.0000000000e+00, /* 0x3f800000 */
-  C1  =  4.1666667908e-02, /* 0x3d2aaaab */
-  C2  = -1.3888889225e-03, /* 0xbab60b61 */
-  C3  =  2.4801587642e-05, /* 0x37d00d01 */
-  C4  = -2.7557314297e-07, /* 0xb493f27c */
-  C5  =  2.0875723372e-09, /* 0x310f74f6 */
-  C6  = -1.1359647598e-11; /* 0xad47d74e */
-  const float pio2_hi = 0x1.92p0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
-  float a,hz,z,r,qx;
-  int ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;     /* ix = |x|'s high word*/
-  if(ix<0x32000000) {     /* if |x| < 2**-27 */
-      if(((int)x)==0) return one;   /* generate inexact */
-  }
-
-  if(x < 0.0f) { x= -x; y = -y; }
-  if(ix > 0x3f490fdb) { /* |x|>pi/4*/
-    return -__kernel_sinf(x-pio2_hi-pio2_mid-pio2_low, y, 1);
-  }
-  z  = x*x;
-  r  = z*(C1+z*(C2+z*(C3+z*(C4+z*(C5+z*C6)))));
-  if(ix < 0x3e99999a)       /* if |x| < 0.3 */
-      return one - ((float)0.5*z - (z*r - x*y));
-  else {
-      GEN_OCL_SET_FLOAT_WORD(qx,ix-0x01000000); /* x/4 */
-      hz = (float)0.5*z-qx;
-      a  = one-qx;
-      return a - (hz - (z*r-x*y));
-  }
-}
-
-INLINE_OVERLOADABLE  float sin(float x) {
-  /* Sine of x, following the fdlibm scheme: small arguments go straight
-   * to the polynomial kernel, Inf/NaN produce NaN, and everything else is
-   * reduced modulo pi/2 and dispatched on the quadrant. */
-  float rem[2];
-  int bits, quadrant;
-
-  GEN_OCL_GET_FLOAT_WORD(bits,x);
-  bits &= 0x7fffffff;
-
-  /* |x| ~< pi/4: no reduction required */
-  if(bits <= 0x3f490fd8)
-    return __kernel_sinf(x,0.0f,0);
-
-  /* sin(Inf or NaN) is NaN */
-  if(bits >= 0x7f800000)
-    return x-x;
-
-  /* argument reduction, then pick the kernel for the quadrant */
-  quadrant = __ieee754_rem_pio2f(x,rem) & 3;
-  if(quadrant == 0) return  __kernel_sinf(rem[0],rem[1],1);
-  if(quadrant == 1) return  __kernel_cosf(rem[0],rem[1]);
-  if(quadrant == 2) return -__kernel_sinf(rem[0],rem[1],1);
-  return -__kernel_cosf(rem[0],rem[1]);
-}
-INLINE_OVERLOADABLE  float cos(float x) {
-  /* Cosine of x, following the fdlibm scheme: small arguments go straight
-   * to the polynomial kernel, Inf/NaN produce NaN, and everything else is
-   * reduced modulo pi/2 and dispatched on the quadrant. */
-  float rem[2];
-  int bits, quadrant;
-
-  GEN_OCL_GET_FLOAT_WORD(bits,x);
-  bits &= 0x7fffffff;
-
-  /* |x| ~< pi/4: no reduction required */
-  if(bits <= 0x3f490fd8)
-    return __kernel_cosf(x,0.0f);
-
-  /* cos(Inf or NaN) is NaN */
-  if(bits >= 0x7f800000)
-    return x-x;
-
-  /* argument reduction, then pick the kernel for the quadrant */
-  quadrant = __ieee754_rem_pio2f(x,rem) & 3;
-  if(quadrant == 0) return  __kernel_cosf(rem[0],rem[1]);
-  if(quadrant == 1) return -__kernel_sinf(rem[0],rem[1],1);
-  if(quadrant == 2) return -__kernel_cosf(rem[0],rem[1]);
-  return __kernel_sinf(rem[0],rem[1],1);
-}
-
-INLINE float __kernel_tanf(float x, float y, int iy)
-{
-  /* copied from fdlibm
-   *
-   * Polynomial kernel for tan on a reduced argument.
-   *   x  - head of the argument
-   *   y  - tail of the argument
-   *   iy - 1 returns tan(x+y); any other value (callers pass -1)
-   *        returns -1/tan(x+y)
-   * Arguments with |x| >= 0.6744 are first mapped to pi/4 - |x| and the
-   * result is reconstructed from that at the end.
-   */
-        float z,r,v,w,s;
-        int ix,hx;
-        const float
-        one   =  1.0000000000e+00, /* 0x3f800000 */
-        pio4  =  7.8539812565e-01, /* 0x3f490fda */
-        pio4lo=  3.7748947079e-08; /* 0x33222168 */
-        float T[13];// =  {
-         T[0] = 3.3333334327e-01; /* 0x3eaaaaab */
-         T[1] = 1.3333334029e-01; /* 0x3e088889 */
-         T[2] = 5.3968254477e-02; /* 0x3d5d0dd1 */
-         T[3] = 2.1869488060e-02; /* 0x3cb327a4 */
-         T[4] = 8.8632395491e-03; /* 0x3c11371f */
-         T[5] = 3.5920790397e-03; /* 0x3b6b6916 */
-         T[6] = 1.4562094584e-03; /* 0x3abede48 */
-         T[7] = 5.8804126456e-04; /* 0x3a1a26c8 */
-         T[8] = 2.4646313977e-04; /* 0x398137b9 */
-         T[9] = 7.8179444245e-05; /* 0x38a3f445 */
-         T[10] = 7.1407252108e-05; /* 0x3895c07a */
-         T[11] = -1.8558637748e-05; /* 0xb79bae5f */
-         T[12] = 2.5907305826e-05; /* 0x37d95384 */
-
-
-        GEN_OCL_GET_FLOAT_WORD(hx,x);
-        ix = hx&0x7fffffff;     /* high word of |x| */
-        if(ix<0x31800000)                       /* x < 2**-28 */
-            {if((int)x==0) {                    /* generate inexact */
-                if((ix|(iy+1))==0) return one/__gen_ocl_fabs(x);
-                else return (iy==1)? x: -one/x;
-            }
-            }
-        if(ix>=0x3f2ca140) {                    /* |x|>=0.6744 */
-            if(hx<0) {x = -x; y = -y;}
-
-
-            z = pio4-x;
-            w = pio4lo-y;
-            x = z+w; y = 0.0;
-        }
-        z       =  x*x;
-        w       =  z*z;
-    /* Break x^5*(T[1]+x^2*T[2]+...) into
-     *    x^5(T[1]+x^4*T[3]+...+x^20*T[11]) +
-     *    x^5(x^2*(T[2]+x^4*T[4]+...+x^22*[T12]))
-     */
-        r = T[1]+w*(T[3]+w*(T[5]+w*(T[7]+w*(T[9]+w*T[11]))));
-        v = z*(T[2]+w*(T[4]+w*(T[6]+w*(T[8]+w*(T[10]+w*T[12])))));
-        s = z*x;
-        r = y + z*(s*(r+v)+y);
-        r += T[0]*s;
-        w = x+r;
-        if(ix>=0x3f2ca140) {
-            v = (float)iy;
-            return (float)(1-((hx>>30)&2))*(v-(float)2.0*(x-(w*w/(w+v)-r))); /* leading factor is -1 when x was negative */
-        }
-        if(iy==1) return w;
-        else {          /* if allow error up to 2 ulp
-                           simply return -1.0/(x+r) here */
-     /*  compute -1.0/(x+r) accurately */
-            float a,t;
-            int i;
-            z  = w;
-            GEN_OCL_GET_FLOAT_WORD(i,z);
-            GEN_OCL_SET_FLOAT_WORD(z,i&0xfffff000);
-            v  = r-(z - x);     /* z+v = r+x */
-            t = a  = -(float)1.0/w;     /* a = -1.0/w */
-            GEN_OCL_GET_FLOAT_WORD(i,t);
-            GEN_OCL_SET_FLOAT_WORD(t,i&0xfffff000);
-            s  = (float)1.0+t*z;
-            return t+a*(s+t*v);
-        }
-}
-
-INLINE_OVERLOADABLE float tan(float x)
-{
-  /* Tangent of x, derived from fdlibm.  Small arguments are passed
-   * directly to __kernel_tanf, Inf/NaN give NaN, everything else is
-   * reduced modulo pi/2 first.  If the reduced head still exceeds pi/4
-   * it is reflected through pi/2 before the kernel is evaluated. */
-        const float pio2_hi = 0x1.92p-0, pio2_mid = 0x1.fb4p-12, pio2_low = 0x1.4442d2p-24;
-        const float pio4  =  7.8539812565e-01;
-        float rem[2];
-        int steps, bits;
-
-        GEN_OCL_GET_FLOAT_WORD(bits,x);
-        bits &= 0x7fffffff;
-
-    /* |x| ~< pi/4 */
-        if(bits <= 0x3f490fda)
-          return __kernel_tanf(x,0.0f,1);
-
-    /* tan(Inf or NaN) is NaN */
-        if(bits >= 0x7f800000)
-          return x-x;
-
-    /* argument reduction needed */
-        steps = __ieee754_rem_pio2f(x,rem);
-        int iy = 1-((steps&1)<<1);      /*  1 -- steps even, -1 -- steps odd */
-        float head = rem[0], tail = rem[1], sign = 1.0f;
-
-        GEN_OCL_GET_FLOAT_WORD(bits,head);
-        if(bits < 0) {
-          head = -head;
-          tail = -tail;
-          sign = -1.0f;
-        }
-
-        if(!(head > pio4))
-          return __kernel_tanf(rem[0],rem[1],iy);
-
-    /* reduce to less than pi/4 through (pi/2-head) */
-        float t = __kernel_tanf(pio2_hi-head+pio2_mid+pio2_low, -tail, 1);
-        return (iy == -1) ? sign*(-t) : sign*1/t;
-}
-
-INLINE_OVERLOADABLE float native_cos(float x)
-{
-  /* Thin wrapper: forwards to the __gen_ocl_cos builtin. */
-  return __gen_ocl_cos(x);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_cospi(float x) {
-  /* cos(pi*x).  The argument is folded to m in [0, 2) (cos is even and
-   * has period 2 in x), then one of the polynomial kernels is chosen
-   * from the quarter-unit interval that m falls in. */
-  int ix;
-  float whole, m;
-
-  if(isinf(x) || isnan(x))
-    return NAN;
-  if(x < 0.0f)
-    x = -x;
-  GEN_OCL_GET_FLOAT_WORD(ix, x);
-  if(x> 0x1.0p24)
-    return 1.0f;
-
-  whole = __gen_ocl_internal_floor(x);
-  ix = (int)whole;
-  m = x-whole;
-  if((ix&0x1) != 0)
-    m+=1.0f;
-  ix = __gen_ocl_internal_floor(m*4.0f);
-
-  if(ix == 0)
-    return __kernel_cosf(m*M_PI_F, 0.0f);
-  if(ix == 1 || ix == 2)
-    return __kernel_sinf((0.5f-m)*M_PI_F, 0.0f, 0);
-  if(ix == 3 || ix == 4)
-    return -__kernel_cosf((m-1.0f)*M_PI_F, 0.0f);
-  if(ix == 5 || ix == 6)
-    return __kernel_sinf((m-1.5f)*M_PI_F, 0.0f, 0);
-  return __kernel_cosf((2.0f-m)*M_PI_F, 0.0f);
-}
-INLINE_OVERLOADABLE float native_sin(float x)
-{
-  /* Thin wrapper: forwards to the __gen_ocl_sin builtin. */
-  return __gen_ocl_sin(x);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinpi(float x) {
-  /* sin(pi*x).
-   *
-   * Special values (OpenCL edge-case rules):
-   *   NaN or +-Inf        -> NaN
-   *   +-0                 -> +-0
-   *   integer n           -> zero carrying the sign of n
-   * Otherwise |x| is folded to m in [0, 2) (period 2 in x), the sign of x
-   * is re-applied at the end (sin is odd), and one of the polynomial
-   * kernels is chosen from the quarter-unit interval that m falls in.
-   */
-  float sign = 1.0f;
-  int ix;
-  /* NaN must be rejected before the float->int conversions below. */
-  if(isinf(x) || isnan(x)) return NAN;
-  if(x == 0.0f) return x;              /* keeps the sign of zero */
-  if(x < 0.0f) { x = -x; sign = -1.0f; }
-  if(x> 0x1.0p24) return sign*0.0f;    /* every float this large is an integer */
-  float m = __gen_ocl_internal_floor(x);
-  if(m == x) return sign*0.0f;         /* integer argument: exact signed zero */
-  ix = (int)m;
-  m = x-m;
-  if((ix&0x1) != 0) m+=1.0f;
-  ix = __gen_ocl_internal_floor(m*4.0f);
-
-  switch(ix) {
-   case 0:
-    return sign*__kernel_sinf(m*M_PI_F, 0.0f, 0);
-   case 1:
-   case 2:
-    return sign*__kernel_cosf((m-0.5f)*M_PI_F, 0.0f);
-   case 3:
-   case 4:
-    return -sign*__kernel_sinf((m-1.0f)*M_PI_F, 0.0f, 0);
-   case 5:
-   case 6:
-    return -sign*__kernel_cosf((m-1.5f)*M_PI_F, 0.0f);
-   default:
-    return -sign*__kernel_sinf((2.0f-m)*M_PI_F, 0.0f, 0);
-   }
-
-}
-INLINE_OVERLOADABLE float native_sqrt(float x)
-{
-  /* Thin wrapper: forwards to the __gen_ocl_sqrt builtin. */
-  return __gen_ocl_sqrt(x);
-}
-INLINE_OVERLOADABLE float native_rsqrt(float x)
-{
-  /* Thin wrapper: forwards to the __gen_ocl_rsqrt builtin. */
-  return __gen_ocl_rsqrt(x);
-}
-INLINE_OVERLOADABLE float native_log2(float x)
-{
-  /* Forwards to __gen_ocl_log.
-   * NOTE(review): presumably that builtin is a base-2 logarithm - confirm. */
-  return __gen_ocl_log(x);
-}
-INLINE_OVERLOADABLE float native_log(float x)
-{
-  /* Natural log obtained by scaling native_log2 with ln(2). */
-  const float ln2 = 0.6931472002f;
-  return native_log2(x) * ln2;
-}
-/* Forward declaration: lgamma_r is defined further down in this file. */
-INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp);
-INLINE_OVERLOADABLE float tgamma(float x) {
-  /* Gamma function.
-   *
-   * The previous body was a copy of the lgamma code and therefore returned
-   * log|gamma(x)| (and dropped the sign) instead of gamma(x).  The value is
-   * now reconstructed as sign * exp(lgamma_r(x, &sign)).
-   *
-   * Special values (C99 Annex F):
-   *   NaN               -> NaN
-   *   +Inf              -> +Inf
-   *   -Inf              -> NaN
-   *   +-0               -> +-Inf
-   *   negative integer  -> NaN
-   * Large positive x overflows to +Inf through the exponential.
-   */
-  int sign = 1;
-  float lg;
-
-  if (isnan(x))
-    return x;
-  if (isinf(x))
-    return (x > 0.0f) ? x : NAN;
-  if (x == 0.0f)
-    return 1.0f / x;                  /* pole, signed like the zero */
-  if (x < 0.0f && __gen_ocl_internal_floor(x) == x)
-    return NAN;                       /* poles at the negative integers */
-
-  lg = lgamma_r(x, &sign);
-  /* exp(lg) written as the builtin applied to lg*log2(e), the same form
-   * native_exp uses elsewhere in this file. */
-  return (float)sign * __gen_ocl_exp(M_LOG2E_F * lg);
-}
-
-/* Forward declaration: lgamma_r is defined further down in this file. */
-INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp);
-INLINE_OVERLOADABLE float lgamma(float x) {
-  /* log|gamma(x)|.
-   *
-   * The previous body duplicated the lgamma_r implementation line for line,
-   * minus the writes to *signgamp, and read the bits of x through an
-   * int-pointer cast.  It now delegates to lgamma_r and discards the
-   * sign, so there is a single copy of the algorithm to maintain.
-   */
-  int ignored_sign;
-  return lgamma_r(x, &ignored_sign);
-}
-
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-#define BODY /* shared body of the three lgamma_r overloads below; expects `x` and `signgamp` to be in scope where it expands */ \
-    const float  \
-        zero=  0.,  \
-        one =  1.0000000000e+00,  \
-        pi  =  3.1415927410e+00,  \
-        a0  =  7.7215664089e-02,  \
-        a1  =  3.2246702909e-01,  \
-        a2  =  6.7352302372e-02,  \
-        a3  =  2.0580807701e-02,  \
-        a4  =  7.3855509982e-03,  \
-        a5  =  2.8905137442e-03,  \
-        a6  =  1.1927076848e-03,  \
-        a7  =  5.1006977446e-04,  \
-        a8  =  2.2086278477e-04,  \
-        a9  =  1.0801156895e-04,  \
-        a10 =  2.5214456400e-05,  \
-        a11 =  4.4864096708e-05,  \
-        tc  =  1.4616321325e+00,  \
-        tf  = -1.2148628384e-01,  \
-        tt  =  6.6971006518e-09,  \
-        t0  =  4.8383611441e-01,  \
-        t1  = -1.4758771658e-01,  \
-        t2  =  6.4624942839e-02,  \
-        t3  = -3.2788541168e-02,  \
-        t4  =  1.7970675603e-02,  \
-        t5  = -1.0314224288e-02,  \
-        t6  =  6.1005386524e-03,  \
-        t7  = -3.6845202558e-03,  \
-        t8  =  2.2596477065e-03,  \
-        t9  = -1.4034647029e-03,  \
-        t10 =  8.8108185446e-04,  \
-        t11 = -5.3859531181e-04,  \
-        t12 =  3.1563205994e-04,  \
-        t13 = -3.1275415677e-04,  \
-        t14 =  3.3552918467e-04,  \
-        u0  = -7.7215664089e-02,  \
-        u1  =  6.3282704353e-01,  \
-        u2  =  1.4549225569e+00,  \
-        u3  =  9.7771751881e-01,  \
-        u4  =  2.2896373272e-01,  \
-        u5  =  1.3381091878e-02,  \
-        v1  =  2.4559779167e+00,  \
-        v2  =  2.1284897327e+00,  \
-        v3  =  7.6928514242e-01,  \
-        v4  =  1.0422264785e-01,  \
-        v5  =  3.2170924824e-03,  \
-        s0  = -7.7215664089e-02,  \
-        s1  =  2.1498242021e-01,  \
-        s2  =  3.2577878237e-01,  \
-        s3  =  1.4635047317e-01,  \
-        s4  =  2.6642270386e-02,  \
-        s5  =  1.8402845599e-03,  \
-        s6  =  3.1947532989e-05,  \
-        r1  =  1.3920053244e+00,  \
-        r2  =  7.2193557024e-01,  \
-        r3  =  1.7193385959e-01,  \
-        r4  =  1.8645919859e-02,  \
-        r5  =  7.7794247773e-04,  \
-        r6  =  7.3266842264e-06,  \
-        w0  =  4.1893854737e-01,  \
-        w1  =  8.3333335817e-02,  \
-        w2  = -2.7777778450e-03,  \
-        w3  =  7.9365057172e-04,  \
-        w4  = -5.9518753551e-04,  \
-        w5  =  8.3633989561e-04,  \
-        w6  = -1.6309292987e-03;  \
-	float t, y, z, nadj, p, p1, p2, p3, q, r, w;  \
-	int i, hx, ix;  \
-	nadj = 0;  \
-	hx = *(int *)&x;  /* raw bit pattern of x */ \
-	*signgamp = 1;  /* default; overwritten with -1 on the negative-x paths below */ \
-	ix = hx & 0x7fffffff;  /* bit pattern of |x| */ \
-	if (ix >= 0x7f800000)  /* Inf or NaN */ \
-		return x * x;  \
-	if (ix == 0)  /* x is +-0 */ \
-		return ((x + one) / zero);  \
-	if (ix < 0x1c800000) {  /* tiny |x|: result is -log|x| */ \
-		if (hx < 0) {  \
-			*signgamp = -1;  \
-			return -native_log(-x);  \
-		} else  \
-			return -native_log(x);  \
-	}  \
-	if (hx < 0) {  /* negative x: compute the adjustment nadj, then continue with -x */ \
-		if (ix >= 0x4b000000)  \
-			return ((-x) / zero);  \
-		t = __gen_ocl_internal_sinpi(x);  \
-		if (t == zero)  \
-			return ((-x) / zero);  \
-		nadj = native_log(pi / __gen_ocl_fabs(t * x));  \
-		if (t < zero)  \
-			*signgamp = -1;  \
-		x = -x;  \
-	}  \
-	if (ix == 0x3f800000 || ix == 0x40000000)  /* |x| is exactly 1 or 2 */ \
-		r = 0;  \
-	else if (ix < 0x40000000) {  /* |x| < 2 */ \
-		if (ix <= 0x3f666666) {  \
-			r = -native_log(x);  \
-			if (ix >= 0x3f3b4a20) {  \
-				y = one - x;  \
-				i = 0;  \
-			} else if (ix >= 0x3e6d3308) {  \
-				y = x - (tc - one);  \
-				i = 1;  \
-			} else {  \
-				y = x;  \
-				i = 2;  \
-			}  \
-		} else {  \
-			r = zero;  \
-			if (ix >= 0x3fdda618) {  \
-				y = (float) 2.0 - x;  \
-				i = 0;  \
-			}  \
-			else if (ix >= 0x3F9da620) {  \
-				y = x - tc;  \
-				i = 1;  \
-			}  \
-			else {  \
-				y = x - one;  \
-				i = 2;  \
-			}  \
-		}  \
-		switch (i) {  /* i selects which polynomial set (a*, t*, u*/v*) is evaluated */ \
-		case 0:  \
-			z = y * y;  \
-			p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10))));  \
-			p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11)))));  \
-			p = y * p1 + p2;  \
-			r += (p - (float) 0.5 * y);  \
-			break;  \
-		case 1:  \
-			z = y * y;  \
-			w = z * y;  \
-			p1 = t0 + w * (t3 + w * (t6 + w * (t9 + w * t12)));  \
-			p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13)));  \
-			p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14)));  \
-			p = z * p1 - (tt - w * (p2 + y * p3));  \
-			r += (tf + p);  \
-			break;  \
-		case 2:  \
-			p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5)))));  \
-			p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5))));  \
-			r += (-(float) 0.5 * y + p1 / p2);  \
-		}  \
-	} else if (ix < 0x41000000) {  /* 2 <= |x| < 8 */ \
-		i = (int) x;  \
-		t = zero;  \
-		y = x - (float) i;  \
-		p = y * (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6))))));  \
-		q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6)))));  \
-		r = .5f * y + p / q;  \
-		z = one;  \
-		switch (i) {  /* cases deliberately fall through to accumulate the product in z */ \
-		case 7:  \
-			z *= (y + (float) 6.0);  \
-		case 6:  \
-			z *= (y + (float) 5.0);  \
-		case 5:  \
-			z *= (y + (float) 4.0);  \
-		case 4:  \
-			z *= (y + (float) 3.0);  \
-		case 3:  \
-			z *= (y + (float) 2.0);  \
-			r += native_log(z);  \
-			break;  \
-		}  \
-		  \
-	} else if (ix < 0x5c800000) {  \
-		t = native_log(x);  \
-		z = one / x;  \
-		y = z * z;  \
-		w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6)))));  \
-		r = (x - .5f) * (t - one) + w;  \
-	} else  \
-		r = x * (native_log(x) - one);  \
-	if (hx < 0)  \
-		r = nadj - r;  \
-	return r;
-INLINE_OVERLOADABLE float lgamma_r(float x, global int *signgamp) { BODY; } /* sign stored through a global pointer */
-INLINE_OVERLOADABLE float lgamma_r(float x, local int *signgamp) { BODY; } /* sign stored through a local pointer */
-INLINE_OVERLOADABLE float lgamma_r(float x, private int *signgamp) { BODY; } /* sign stored through a private pointer */
-#undef BODY
-
-INLINE_OVERLOADABLE float native_log10(float x)
-{
-  /* Base-10 log obtained by scaling native_log2 with log10(2). */
-  const float log10_of_2 = 0.3010299956f;
-  return native_log2(x) * log10_of_2;
-}
-INLINE_OVERLOADABLE float log1p(float x) {
-/*
- *  log1p(x) = log(1+x), float version of the fdlibm routine.
- *
- *  Special cases visible below: x == -1 returns -two25/zero (-inf),
- *  x < -1 returns (x-x)/(x-x) (NaN), +Inf/NaN return x+x, and very
- *  small |x| return x (or x - x*x/2).  Otherwise 1+x is written as
- *  2^k * (1+f) and the result is assembled from k*ln2 and a
- *  polynomial in f/(2+f).
- *
- *  Conversion to float by Ian Lance Taylor, Cygnus Support, ian at cygnus.com
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  const float
-  ln2_hi =   6.9313812256e-01,  /* 0x3f317180 */
-  ln2_lo =   9.0580006145e-06,  /* 0x3717f7d1 */
-  two25 =    3.355443200e+07, /* 0x4c000000 */
-  Lp1 = 6.6666668653e-01, /* 3F2AAAAB */
-  Lp2 = 4.0000000596e-01, /* 3ECCCCCD */
-  Lp3 = 2.8571429849e-01, /* 3E924925 */
-  Lp4 = 2.2222198546e-01, /* 3E638E29 */
-  Lp5 = 1.8183572590e-01, /* 3E3A3325 */
-  Lp6 = 1.5313838422e-01, /* 3E1CD04F */
-  Lp7 = 1.4798198640e-01; /* 3E178897 */
-  const float zero = 0.0;
-  float hfsq,f,c,s,z,R,u;
-  int k,hx,hu,ax;
-  union {float f; unsigned i;} un;
-  un.f = x;  hx = un.i; /* raw bit pattern of x */
-  ax = hx&0x7fffffff; /* bit pattern of |x| */
-
-  k = 1;
-  if (hx < 0x3ed413d7) {      /* x < 0.41422  */
-      if(ax>=0x3f800000) {    /* x <= -1.0 */
-    if(x==(float)-1.0) return -two25/zero; /* log1p(-1)=-inf */
-    else return (x-x)/(x-x);  /* log1p(x<-1)=NaN */
-      }
-      if(ax<0x31000000) {     /* |x| < 2**-29 */
-    if(two25+x>zero     /* raise inexact */
-              &&ax<0x24800000)    /* |x| < 2**-54 */
-        return x;
-    else
-        return x - x*x*(float)0.5;
-      }
-      if(hx>0||hx<=((int)0xbe95f61f)) {
-    k=0;f=x;hu=1;}  /* -0.2929<x<0.41422 */
-  }
-  if (hx >= 0x7f800000) return x+x;
-  if(k!=0) {
-      if(hx<0x5a000000) {
-    u  = (float)1.0+x;
-
-    un.f = u; hu = un.i;
-          k  = (hu>>23)-127;
-    /* correction term */
-          c  = (k>0)? (float)1.0-(u-x):x-(u-(float)1.0);
-    c /= u;
-      } else {
-    u  = x;
-    un.f = u; hu = un.i;
-          k  = (hu>>23)-127;
-    c  = 0;
-      }
-      hu &= 0x007fffff;
-      if(hu<0x3504f7) {
-          un.i = hu|0x3f800000; u = un.f;/* normalize u */
-      } else {
-          k += 1;
-          un.i = hu|0x3f000000; u = un.f;  /* normalize u/2 */
-          hu = (0x00800000-hu)>>2;
-      }
-      f = u-(float)1.0;
-  }
-  hfsq=(float)0.5*f*f;
-  if(hu==0) { /* |f| < 2**-20 */
-      if(f==zero) { if(k==0) return zero;
-      else {c += k*ln2_lo; return k*ln2_hi+c;} }
-      R = hfsq*((float)1.0-(float)0.66666666666666666*f);
-      if(k==0) return f-R; else
-             return k*ln2_hi-((R-(k*ln2_lo+c))-f);
-  }
-  s = f/((float)2.0+f);
-  z = s*s;
-  R = z*(Lp1+z*(Lp2+z*(Lp3+z*(Lp4+z*(Lp5+z*(Lp6+z*Lp7))))));
-  if(k==0) return f-(hfsq-s*(hfsq+R)); else
-     return k*ln2_hi-((hfsq-(s*(hfsq+R)+(k*ln2_lo+c)))-f);
-
-}
-INLINE_OVERLOADABLE float logb(float x) {
-  /* Unbiased binary exponent of x, returned as a float.
-   *
-   *   +-0        -> -INFINITY
-   *   Inf / NaN  -> x*x (+Inf, or NaN)
-   *   subnormal  -> exponent of the normalised value (-127 .. -149)
-   *   otherwise  -> biased exponent field minus 127
-   *
-   * Subnormals used to be lumped together with zero and returned
-   * -INFINITY, which disagreed with ilogb in this file; they are now
-   * normalised the same way ilogb does it.
-   */
-  union {float f; unsigned i;} u;
-  u.f = x;
-  int e =  ((u.i & 0x7f800000) >> 23);
-  if(e == 0) {
-    unsigned m = u.i & 0x007fffff;
-    if(m == 0) {
-      /* +/-0 */
-      return -INFINITY;
-    }
-    /* subnormal: shift the mantissa up until the implicit bit appears */
-    e = -126;
-    while(m < 0x00800000) {
-      m <<= 1;
-      e--;
-    }
-    return (float)e;
-  } else if(e == 0xff) {
-    /* inf & nan */
-    return x*x;
-  } else {
-    return (float)(e-127);
-  }
-}
-#define FP_ILOGB0 (-0x7FFFFFFF-1)
-#define FP_ILOGBNAN FP_ILOGB0
-INLINE_OVERLOADABLE int ilogb(float x) {
-  /* Unbiased binary exponent of x as an int.
-   * NaN -> FP_ILOGBNAN, Inf -> 0x7FFFFFFF, zero -> FP_ILOGB0;
-   * subnormals are normalised by shifting the mantissa. */
-  union { int i; float f; } bits;
-  int mant, expo;
-
-  if (isnan(x))
-    return FP_ILOGBNAN;
-  if (isinf(x))
-    return 0x7FFFFFFF;
-
-  bits.f = x;
-  bits.i &= 0x7fffffff;          /* drop the sign bit */
-  if (bits.i == 0)
-    return FP_ILOGB0;
-  if (bits.i >= 0x800000)        /* normal number */
-    return (bits.i >> 23) - 127;
-
-  /* subnormal: count the shifts needed to reach the implicit bit */
-  mant = bits.i & 0x7FFFFF;
-  for (expo = -126; mant < 0x800000; expo--)
-    mant <<= 1;
-  return expo;
-}
-INLINE_OVERLOADABLE float nan(uint code) { /* Returns the NAN constant; the `code` argument is ignored. */
-  return NAN;
-}
-INLINE_OVERLOADABLE float native_powr(float x, float y) { return __gen_ocl_pow(x,y); } /* forwards both arguments unchanged to __gen_ocl_pow */
-INLINE_OVERLOADABLE float native_recip(float x) { return __gen_ocl_rcp(x); } /* forwards to __gen_ocl_rcp; presumably the hardware reciprocal - confirm */
-/* Native-precision tangent, formed as native_sin(x) / native_cos(x). */
-INLINE_OVERLOADABLE float native_tan(float x) {
-  const float sine = native_sin(x);
-  const float cosine = native_cos(x);
-  return sine / cosine;
-}
-/*
- * tanpi(x): tangent of pi*x.
- *
- * Special cases handled explicitly:
- *   +-inf                   -> NaN
- *   +-0                     -> +-0
- *   even integer n          -> copysign(0, n)
- *   odd integer n           -> copysign(0, -n)
- *   n + 0.5, n even / odd   -> +inf / -inf
- * Otherwise the fractional part m of |x| is mapped by its quarter index onto
- * __kernel_tanf (directly, via the cotangent identity, or by reflection).
- */
-INLINE_OVERLOADABLE float __gen_ocl_internal_tanpi(float x) {
-  float sign = 1.0f;
-  int ix;
-  if(isinf(x)) return NAN;
-  /* Take the sign from the sign bit rather than from "x < 0" so that -0.0f
-   * is treated as negative and comes back as -0.0f. */
-  GEN_OCL_GET_FLOAT_WORD(ix, x);
-  if(ix < 0) { x = -x; sign = -1.0f; }
-  /* Every float above 2^24 is an even integer: signed zero. */
-  if(x> 0x1.0p24) return sign*0.0f;
-  float m = __gen_ocl_internal_floor(x);
-  ix = (int)m;      /* integer part of |x|; its parity selects the sign */
-  m = x-m;          /* fractional part of |x|, in [0,1) */
-  int n = __gen_ocl_internal_floor(m*4.0f);   /* quarter index 0..3 */
-  if(m == 0.5f) {
-    return (ix&0x1) == 0 ? sign*INFINITY : sign*-INFINITY;
-  }
-  if(m == 0.0f) {
-    /* Integer argument: the zero must carry the input sign, flipped for odd
-     * integers (previously the input sign was dropped here). */
-    return (ix&0x1) == 0 ? sign*0.0f : sign*-0.0f;
-  }
-
-  switch(n) {
-    case 0:
-      return sign * __kernel_tanf(m*M_PI_F, 0.0f, 1);
-    case 1:
-    case 2:
-      /* tan(pi*m) = 1/tan(pi*(0.5-m)) on both sides of m = 0.5 */
-      return sign * 1.0f/__kernel_tanf((0.5f-m)*M_PI_F, 0.0f, 1);
-    default:
-      /* tan(pi*m) = -tan(pi*(1-m)) */
-      return sign * -1.0f*__kernel_tanf((1.0f-m)*M_PI_F, 0.0f, 1);
-  }
-}
-INLINE_OVERLOADABLE float native_exp2(float x) { return __gen_ocl_exp(x); } /* passes x straight to __gen_ocl_exp; presumably a base-2 exponential - confirm */
-INLINE_OVERLOADABLE float native_exp(float x) { return __gen_ocl_exp(M_LOG2E_F*x); } /* scales x by M_LOG2E_F before calling __gen_ocl_exp */
-INLINE_OVERLOADABLE float native_exp10(float x) { return __gen_ocl_pow(10, x); } /* 10**x expressed as __gen_ocl_pow(10, x) */
-/*
- * Cube root of x, adapted from fdlibm's cbrtf.
- *
- *   NaN / +-inf -> x+x
- *   +-0         -> x
- *   subnormal   -> zero with the sign of x (fdlibm's 2**24 rescaling of
- *                  subnormals is intentionally not performed here)
- * Otherwise a bit-level initial estimate is refined by a rational step and
- * one Newton iteration, and the sign of x is put back at the end.
- */
-INLINE_OVERLOADABLE float __gen_ocl_internal_cbrt(float x) {
-  /* copied from fdlibm */
-  const unsigned
-  B1 = 709958130; /* B1 = (84+2/3-0.03306235651)*2**23 */
-
-  const float
-  C =  5.4285717010e-01, /* 19/35     = 0x3f0af8b0 */
-  D = -7.0530611277e-01, /* -864/1225 = 0xbf348ef1 */
-  E =  1.4142856598e+00, /* 99/70     = 0x3fb50750 */
-  F =  1.6071428061e+00, /* 45/28     = 0x3fcdb6db */
-  G =  3.5714286566e-01; /* 5/14      = 0x3eb6db6e */
-
-  float r,s,t, w;
-  int hx;
-  uint sign;
-  uint high;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  sign=hx&0x80000000;     /* sign= sign(x) */
-  hx  ^=sign;
-  if(hx>=0x7f800000) return(x+x); /* cbrt(NaN,INF) is itself */
-  if(hx==0)
-      return(x);    /* cbrt(0) is itself */
-
-  GEN_OCL_SET_FLOAT_WORD(x,hx); /* x <- |x| */
-  if(hx<0x00800000)     /* subnormal number */
-    {
-      /* Return a zero carrying the input sign. The comparison must be "==":
-       * the former "sign = 0" assignment always selected -0.0f. */
-      return (sign == 0) ? 0.0f : -0.0f;
-    }
-
-    /* rough cbrt to 5 bits */
-  GEN_OCL_SET_FLOAT_WORD(t,hx/3+B1);
-
-    /* new cbrt to 23 bits */
-  r=t*t/x;
-  s=C+r*t;
-  t*=G+F/(s+E+D/s);
-    /* one step newton iteration */
-  s=t*t;    /* t*t is exact */
-  r=x/s;
-  w=t+t;
-  r=(r-t)/(w+r);  /* r-t is exact */
-  t=t+t*r;
-
-    /* restore the sign bit */
-  GEN_OCL_GET_FLOAT_WORD(high,t);
-  GEN_OCL_SET_FLOAT_WORD(t,high|sign);
-  return(t);
-}
-
-#define BODY /* shared body: store cos(x) through cosval, return sin(x) */ \
-  *cosval = cos(x); \
-  return sin(x);
-INLINE_OVERLOADABLE float sincos(float x, global float *cosval) { BODY; } /* one overload per address space of cosval */
-INLINE_OVERLOADABLE float sincos(float x, local float *cosval) { BODY; }
-INLINE_OVERLOADABLE float sincos(float x, private float *cosval) { BODY; }
-#undef BODY
-
-INLINE float __gen_ocl_asin_util(float x) { /* Helper for asin/acos: returns x + x*(p/q), with p and q polynomials in t = x*x built from the pS and qS coefficients. */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunSoft, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
- */
-  float
-  pS0 =  1.66666666666666657415e-01,
-  pS1 = -3.25565818622400915405e-01,
-  pS2 =  2.01212532134862925881e-01,
-  pS3 = -4.00555345006794114027e-02,
-  pS4 =  7.91534994289814532176e-04,
-  pS5 =  3.47933107596021167570e-05,
-  qS1 = -2.40339491173441421878e+00,
-  qS2 =  2.02094576023350569471e+00,
-  qS3 = -6.88283971605453293030e-01,
-  qS4 =  7.70381505559019352791e-02;
-
-  float t = x*x;
-  float p = t*(pS0+t*(pS1+t*(pS2+t*(pS3+t*(pS4+t*pS5))))); /* numerator, Horner form */
-  float q = 1.0+t*(qS1+t*(qS2+t*(qS3+t*qS4))); /* denominator, Horner form */
-  float w = p / q;
-  return x + x*w;
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_asin(float x) { /* Arcsine of x: |x| == 1 gives x * M_PI_2_F, |x| > 1 (and NaN) gives NAN, otherwise the result comes from __gen_ocl_asin_util. */
-  uint ix;
-  union { uint i; float f; } u;
-  u.f = x;
-  ix = u.i & 0x7fffffff; /* bit pattern of |x| */
-  if(ix == 0x3f800000) {
-    return x * M_PI_2_F;  /* asin(|1|)=+-pi/2 with inexact */
-  }
-  if(ix > 0x3f800000) {            /* |x|>= 1 */
-    return  NAN;          /* asin(|x|>1) is NaN */
-  }
-
-  if(ix < 0x32000000) {            /* if |x| < 2**-27 */
-    if(HUGE_VALF + x > FLT_ONE) return x;   /* return x with inexact if x!=0*/
-  }
-
-  if(x < -0.5) { /* |x| > 0.5: evaluate the helper on sqrt((1 -+ x)/2) and combine with pi/2 */
-    return 2 * __gen_ocl_asin_util(native_sqrt((1+x) / 2)) - M_PI_2_F;
-  } else if(x > 0.5) {
-    return M_PI_2_F - 2 * __gen_ocl_asin_util(native_sqrt((1-x) / 2));
-  } else {
-    return __gen_ocl_asin_util(x);
-  }
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinpi(float x) { /* asin(x) divided by M_PI_F */
-  return __gen_ocl_internal_asin(x) / M_PI_F;
-}
-/* Arccosine of x. For x > 0.5 the asin helper is evaluated on
- * sqrt((1-x)/2) and doubled; every other input (including NaN) goes
- * through M_PI_2_F - asin(x). */
-INLINE_OVERLOADABLE float __gen_ocl_internal_acos(float x) {
-  if(x > 0.5) {
-    const float half_gap = (1-x)/2;
-    return 2 * __gen_ocl_asin_util(native_sqrt(half_gap));
-  }
-  return M_PI_2_F - __gen_ocl_internal_asin(x);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acospi(float x) { /* acos(x) divided by M_PI_F */
-  return __gen_ocl_internal_acos(x) / M_PI_F;
-}
-__constant float atanhi[4] = { /* high parts of atan at the reduction points; indexed by id in __gen_ocl_internal_atan */
-  4.6364760399e-01, /* atan(0.5)hi 0x3eed6338 */
-  7.8539812565e-01, /* atan(1.0)hi 0x3f490fda */
-  9.8279368877e-01, /* atan(1.5)hi 0x3f7b985e */
-  1.5707962513e+00, /* atan(inf)hi 0x3fc90fda */
-};
-__constant float atanlo[4] = { /* matching low-order correction terms, same indexing */
-  5.0121582440e-09, /* atan(0.5)lo 0x31ac3769 */
-  3.7748947079e-08, /* atan(1.0)lo 0x33222168 */
-  3.4473217170e-08, /* atan(1.5)lo 0x33140fb4 */
-  7.5497894159e-08, /* atan(inf)lo 0x33a22168 */
-};
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_atan(float x) { /* Arctangent of x: reduce |x| against 0.5/1.0/1.5/inf, evaluate an odd/even split polynomial, then add back atanhi/atanlo[id]. */
-  /* copied from fdlibm */
-  float aT[11];
-  aT[0] = 3.3333334327e-01; /* 0x3eaaaaaa */
-  aT[1] =  -2.0000000298e-01; /* 0xbe4ccccd */
-  aT[2] =   1.4285714924e-01; /* 0x3e124925 */
-  aT[3] =  -1.1111110449e-01; /* 0xbde38e38 */
-  aT[4] =   9.0908870101e-02; /* 0x3dba2e6e */
-  aT[5] =  -7.6918758452e-02; /* 0xbd9d8795 */
-  aT[6] =   6.6610731184e-02; /* 0x3d886b35 */
-  aT[7] =  -5.8335702866e-02; /* 0xbd6ef16b */
-  aT[8] =   4.9768779427e-02; /* 0x3d4bda59 */
-  aT[9] =  -3.6531571299e-02; /* 0xbd15a221 */
-  aT[10] =   1.6285819933e-02; /* 0x3c8569d7 */
-  const float one = 1.0, huge = 1.0e30;
-
-  float w,s1,s2,z;
-  int ix,hx,id;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff; /* bit pattern of |x| */
-  if(ix>=0x50800000) {  /* if |x| >= 2^34 */
-      if(ix>0x7f800000)
-    return x+x;   /* NaN */
-      if(hx>0) return  atanhi[3]+atanlo[3];
-      else     return -atanhi[3]-atanlo[3];
-  } if (ix < 0x3ee00000) {  /* |x| < 0.4375 */
-      if (ix < 0x31000000) {  /* |x| < 2^-29 */
-    if(huge+x>one) return x;  /* raise inexact */
-      }
-      id = -1; /* no reduction: result is x - x*(s1+s2) */
-  } else {
-  x = __gen_ocl_fabs(x);
-  if (ix < 0x3f980000) {    /* |x| < 1.1875 */
-      if (ix < 0x3f300000) {  /* 7/16 <=|x|<11/16 */
-    id = 0; x = ((float)2.0*x-one)/((float)2.0+x);
-      } else {      /* 11/16<=|x|< 19/16 */
-    id = 1; x  = (x-one)/(x+one);
-      }
-  } else {
-      if (ix < 0x401c0000) {  /* |x| < 2.4375 */
-    id = 2; x  = (x-(float)1.5)/(one+(float)1.5*x);
-      } else {      /* 2.4375 <= |x| < 2^66 */
-    id = 3; x  = -(float)1.0/x;
-      }
-  }}
-    /* end of argument reduction */
-  z = x*x;
-  w = z*z;
-    /* break sum from i=0 to 10 aT[i]z**(i+1) into odd and even poly */
-  s1 = z*(aT[0]+w*(aT[2]+w*(aT[4]+w*(aT[6]+w*(aT[8]+w*aT[10])))));
-  s2 = w*(aT[1]+w*(aT[3]+w*(aT[5]+w*(aT[7]+w*aT[9]))));
-  if (id<0) return x - x*(s1+s2);
-  else {
-      z = atanhi[id] - ((x*(s1+s2) - atanlo[id]) - x);
-      return (hx<0)? -z:z; /* restore the sign of the original argument */
-  }
-
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_atanpi(float x) { /* atan(x) divided by M_PI_F */
-  return __gen_ocl_internal_atan(x) / M_PI_F;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_erf(float x) { /* M_2_SQRTPI_F times the five-term odd series x - x^3/3 + x^5/10 - x^7/42 + x^9/216. NOTE(review): a truncated series like this is only accurate for small |x| - confirm the intended range. */
-  return M_2_SQRTPI_F * (x - __gen_ocl_pow(x, 3) / 3 + __gen_ocl_pow(x, 5) / 10 - __gen_ocl_pow(x, 7) / 42 + __gen_ocl_pow(x, 9) / 216);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_erfc(float x) { /* complementary error function, computed as 1 - erf(x) */
-  return 1 - __gen_ocl_internal_erf(x);
-}
-
-// XXX work-around PTX profile
-#define sqrt native_sqrt /* from here on, sqrt(...) expands to native_sqrt(...) */
-INLINE_OVERLOADABLE float rsqrt(float x) { return native_rsqrt(x); } /* forwards to native_rsqrt */
-INLINE_OVERLOADABLE float __gen_ocl_internal_atan2(float y, float x) { /* Quadrant-aware arctangent of y/x: special-cases NaN, zero and infinite operands, then corrects atan(|y/x|) using m = 2*sign(x)+sign(y). */
-  /* copied from fdlibm */
-  float z;
-  int k,m,hx,hy,ix,iy;
-  const float
-  tiny  = 1.0e-30,
-  zero  = 0.0,
-  pi_o_4  = 7.8539818525e-01, /* 0x3f490fdb */
-  pi_o_2  = 1.5707963705e+00, /* 0x3fc90fdb */
-  pi      = 3.1415927410e+00, /* 0x40490fdb */
-  pi_lo   = -8.7422776573e-08; /* 0xb3bbbd2e */
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff; /* |x| bits */
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  iy = hy&0x7fffffff; /* |y| bits */
-
-  if((ix>0x7f800000)||
-     (iy>0x7f800000)) /* x or y is NaN */
-     return x+y;
-  if(hx==0x3f800000) return z=__gen_ocl_internal_atan(y);   /* x=1.0 */
-  m = ((hy>>31)&1)|((hx>>30)&2);  /* 2*sign(x)+sign(y) */
-
-    /* when y = 0 */
-  if(iy==0) {
-      switch(m) {
-    case 0:
-    case 1: return y;   /* atan(+-0,+anything)=+-0 */
-    case 2: return  pi+tiny;/* atan(+0,-anything) = pi */
-    case 3: return -pi-tiny;/* atan(-0,-anything) =-pi */
-      }
-  }
-    /* when x = 0 */
-  if(ix==0) return (hy<0)?  -pi_o_2-tiny: pi_o_2+tiny;
-
-  /* both are denorms. Gen does not support denorm, so we convert to normal float number*/
-  if(ix <= 0x7fffff && iy <= 0x7fffff) {
-    x = (float)(ix) * (1.0f - ((hx>>30) & 0x2)); /* integer magnitude times +1 or -1 taken from the sign bit */
-    y = (float)(iy) * (1.0f - ((hy>>30) & 0x2));
-  }
-
-    /* when x is INF */
-  if(ix==0x7f800000) {
-      if(iy==0x7f800000) {
-    switch(m) {
-        case 0: return  pi_o_4+tiny;/* atan(+INF,+INF) */
-        case 1: return -pi_o_4-tiny;/* atan(-INF,+INF) */
-        case 2: return  (float)3.0*pi_o_4+tiny;/*atan(+INF,-INF)*/
-        case 3: return (float)-3.0*pi_o_4-tiny;/*atan(-INF,-INF)*/
-    }
-      } else {
-    switch(m) {
-        case 0: return  zero  ; /* atan(+...,+INF) */
-        case 1: return -zero  ; /* atan(-...,+INF) */
-        case 2: return  pi+tiny  ;  /* atan(+...,-INF) */
-        case 3: return -pi-tiny  ;  /* atan(-...,-INF) */
-    }
-      }
-  }
-    /* when y is INF */
-  if(iy==0x7f800000) return (hy<0)? -pi_o_2-tiny: pi_o_2+tiny;
-
-    /* compute y/x */
-  k = (iy-ix)>>23; /* difference of the biased exponents */
-  if(k > 60) z=pi_o_2+(float)0.5*pi_lo;   /* |y/x| >  2**60 */
-  else if(hx<0&&k<-60) z=0.0;   /* |y|/x < -2**60 */
-  else z=__gen_ocl_internal_atan(__gen_ocl_fabs(y/x)); /* safe to do y/x */
-  switch (m) {
-      case 0: return       z  ; /* atan(+,+) */
-      case 1: {
-              uint zh;
-          GEN_OCL_GET_FLOAT_WORD(zh,z);
-          GEN_OCL_SET_FLOAT_WORD(z,zh ^ 0x80000000); /* flip the sign bit of z */
-        }
-        return       z  ; /* atan(-,+) */
-      case 2: return  pi-(z-pi_lo);/* atan(+,-) */
-      default: /* case 3 */
-            return  (z-pi_lo)-pi;/* atan(-,-) */
-  }
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_atan2pi(float y, float x) { /* atan2(y, x) / pi: zero and infinite operands are answered from their bit patterns with exact fractions of 1; everything else falls through to __gen_ocl_internal_atan2(y, x) / M_PI_F. */
-  uint ix = as_uint(x), iy = as_uint(y),
-       pos_zero = 0, neg_zero = 0x80000000u,
-       pos_inf = 0x7f800000, neg_inf = 0xff800000u;
-  if(iy == pos_zero) { /* y is +0 */
-    if(ix == pos_zero)
-      return 0;
-    if(ix == neg_zero)
-      return 1;
-    if(x < 0)
-      return 1;
-    if(x > 0)
-      return 0;
-  }
-  if(iy == neg_zero) { /* y is -0 */
-    if(ix == pos_zero)
-      return -0.f;
-    if(ix == neg_zero)
-      return -1;
-    if(x < 0)
-      return -1;
-    if(x > 0)
-      return -0.f;
-  }
-  if((ix & 0x7fffffff) == 0) { /* x is +0 or -0, y non-zero */
-    if(y < 0)
-      return -.5f;
-    if(y > 0)
-      return .5f;
-  }
-  if(ix == pos_inf) {
-    if(y > 0 && iy != pos_inf)
-      return 0;
-    if(y < 0 && iy != neg_inf)
-      return -0.f;
-  }
-  if(ix == neg_inf) {
-    if(y > 0 && iy != pos_inf)
-      return 1;
-    if(y < 0 && iy != neg_inf)
-      return -1;
-  }
-  if(iy == pos_inf) {
-    if(ix == pos_inf)
-      return 0.25f;
-    if(ix == neg_inf)
-      return 0.75f;
-    if(x >= 0 || x <= 0) /* true for every non-NaN x */
-      return 0.5f;
-  }
-  if(iy == neg_inf) {
-    if(ix == pos_inf)
-      return -0.25f;
-    if(ix == neg_inf)
-      return -0.75f;
-    if(x >= 0 || x <= 0) /* true for every non-NaN x */
-      return -0.5f;
-  }
-  return __gen_ocl_internal_atan2(y, x) / M_PI_F;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_fabs(float x)  { return __gen_ocl_fabs(x); } /* forwards to __gen_ocl_fabs */
-INLINE_OVERLOADABLE float __gen_ocl_internal_trunc(float x) { return __gen_ocl_rndz(x); } /* forwards to __gen_ocl_rndz; presumably round-toward-zero - confirm */
-/* Rounds x to an integral value: start from __gen_ocl_rndz(x) and, when the
- * discarded part is at least 0.5 in magnitude, step one unit further in the
- * direction of x's sign. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_round(float x) {
-  const float truncated = __gen_ocl_rndz(x);
-  const float dropped = __gen_ocl_fabs(x - truncated);
-  if (dropped >= 0.5f)
-    return truncated + __gen_ocl_internal_copysign(1.f, x);
-  return truncated;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_ceil(float x)  { return __gen_ocl_rndu(x); } /* forwards to __gen_ocl_rndu; presumably round-up - confirm */
-INLINE_OVERLOADABLE float powr(float x, float y) { return __gen_ocl_pow(x,y); } /* forwards both arguments unchanged to __gen_ocl_pow */
-INLINE_OVERLOADABLE float __gen_ocl_internal_rint(float x) { /* forwards to __gen_ocl_rnde; presumably round-to-nearest-even - confirm */
-  return __gen_ocl_rnde(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp(float x) { /* e**x: native_exp directly for |x| < 2.75 and for |x| >= ~88.72; otherwise reduce x = k*ln2 + r, take native_exp(r) and add k to the exponent bits. */
-  //use native instruction when it has enough precision
-  if (x > -0x1.6p1 && x < 0x1.6p1)
-  {
-    return native_exp(x);
-  }
-
-  float o_threshold = 8.8721679688e+01,  /* 0x42b17180 */
-  u_threshold = -1.0397208405e+02,  /* 0xc2cff1b5 */
-  twom100 = 7.8886090522e-31, 	 /* 2**-100=0x0d800000 */
-  ivln2	 =	1.4426950216e+00; /* 0x3fb8aa3b =1/ln2 */
-  float y,hi=0.0,lo=0.0,t;
-  int k=0,xsb;
-  unsigned hx;
-  float ln2HI_0 = 6.9313812256e-01;	/* 0x3f317180 */
-  float ln2HI_1 = -6.9313812256e-01;	/* 0xbf317180 */
-  float ln2LO_0 = 9.0580006145e-06;  	/* 0x3717f7d1 */
-  float ln2LO_1 = -9.0580006145e-06; /* 0xb717f7d1 */
-  float half_0 = 0.5;
-  float half_1 =	-0.5;
-
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = (hx>>31)&1;		/* sign bit of x */
-  hx &= 0x7fffffff;		/* high word of |x| */
-
-  /* filter out non-finite argument */
-  if(hx >= 0x42b17218) {			/* if |x|>=88.721... */
-    // native_exp already handled this
-    return native_exp(x);
-  }
-
-  /* argument reduction */
-  if(hx > 0x3eb17218) {		/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {	/* and |x| < 1.5 ln2 */
-      hi = x-(xsb ==1 ? ln2HI_1 : ln2HI_0);
-      lo= xsb == 1? ln2LO_1 : ln2LO_0;
-      k = 1-xsb-xsb; /* +1 for positive x, -1 for negative x */
-    } else {
-      float tmp = xsb == 1 ? half_1 : half_0;
-      k  = ivln2*x+tmp; /* x/ln2 plus +-0.5, truncated by the int conversion */
-      t  = k;
-      hi = x - t*ln2HI_0;	/* t*ln2HI is exact here */
-      lo = t*ln2LO_0;
-    }
-    x  = hi - lo;
-  }
-
-  y = native_exp(x);
-  if(k >= -125) {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+(k<<23));	/* add k to y's exponent */
-    return y;
-  } else {
-    unsigned hy;
-    GEN_OCL_GET_FLOAT_WORD(hy,y);
-    GEN_OCL_SET_FLOAT_WORD(y,hy+((k+100)<<23)); /* add k to y's exponent */
-    return y*twom100; /* the extra 100 added above is removed by multiplying with 2**-100 */
-  }
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_fmod (float x, float y) { /* Remainder of x/y carrying the sign of x, computed on the integer significands by shift-and-subtract; y == 0, non-finite x or NaN y yield (x*y)/(x*y). */
-  //return x-y*__gen_ocl_rndz(x/y);
-  float one = 1.0;
-  float Zero[2];
-  int n,hx,hy,hz,ix,iy,sx,i;
-  Zero[0] = 0.0;
-  Zero[1] = -0.0;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  sx = hx&0x80000000;		/* sign of x */
-  hx ^=sx;		/* |x| */
-  hy &= 0x7fffffff;	/* |y| */
-  /* purge off exception values */
-  if(hy==0||(hx>=0x7f800000)||		/* y=0,or x not finite */
-  (hy>0x7f800000))			/* or y is NaN */
-    return (x*y)/(x*y);
-  if(hx<hy) return x;			/* |x|<|y| return x */
-  if(hx==hy)
-    return Zero[(unsigned)sx>>31];	/* |x|=|y| return x*0*/
-
-  /* determine ix = ilogb(x) */
-  if(hx<0x00800000) {	/* subnormal x */
-    for (ix = -126,i=(hx<<8); i>0; i<<=1) ix -=1;
-  } else ix = (hx>>23)-127;
-
-  /* determine iy = ilogb(y) */
-  if(hy<0x00800000) {	/* subnormal y */
-    for (iy = -126,i=(hy<<8); i>=0; i<<=1) iy -=1;
-  } else iy = (hy>>23)-127;
-
-  /* set up {hx,lx}, {hy,ly} and align y to x */
-  if(ix >= -126)
-    hx = 0x00800000|(0x007fffff&hx);
-  else {		/* subnormal x, shift x to normal */
-    n = -126-ix;
-    hx = hx<<n;
-  }
-  if(iy >= -126)
-    hy = 0x00800000|(0x007fffff&hy);
-  else {		/* subnormal y, shift y to normal */
-    n = -126-iy;
-    hy = hy<<n;
-  }
-  /* fix point fmod */
-  n = ix - iy; /* one shift-and-subtract step per unit of exponent difference */
-  while(n--) {
-    hz=hx-hy;
-    if(hz<0){hx = hx+hx;}
-    else {
-      if(hz==0)		/* return sign(x)*0 */
-        return Zero[(unsigned)sx>>31];
-      hx = hz+hz;
-    }
-  }
-  hz=hx-hy;
-  if(hz>=0) {hx=hz;}
-
-    /* convert back to floating value and restore the sign */
-  if(hx==0)			/* return sign(x)*0 */
-    return Zero[(unsigned)sx>>31];
-  while(hx<0x00800000) {		/* normalize x */
-    hx = hx+hx;
-    iy -= 1;
-  }
-  if(iy>= -126) {		/* normalize output */
-    hx = ((hx-0x00800000)|((iy+127)<<23));
-	GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
-   } else {		/* subnormal output */
-     n = -126 - iy;
-     hx >>= n;
-     GEN_OCL_SET_FLOAT_WORD(x,hx|sx);
-     x *= one;		/* create necessary signal */
-  }
-  return x;		/* exact output */
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_expm1(float x) { /* exp(x) - 1: filter huge/non-finite inputs, reduce x = k*ln2 + r with correction term c, evaluate a rational approximation on r, then rescale by 2**k with a separate formula per range of k. */
-  //return __gen_ocl_pow(M_E_F, x) - 1;
-  float	Q1 = -3.3333335072e-02, /* 0xbd088889 */
-  ln2_hi = 6.9313812256e-01,	/* 0x3f317180 */
-  ln2_lo = 9.0580006145e-06,	/* 0x3717f7d1 */
-  Q2 = 1.5873016091e-03, /* 0x3ad00d01 */
-  Q3 = -7.9365076090e-05, /* 0xb8a670cd */
-  Q4 = 4.0082177293e-06, /* 0x36867e54 */
-  Q5 = -2.0109921195e-07, /* 0xb457edbb */
-  huge = 1.0e30,
-  tiny = 1.0e-30,
-  ivln2 = 1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  one	=  1.0,
-  o_threshold=  8.8721679688e+01;  /* 0x42b17180 */
-  float y,hi,lo,c,t,e,hxs,hfx,r1;
-  int k,xsb;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  xsb = hx&0x80000000;
-  /* sign bit of x */
-  //if(xsb==0)
-  //y=x;
-  //else
-  //y= -x; /* y = |x| */
-  y = __gen_ocl_internal_fabs(x);
-  hx &= 0x7fffffff;		/* high word of |x| */
-  /* filter out huge and non-finite argument */
-  if(hx >= 0x4195b844) {			/* if |x|>=27*ln2 */
-    if(hx >= 0x42b17218) {		/* if |x|>=88.721... */
-      if(hx>0x7f800000)
-        return x+x; 	 /* NaN */
-      if(hx==0x7f800000)
-        return (xsb==0)? x:-1.0;/* exp(+-inf)={inf,-1} */
-      if(x > o_threshold)
-        return huge*huge; /* overflow */
-    }
-    if(xsb!=0) { /* x < -27*ln2, return -1.0 with inexact */
-      if(x+tiny<(float)0.0)	/* raise inexact */
-        return tiny-one;	/* return -1 */
-    }
-  }
-  /* argument reduction */
-  if(hx > 0x3eb17218) {/* if  |x| > 0.5 ln2 */
-    if(hx < 0x3F851592) {/* and |x| < 1.5 ln2 */
-      if(xsb==0){
-        hi = x - ln2_hi; lo = ln2_lo;  k =  1;
-      }	else {
-        hi = x + ln2_hi; lo = -ln2_lo;  k = -1;
-      }
-    } else {
-      k  = ivln2*x+((xsb==0)?(float)0.5:(float)-0.5);
-      t  = k;
-      hi = x - t*ln2_hi;/* t*ln2_hi is exact here */
-      lo = t*ln2_lo;
-    }
-    x  = hi - lo;
-    c  = (hi-x)-lo; /* rounding error of the reduction, folded back in below */
-  } else if(hx < 0x33000000) {	/* when |x|<2**-25, return x */
-    //t = huge+x; /* return x with inexact flags when x!=0 */
-    //return x - (t-(huge+x));
-    return x;
-  } else k = 0;
-  /* x is now in primary range */
-  hfx = (float)0.5*x;
-  hxs = x*hfx;
-  r1 = one+hxs*(Q1+hxs*(Q2+hxs*(Q3+hxs*(Q4+hxs*Q5))));
-  t = (float)3.0-r1*hfx;
-  e = hxs*((r1-t)/((float)6.0 - x*t));
-  if(k==0)
-    return x - (x*e-hxs);		/* c is 0 */
-  else{
-    e = (x*(e-c)-c);
-    e -= hxs;
-    if(k== -1)return (float)0.5*(x-e)-(float)0.5;
-    if(k==1){
-      if(x < (float)-0.25)
-        return -(float)2.0*(e-(x+(float)0.5));
-      else
-        return  (one+(float)2.0*(x-e));
-    }
-    if (k <= -2 || k>56) {	 /* suffice to return exp(x)-1 */
-      int i;
-      y = one-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-      return y-one;
-    }
-    t = one;
-    if(k<23) {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,0x3f800000 - (0x1000000>>k)); /* t=1-2^-k */
-      y = t-(e-x);
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    } else {
-      int i;
-      GEN_OCL_SET_FLOAT_WORD(t,((0x7f-k)<<23));	/* 2^-k */
-      y = x-(e+t);
-      y += one;
-      GEN_OCL_GET_FLOAT_WORD(i,y);
-      GEN_OCL_SET_FLOAT_WORD(y,i+(k<<23));	/* add k to y's exponent */
-    }
-  }
-  return y;
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_acosh(float x) { /* Inverse hyperbolic cosine: NaN via (x-x)/(x-x) for x < 1, otherwise one of three log/log1p formulas chosen by the magnitude of x. */
-  //return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-  float one	= 1.0,
-  ln2	= 6.9314718246e-01;/* 0x3f317218 */
-  float t;
-  int hx;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  if(hx<0x3f800000) {	/* x < 1 */
-    return (x-x)/(x-x);
-  } else if(hx >=0x4d800000) {	/* x > 2**28 */
-    if(hx >=0x7f800000) {/* x is inf or NaN */
-      return x+x;
-    } else
-      return __gen_ocl_internal_log(x)+ln2;/* acosh(huge)=log(2x) */
-  } else if (hx==0x3f800000) {
-    return 0.0;			/* acosh(1) = 0 */
-  } else if (hx > 0x40000000) {	/* 2**28 > x > 2 */
-    t=x*x;
-    return __gen_ocl_internal_log((float)2.0*x-one/(x+__gen_ocl_sqrt(t-one)));			
-  } else {			/* 1<x<2 */
-    t = x-one;
-    return log1p(t+__gen_ocl_sqrt((float)2.0*t+t*t));
-  }
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_asinh(float x){ /* Inverse hyperbolic sine: computed on |x| with a magnitude-dependent log/log1p formula, then given the sign of x via copysign. */
-  //return native_log(x + native_sqrt(x * x + 1));
-  float one =  1.0000000000e+00, /* 0x3F800000 */
-  ln2 =  6.9314718246e-01, /* 0x3f317218 */
-  huge=  1.0000000000e+30;
-  float w;
-  int hx,ix;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  ix = hx&0x7fffffff;
-  if(ix< 0x38000000) {	/* |x|<2**-14 */
-    if(huge+x>one) return x;	/* return x inexact except 0 */
-  }
-  if(ix>0x47000000) {/* |x| > 2**14 */
-    if(ix>=0x7f800000) return x+x;/* x is inf or NaN */
-    w = __gen_ocl_internal_log(__gen_ocl_internal_fabs(x))+ln2;
-  } else {
-    float xa = __gen_ocl_internal_fabs(x);
-    if (ix>0x40000000) {/* 2**14 > |x| > 2.0 */
-      w = __gen_ocl_internal_log(2.0f*xa+one/(__gen_ocl_sqrt(xa*xa+one)+xa));
-    } else {		/* 2.0 > |x| > 2**-14 */
-      float t = xa*xa;
-      w =log1p(xa+t/(one+__gen_ocl_sqrt(one+t)));
-    }
-  }
-  return __gen_ocl_internal_copysign(w, x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_sinh(float x){ /* Hyperbolic sine: expm1-based for |x| < 22, exp-based above, x*shuge past the overflow threshold; inf/NaN return x+x. */
-  //return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-  float one = 1.0,
-  shuge = 1.0e37;
-  float t,w,h;
-  int ix,jx;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) return x+x;
-  h = 0.5;
-  if (jx<0) h = -h; /* h = +-0.5 carries the sign of x */
-  /* |x| in [0,22], return sign(x)*0.5*(E+E/(E+1))) */
-  if (ix < 0x41b00000) {		/* |x|<22 */
-    if (ix<0x31800000)	/* |x|<2**-28 */
-      if(shuge+x>one) return x;/* sinh(tiny) = tiny with inexact */
-    t = __gen_ocl_internal_expm1(__gen_ocl_internal_fabs(x));
-    if(ix<0x3f800000) return h*((float)2.0*t-t*t/(t+one));
-      return h*(t+t/(t+one));
-  }
-  /* |x| in [22, log(maxdouble)] return 0.5*exp(|x|) */
-  if (ix < 0x42b17180)  return h*__gen_ocl_internal_exp(__gen_ocl_internal_fabs(x));
-  /* |x| in [log(maxdouble), overflowthreshold] */
-  if (ix<=0x42b2d4fc) {
-    w = __gen_ocl_internal_exp((float)0.5*__gen_ocl_internal_fabs(x)); /* exp(|x|/2), multiplied in twice below */
-    t = h*w;
-    return t*w;
-  }
-  /* |x| > overflowthreshold, sinh(x) overflow */
-  return x*shuge;
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_tanh(float x) { /* Hyperbolic tangent: expm1-based for |x| < 22, one - tiny beyond; the magnitude z is computed first and the sign of x applied at the end. */
-  //float y = native_exp(-2 * x);
-  //return (1 - y) / (1 + y);
-  float one=1.0, two=2.0, tiny = 1.0e-30;
-  float t,z;
-  int jx,ix;
-  GEN_OCL_GET_FLOAT_WORD(jx,x);
-  ix = jx&0x7fffffff;
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) {
-    if (jx>=0)
-      return one/x+one; /* tanh(+-inf)=+-1 */
-    else
-      return one/x-one; /* tanh(NaN) = NaN */
-  }
-
-  if (ix < 0x41b00000) { /* |x|<22 */
-    if (ix == 0)
-      return x;		/* x == +-0 */
-    if (ix<0x24000000) 	/* |x|<2**-55 */
-      return x*(one+x);    	/* tanh(small) = small */
-    if (ix>=0x3f800000) {	/* |x|>=1  */
-      t = __gen_ocl_internal_expm1(two*__gen_ocl_internal_fabs(x));
-      z = one - two/(t+two);
-    } else {
-      t = __gen_ocl_internal_expm1(-two*__gen_ocl_internal_fabs(x));
-      z= -t/(t+two);
-    }
-  } else { /* |x| > 22, return +-1 */
-    z = one - tiny;		/* raised inexact flag */
-  }
-  return (jx>=0)? z: -z;
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_cosh(float x) { /* Hyperbolic cosine of |x|: expm1-based near zero, exp-based up to the overflow threshold, x*x for inf/NaN and huge*huge on overflow. */
-  //return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-  float halF = 0.5,
-  huge = 1.0e+30,
-  tiny = 1.0e-30,
-  one = 1.0;
-  float t,w;
-  int ix;
-  GEN_OCL_GET_FLOAT_WORD(ix,x);
-  ix &= 0x7fffffff;
-  /* |x| in [0,22] */
-  if (ix < 0x41b00000) {
-    /* |x| in [0,0.5*ln2], return 1+expm1(|x|)^2/(2*exp(|x|)) */
-    if(ix<0x3eb17218) {
-      t = __gen_ocl_internal_expm1(__gen_ocl_fabs(x));
-      w = one+t;
-      if (ix<0x24000000) return w;	/* cosh(tiny) = 1 */
-      return one+(t*t)/(w+w);
-    }
-    /* |x| in [0.5*ln2,22], return (exp(|x|)+1/exp(|x|)/2; */
-    t = __gen_ocl_internal_exp(__gen_ocl_fabs(x));
-    return halF*t+halF/t;
-  }
-  /* |x| in [22, log(maxdouble)] return half*exp(|x|) */
-  if (ix < 0x42b17180)  return halF*__gen_ocl_internal_exp(__gen_ocl_fabs(x));
-  /* |x| in [log(maxdouble), overflowthreshold] */
-  if (ix<=0x42b2d4fc) {
-    w = __gen_ocl_internal_exp(halF*__gen_ocl_fabs(x)); /* exp(|x|/2), multiplied in twice below */
-    t = halF*w;
-    return t*w;
-  }
-  /* x is INF or NaN */
-  if(ix>=0x7f800000) return x*x;
-  /* |x| > overflowthreshold, cosh(x) overflow */
-  return huge*huge;
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_remainder(float x, float p){ /* Remainder of x with respect to p: reduce with fmod(x, 2p), fold |x| toward zero by subtracting |p| at most twice, then xor the sign bit of the original x back in. */
-  //return x-y*__gen_ocl_rnde(x/y);
-  float zero = 0.0;
-  int hx,hp;
-  unsigned sx;
-  float p_half;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hp,p);
-  sx = hx&0x80000000; /* sign bit of x */
-  hp &= 0x7fffffff;
-  hx &= 0x7fffffff;
-  /* purge off exception values */
-  if(hp==0) return (x*p)/(x*p);	        /* p = 0 */
-  if((hx>=0x7f800000)||               /* x not finite */
-    ((hp>0x7f800000)))	               /* p is NaN */
-    return (x*p)/(x*p);
-  if (hp<=0x7effffff) x = __gen_ocl_internal_fmod(x,p+p); /* now x < 2p */
-  if ((hx-hp)==0) return zero*x;
-  x = __gen_ocl_fabs(x);
-  p = __gen_ocl_fabs(p);
-  if (hp<0x01000000) { /* small p: compare x+x against p instead of forming 0.5*p */
-    if(x+x>p) {
-      x-=p;
-      if(x+x>=p) x -= p;
-    }
-  } else {
-    p_half = (float)0.5*p;
-    if(x>p_half) {
-      x-=p;
-      if(x>=p_half) x -= p;
-    }
-  }
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_SET_FLOAT_WORD(x,hx^sx);
-  return x;
-}
-
-/* Scales x by a power of two through __gen_ocl_scalbnf(x, n). Non-finite
- * inputs and zeros are handed back untouched. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_ldexp(float x, int n) {
-  const int passthrough = !__ocl_finitef(x) || x == (float)0.0;
-  return passthrough ? x : __gen_ocl_scalbnf(x, n);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_atanh(float x) { /* Inverse hyperbolic tangent: log1p-based on |x| < 1 with the sign restored by copysign; |x| == 1 returns x / 0.0f and |x| > 1 returns (x - x) / (x - x). */
-  //return 0.5f * native_sqrt((1 + x) / (1 - x));
-  float xa = __gen_ocl_fabs (x);
-  float t;
-  if (isless (xa, 0.5f)){
-    if (xa < 0x1.0p-28f) return x; /* tiny argument: returned unchanged */
-    t = xa + xa;
-    t = 0.5f * log1p (t + t * xa / (1.0f - xa));
-  } else if (isless (xa, 1.0f)){
-    t = 0.5f * log1p ((xa + xa) / (1.0f - xa));
-  } else{
-    if (isgreater (xa, 1.0f)) return (x - x) / (x - x);
-    return x / 0.0f;
-  }
-  return __gen_ocl_internal_copysign(t, x);
-}
-
-/*
- * 10**x.
- *
- * Special cases:
- *   NaN            -> NaN
- *   +inf           -> +inf
- *   -inf           -> 0   (previously returned +inf)
- *   x < -MAXL10    -> 0
- *   x > 39         -> +inf (10**39 exceeds the float range; this also keeps
- *                     the float-to-short conversion of n in range)
- *   x == 0         -> 1
- * Otherwise x is split as g + n*log10(2), 10**g is approximated by a
- * polynomial and the result is scaled by 2**n with ldexp.
- */
-INLINE_OVERLOADABLE float __gen_ocl_internal_exp10(float x){
-  float px, qx,ans;
-  short n;
-  float MAXL10 = 38.230809449325611792;
-  float LOG210 = 3.32192809488736234787e0;
-  float LG102A = 3.00781250000000000000E-1;
-  float LG102B = 2.48745663981195213739E-4;
-  float P[6];
-  P[0] = 2.063216740311022E-001;
-  P[1] = 5.420251702225484E-001;
-  P[2] = 1.171292686296281E+000;
-  P[3] = 2.034649854009453E+000;
-  P[4] = 2.650948748208892E+000;
-  P[5] = 2.302585167056758E+000;
-
-  if( isnan(x) )
-    return x;
-  if( isinf(x) )
-    return x > 0 ? INFINITY : 0.0f;
-
-  if( x < -MAXL10 )return 0.0;
-  /* Certain overflow; bail out before n = qx could exceed a short. */
-  if( x > 39.0f )return INFINITY;
-  /* The following is necessary because range reduction blows up: */
-  if( x == 0 )return 1.0;
-
-  /* Express 10**x = 10**g 2**n
-    *	 = 10**g 10**( n log10(2) )
-    *	 = 10**( g + n log10(2) )
-    */
-  px = x * LOG210;
-  qx = __gen_ocl_internal_floor( px + 0.5 );
-  n = qx;
-  x -= qx * LG102A;   /* log10(2) is subtracted in two pieces */
-  x -= qx * LG102B;
-
-  /* polynomial approximation of (10**x - 1)/x on the reduced argument,
-    * evaluated in Horner form
-    */
-  ans = P[0];
-  for (int i = 1; i < 6; i++)
-    ans = ans * x  +  P[i];
-  px = 1.0 + x * ans;
-
-  /* multiply by power of 2 */
-  x = __gen_ocl_internal_ldexp( px, n );
-  return x;
-}
-
-// TODO use llvm intrinsics definitions
-#define cospi __gen_ocl_internal_cospi /* from here on the public builtin names expand to the __gen_ocl_internal_* functions */
-#define cosh __gen_ocl_internal_cosh
-#define acos __gen_ocl_internal_acos
-#define acospi __gen_ocl_internal_acospi
-#define acosh __gen_ocl_internal_acosh
-#define sinpi __gen_ocl_internal_sinpi
-#define sinh __gen_ocl_internal_sinh
-#define asin __gen_ocl_internal_asin
-#define asinpi __gen_ocl_internal_asinpi
-#define asinh __gen_ocl_internal_asinh
-#define tanpi __gen_ocl_internal_tanpi
-#define tanh __gen_ocl_internal_tanh
-#define atan __gen_ocl_internal_atan
-#define atan2 __gen_ocl_internal_atan2
-#define atan2pi __gen_ocl_internal_atan2pi
-#define atanpi __gen_ocl_internal_atanpi
-#define atanh __gen_ocl_internal_atanh
-#define pow powr /* pow is routed to powr rather than to an __gen_ocl_internal_* function */
-#define cbrt __gen_ocl_internal_cbrt
-#define rint __gen_ocl_internal_rint
-#define copysign __gen_ocl_internal_copysign
-#define erf __gen_ocl_internal_erf
-#define erfc __gen_ocl_internal_erfc
-#define fmod __gen_ocl_internal_fmod
-#define remainder __gen_ocl_internal_remainder
-#define ldexp __gen_ocl_internal_ldexp
-PURE CONST float __gen_ocl_mad(float a, float b, float c); /* prototypes only; the definitions are not in this file section */
-PURE CONST float __gen_ocl_fmax(float a, float b);
-PURE CONST float __gen_ocl_fmin(float a, float b);
-INLINE_OVERLOADABLE float mad(float a, float b, float c) { /* forwards to __gen_ocl_mad; presumably a*b+c - confirm */
-  return __gen_ocl_mad(a, b, c);
-}
-
-#define DEF(TYPE1, TYPE2) /* scalar select: src1 when cond is non-zero, src0 otherwise */ \
-  INLINE_OVERLOADABLE TYPE1 select(TYPE1 src0, TYPE1 src1, TYPE2 cond) { \
-    return cond ? src1 : src0; \
-  }
-DEF(char, char) /* integer overloads pair each value type with its same-width signed and unsigned condition type */
-DEF(char, uchar)
-DEF(uchar, char)
-DEF(uchar, uchar)
-DEF(short, short)
-DEF(short, ushort)
-DEF(ushort, short)
-DEF(ushort, ushort)
-DEF(int, int)
-DEF(int, uint)
-DEF(uint, int)
-DEF(uint, uint)
-DEF(long, long)
-DEF(long, ulong)
-DEF(ulong, long)
-DEF(ulong, ulong)
-DEF(float, int) /* float values take int or uint conditions */
-DEF(float, uint)
-#undef DEF
-
-/////////////////////////////////////////////////////////////////////////////
-// Common Functions (see 6.11.4 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-/* Step function: 0.0 while x is below edge, 1.0 otherwise (a NaN operand
- * makes the comparison false and therefore yields 1.0). */
-INLINE_OVERLOADABLE float step(float edge, float x) {
-  if (x < edge)
-    return 0.0;
-  return 1.0;
-}
-
-#define DECL_MIN_MAX_CLAMP(TYPE) /* generates max, min and clamp for one integer TYPE; clamp(v, l, u) is max(min(v, u), l) */ \
-INLINE_OVERLOADABLE TYPE max(TYPE a, TYPE b) { \
-  return a > b ? a : b; \
-} \
-INLINE_OVERLOADABLE TYPE min(TYPE a, TYPE b) { \
-  return a < b ? a : b; \
-} \
-INLINE_OVERLOADABLE TYPE clamp(TYPE v, TYPE l, TYPE u) { \
-  return max(min(v, u), l); \
-}
-DECL_MIN_MAX_CLAMP(int)
-DECL_MIN_MAX_CLAMP(short)
-DECL_MIN_MAX_CLAMP(char)
-DECL_MIN_MAX_CLAMP(uint)
-DECL_MIN_MAX_CLAMP(unsigned short)
-DECL_MIN_MAX_CLAMP(unsigned char)
-DECL_MIN_MAX_CLAMP(long)
-DECL_MIN_MAX_CLAMP(ulong)
-#undef DECL_MIN_MAX_CLAMP
-INLINE_OVERLOADABLE float max(float a, float b) { /* float overload: forwards to __gen_ocl_fmax instead of using a comparison */
-  return __gen_ocl_fmax(a, b);
-}
-INLINE_OVERLOADABLE float min(float a, float b) { /* float overload: forwards to __gen_ocl_fmin instead of using a comparison */
-  return __gen_ocl_fmin(a, b);
-}
-INLINE_OVERLOADABLE float clamp(float v, float l, float u) { /* limits v to [l, u]: the upper bound is applied first, then the lower bound */
-  return max(min(v, u), l);
-}
-
-#define BODY /* shared frexp body: NaN, inf and zero return x with *exp = 0; otherwise the exponent field is replaced by 0x3F000000 (magnitude in [0.5,1)) and the power of two is stored in *exp */ \
-  if (isnan(x) || isinf(x)) { \
-    *exp = 0; \
-    return x; \
-  } \
-  uint u = as_uint(x); \
-  uint a = u & 0x7FFFFFFFu; \
-  if (a == 0) { \
-    *exp = 0; \
-    return x; \
-  } \
-  if (a >= 0x800000) { \
-    *exp = (a >> 23) - 126; \
-    return as_float((u & (0x807FFFFFu)) | 0x3F000000); \
-  } \
-  int e = -126; \
-  while (a < 0x400000) { \
-    e --; \
-    a <<= 1; \
-  } \
-  a <<= 1; \
-  *exp = e; \
-  return as_float((a & (0x807FFFFFu)) | (u & 0x80000000u) | 0x3F000000);
-INLINE_OVERLOADABLE float frexp(float x, global int *exp) { BODY; } /* one overload per address space of exp */
-INLINE_OVERLOADABLE float frexp(float x, local int *exp) { BODY; }
-INLINE_OVERLOADABLE float frexp(float x, private int *exp) { BODY; }
-#undef BODY
-
-/* nextafter(x, y): the float adjacent to x in the direction of y, obtained
- * by stepping the integer bit pattern of x by one.
- *   - either argument NaN  -> x + y;
- *   - identical bit patterns, or both zero -> y;
- *   - x == +-0 and y != 0   -> smallest-magnitude value carrying the sign of y. */
-INLINE_OVERLOADABLE float nextafter(float x, float y) {
-  int hx, hy, ix, iy;
-  hx = as_int(x);
-  hy = as_int(y);
-  /* ix / iy: magnitude bits with the sign bit cleared. */
-  ix = hx & 0x7fffffff;
-  iy = hy & 0x7fffffff;
-  if(ix>0x7f800000 || iy>0x7f800000)
-    return x+y;
-  if(hx == hy)
-    return y;
-  if(ix == 0) {
-    if(iy == 0)
-      return y;
-    else
-      return as_float((hy&0x80000000) | 1);
-  }
-  if(hx >= 0) {
-    /* x > 0: decrementing the pattern moves toward zero, incrementing away. */
-    if(hx > hy) {
-      hx -= 1;
-    } else {
-      hx += 1;
-    }
-  } else {
-    /* x < 0: decrementing the pattern shrinks the magnitude (moves up). */
-    if(hy >= 0 || hx > hy){
-      hx -= 1;
-    } else {
-      hx += 1;
-    }
-  }
-  return as_float(hx);
-}
-
-/* Shared body of the modf() overloads (one per address space of i).
- * Stores the integral part of x through i and returns the remainder:
- *   - NaN      -> *i and the result are both nan(0u);
- *   - infinity -> *i = x, result is a zero carrying the sign bit of x;
- *   - otherwise *i = __gen_ocl_rndz(x) and the result is x - *i. */
-#define BODY \
-  uint hx = as_uint(x), ix = hx & 0x7FFFFFFF; \
-  if (ix > 0x7F800000) { \
-    *i = nan(0u); \
-    return nan(0u); \
-  } \
-  if (ix == 0x7F800000) { \
-    *i = x; \
-    return as_float(hx & 0x80000000u); \
-  } \
-  *i = __gen_ocl_rndz(x); \
-  return x - *i;
-INLINE_OVERLOADABLE float modf(float x, global float *i) { BODY; }
-INLINE_OVERLOADABLE float modf(float x, local float *i) { BODY; }
-INLINE_OVERLOADABLE float modf(float x, private float *i) { BODY; }
-#undef BODY
-/* Angle unit conversion: the scale factor is formed first, then applied. */
-INLINE_OVERLOADABLE float degrees(float radians) {
-  return (180 / M_PI_F) * radians;
-}
-INLINE_OVERLOADABLE float radians(float degrees) {
-  return (M_PI_F / 180) * degrees;
-}
-
-/* smoothstep(e0, e1, x): Hermite interpolation t*t*(3 - 2t) of the position
- * t of x between e0 and e1, with t clamped to [0, 1]. */
-INLINE_OVERLOADABLE float smoothstep(float e0, float e1, float x) {
-  const float t = clamp((x - e0) / (e1 - e0), 0.f, 1.f);
-  return t * t * (3 - 2 * t);
-}
-
-/* sign(x): 1 for x > 0, -1 for x < 0, a zero carrying the sign bit of x for
- * x == +-0, and +0 for NaN.
- * The zero case must not be decided with `x == -0.f`: the two zeros compare
- * equal, so that test is also true for +0 and would turn +0 into -0. The
- * sign bit is therefore taken from the bit pattern of x instead. */
-INLINE_OVERLOADABLE float sign(float x) {
-  if(x > 0)
-    return 1;
-  if(x < 0)
-    return -1;
-  if(x == 0)
-    return as_float(as_uint(x) & 0x80000000u); /* keep only the sign bit */
-  return 0.f; /* NaN: every comparison above was false */
-}
-
-/* Internal fmax/fmin are thin forwards to the float max()/min() overloads. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_fmax(float a, float b) {
-  return max(a, b);
-}
-INLINE_OVERLOADABLE float __gen_ocl_internal_fmin(float a, float b) {
-  return min(a, b);
-}
-/* maxmag(x, y): the argument with the larger magnitude; when neither
- * magnitude compares greater, falls back to max(x, y). */
-INLINE_OVERLOADABLE float __gen_ocl_internal_maxmag(float x, float y) {
-  const float mag_x = __gen_ocl_fabs(x);
-  const float mag_y = __gen_ocl_fabs(y);
-  if (mag_x > mag_y)
-    return x;
-  if (mag_y > mag_x)
-    return y;
-  return max(x, y);
-}
-/* minmag(x, y): the argument with the smaller magnitude; when neither
- * magnitude compares smaller, falls back to min(x, y). */
-INLINE_OVERLOADABLE float __gen_ocl_internal_minmag(float x, float y) {
-  const float mag_x = __gen_ocl_fabs(x);
-  const float mag_y = __gen_ocl_fabs(y);
-  if (mag_x < mag_y)
-    return x;
-  if (mag_y < mag_x)
-    return y;
-  return min(x, y);
-}
-/* mix(x, y, a): linear blend x + (y - x) * a. */
-INLINE_OVERLOADABLE float mix(float x, float y, float a) {
-  return x + a * (y - x);
-}
-/* fdim(x, y): x - y when x > y, +0 otherwise; a NaN argument is returned
- * as-is (x is checked before y). */
-INLINE_OVERLOADABLE float __gen_ocl_internal_fdim(float x, float y) {
-  if (isnan(x))
-    return x;
-  if (isnan(y))
-    return y;
-  if (x > y)
-    return x - y;
-  return +0.f;
-}
-
-/* Software pow(x, y) for floats, operating on the raw IEEE-754 words.
- * Outline (following the in-line comments below):
- *   1. Treat denormal x and y as zero and dispatch the special cases
- *      (y == 0, x == 1, NaN, y == +-inf, y == +-1, y == 2, y == 0.5,
- *      x == +-0 / +-inf / +-1, negative x with non-integer y).
- *   2. Compute log2(|x|) as the pair t1 + t2.
- *   3. Multiply by y (split into y1 + (y - y1)) to obtain p_h + p_l and
- *      return early on overflow / underflow.
- *   4. Compute 2**(p_h + p_l), scale by 2**n and apply the result sign sn.
- * NOTE(review): structure and constants look like fdlibm's e_powf.c --
- * confirm the provenance before touching any coefficient. */
-INLINE_OVERLOADABLE float __gen_ocl_internal_pow(float x, float y) {
-  float z,ax,z_h,z_l,p_h,p_l;
-  float y1,t1,t2,r,s,sn,t,u,v,w;
-  int i,j,k,yisint,n;
-  int hx,hy,ix,iy,is;
-  float bp[2],dp_h[2],dp_l[2],
-  zero    =  0.0,
-  one	=  1.0,
-  two	=  2.0,
-  two24	=  16777216.0,	/* 0x4b800000 */
-  huge	=  1.0e30,
-  tiny    =  1.0e-30,
-  /* poly coefs for (3/2)*(log(x)-2s-2/3*s**3 */
-  L1  =  6.0000002384e-01, /* 0x3f19999a */
-  L2  =  4.2857143283e-01, /* 0x3edb6db7 */
-  L3  =  3.3333334327e-01, /* 0x3eaaaaab */
-  L4  =  2.7272811532e-01, /* 0x3e8ba305 */
-  L5  =  2.3066075146e-01, /* 0x3e6c3255 */
-  L6  =  2.0697501302e-01, /* 0x3e53f142 */
-  P1   =  1.6666667163e-01, /* 0x3e2aaaab */
-  P2   = -2.7777778450e-03, /* 0xbb360b61 */
-  P3   =  6.6137559770e-05, /* 0x388ab355 */
-  P4   = -1.6533901999e-06, /* 0xb5ddea0e */
-  P5   =  4.1381369442e-08, /* 0x3331bb4c */
-  lg2  =  6.9314718246e-01, /* 0x3f317218 */
-  lg2_h  =  6.93145752e-01, /* 0x3f317200 */
-  lg2_l  =  1.42860654e-06, /* 0x35bfbe8c */
-  ovt =  4.2995665694e-08, /* -(128-log2(ovfl+.5ulp)) */
-  cp    =  9.6179670095e-01, /* 0x3f76384f =2/(3ln2) */
-  cp_h  =  9.6179199219e-01, /* 0x3f763800 =head of cp */
-  cp_l  =  4.7017383622e-06, /* 0x369dc3a0 =tail of cp_h */
-  ivln2    =  1.4426950216e+00, /* 0x3fb8aa3b =1/ln2 */
-  ivln2_h  =  1.4426879883e+00, /* 0x3fb8aa00 =16b 1/ln2*/
-  ivln2_l  =  7.0526075433e-06; /* 0x36eca570 =1/ln2 tail*/
-  bp[0] = 1.0,bp[1] = 1.5,
-  dp_h[0] = 0.0,dp_h[1] = 5.84960938e-01,
-  dp_l[0] = 0.0,dp_l[1] = 1.56322085e-06;
-  GEN_OCL_GET_FLOAT_WORD(hx,x);
-  GEN_OCL_GET_FLOAT_WORD(hy,y);
-  /* ix / iy: magnitude bits of x / y with the sign bit cleared. */
-  ix = hx&0x7fffffff;  iy = hy&0x7fffffff;
-  if (ix < 0x00800000) {	   /* x < 2**-126  */
-    ix = 0;/* Gen does not support subnormal number now */
-  }
-  if (iy < 0x00800000) {	  /* y < 2**-126  */
-    iy = 0;/* Gen does not support subnormal number now */
-  }
-   /* y==zero: x**0 = 1 */
-  if(iy==0) return one;
-  if(hx==0x3f800000) return one;
-  /* +-NaN return x+y */
-  if(ix > 0x7f800000 || iy > 0x7f800000)
-    return (x+0.0f)+y+(0.0f);
-  /* determine if y is an odd int when x < 0
-     * yisint = 0	... y is not an integer
-     * yisint = 1	... y is an odd int
-     * yisint = 2	... y is an even int
-     */
-  yisint  = 0;
-  if(hx<0) {
-    if(iy>=0x4b800000) yisint = 2; /* even integer y */
-    else if(iy>=0x3f800000) {
-      k = (iy>>23)-0x7f;	   /* exponent */
-      j = iy>>(23-k);
-      if((j<<(23-k))==iy) yisint = 2-(j&1);
-    }
-  }
-  /* special value of y */
-  if (iy==0x7f800000) {	/* y is +-inf */
-    if (ix==0x3f800000)
-      //return  y - y;	/* inf**+-1 is NaN */
-      return one;
-    else if (ix > 0x3f800000)/* (|x|>1)**+-inf = inf,0 */
-      return (hy>=0)? y: zero;
-    else			/* (|x|<1)**-,+inf = inf,0 */
-      return (hy<0)?-y: zero;
-  }
-  if(iy==0x3f800000) {	/* y is  +-1 */
-    if(hy<0) return one/x; else return x;
-  }
-  if(hy==0x40000000) return x*x; /* y is  2 */
-  if(hy==0x3f000000) {	/* y is  0.5 */
-    if(hx>=0)return __gen_ocl_sqrt(x);
-  }
-
-  ax   = __gen_ocl_fabs(x);
-    /* special value of x */
-  if(ix==0x7f800000||ix==0||ix==0x3f800000){
-    z = ax;			/*x is +-0,+-inf,+-1*/
-    if(hy<0) z = one/z;	/* z = (1/|x|) */
-    if(hx<0) {
-      if(((ix-0x3f800000)|yisint)==0) {
-        z = (z-z)/(z-z); /* (-1)**non-int is NaN */
-      } else if(yisint==1)
-        z = -z;		/* (x<0)**odd = -(|x|**odd) */
-    }
-    return z;
-  }
-  /* n is 0 for negative x and -1 (all bits set) for non-negative x. */
-  n = ((uint)hx>>31)-1;
-
-  /* (x<0)**(non-int) is NaN */
-  if((n|yisint)==0) return (x-x)/(x-x);
-
-  sn = one; /* s (sign of result -ve**odd) = -1 else = 1 */
-  if((n|(yisint-1))==0) sn = -one;/* (-ve)**(odd int) */
-
-  /* |y| is huge */
-  if(iy>0x4d000000) { /* if |y| > 2**27 */
-    /* over/underflow if x is not close to one */
-    if(ix<0x3f7ffff8) return (hy<0)? sn*huge*huge:sn*tiny*tiny;
-    if(ix>0x3f800007) return (hy>0)? sn*huge*huge:sn*tiny*tiny;
-    /* now |1-x| is tiny <= 2**-20, suffice to compute
-          log(x) by x-x^2/2+x^3/3-x^4/4 */
-    t = ax-1;		/* t has 20 trailing zeros */
-    w = (t*t)*((float)0.5-t*(0.333333333333f-t*0.25f));
-    u = ivln2_h*t;	/* ivln2_h has 16 sig. bits */
-    v = t*ivln2_l-w*ivln2;
-    t1 = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-    t2 = v-(t1-u);
-  } else {
-    float s2,s_h,s_l,t_h,t_l;
-    n = 0;
-	/* take care subnormal number */
-    //if(ix<0x00800000)
-      //{ax *= two24; n -= 24; GEN_OCL_GET_FLOAT_WORD(ix,ax); }
-    n  += ((ix)>>23)-0x7f;
-    j  = ix&0x007fffff;
-	/* determine interval */
-    ix = j|0x3f800000;		/* normalize ix */
-    if(j<=0x1cc471) k=0;	/* |x|<sqrt(3/2) */
-    else if(j<0x5db3d7) k=1;	/* |x|<sqrt(3)   */
-    else {k=0;n+=1;ix -= 0x00800000;}
-    GEN_OCL_SET_FLOAT_WORD(ax,ix);
-
-	/* compute s = s_h+s_l = (x-1)/(x+1) or (x-1.5)/(x+1.5) */
-    u = ax-bp[k];		/* bp[0]=1.0, bp[1]=1.5 */
-    v = one/(ax+bp[k]);
-    s = u*v;
-    s_h = s;
-    GEN_OCL_GET_FLOAT_WORD(is,s_h);
-    GEN_OCL_SET_FLOAT_WORD(s_h,is&0xfffff000);
-    /* t_h=ax+bp[k] High */
-    is = ((ix>>1)&0xfffff000)|0x20000000;
-    GEN_OCL_SET_FLOAT_WORD(t_h,is+0x00400000+(k<<21));
-    t_l = ax - (t_h-bp[k]);
-    s_l = v*((u-s_h*t_h)-s_h*t_l);
-    /* compute log(ax) */
-    s2 = s*s;
-    r = s2*s2*(L1+s2*(L2+s2*(L3+s2*(L4+s2*(L5+s2*L6)))));
-    r += s_l*(s_h+s);
-    s2  = s_h*s_h;
-    t_h = 3.0f+s2+r;
-    GEN_OCL_GET_FLOAT_WORD(is,t_h);
-    GEN_OCL_SET_FLOAT_WORD(t_h,is&0xfffff000);
-    t_l = r-((t_h-3.0f)-s2);
-    /* u+v = s*(1+...) */
-    u = s_h*t_h;
-    v = s_l*t_h+t_l*s;
-    /* 2/(3log2)*(s+...) */
-    p_h = u+v;
-    GEN_OCL_GET_FLOAT_WORD(is,p_h);
-    GEN_OCL_SET_FLOAT_WORD(p_h,is&0xfffff000);
-    p_l = v-(p_h-u);
-    z_h = cp_h*p_h;		/* cp_h+cp_l = 2/(3*log2) */
-    z_l = cp_l*p_h+p_l*cp+dp_l[k];
-    /* log2(ax) = (s+..)*2/(3*log2) = n + dp_h + z_h + z_l */
-    t = (float)n;
-    t1 = (((z_h+z_l)+dp_h[k])+t);
-    GEN_OCL_GET_FLOAT_WORD(is,t1);
-    GEN_OCL_SET_FLOAT_WORD(t1,is&0xfffff000);
-    t2 = z_l-(((t1-t)-dp_h[k])-z_h);
-  }
-
-  /* split up y into y1+y2 and compute (y1+y2)*(t1+t2) */
-  GEN_OCL_GET_FLOAT_WORD(is,y);
-  GEN_OCL_SET_FLOAT_WORD(y1,is&0xfffff000);
-  p_l = (y-y1)*t1+y*t2;
-  p_h = y1*t1;
-  z = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  if (j>0x43000000)				/* if z > 128 */
-    return sn*huge*huge;			/* overflow */
-  else if (j==0x43000000) {			/* if z == 128 */
-    if(p_l+ovt>z-p_h) return sn*huge*huge;	/* overflow */
-  }
-  else if ((j&0x7fffffff)>0x43160000)		/* z <= -150 */
-    return sn*tiny*tiny;			/* underflow */
-  else if (j==0xc3160000){			/* z == -150 */
-    if(p_l<=z-p_h) return sn*tiny*tiny;		/* underflow */
-  }
-
-  /*
-    * compute 2**(p_h+p_l)
-    */
-  i = j&0x7fffffff;
-  k = (i>>23)-0x7f;
-  n = 0;
-  if(i>0x3f000000) {		/* if |z| > 0.5, set n = [z+0.5] */
-    n = j+(0x00800000>>(k+1));
-    k = ((n&0x7fffffff)>>23)-0x7f;	/* new k for n */
-    GEN_OCL_SET_FLOAT_WORD(t,n&~(0x007fffff>>k));
-    n = ((n&0x007fffff)|0x00800000)>>(23-k);
-    if(j<0) n = -n;
-    p_h -= t;
-  }
-  t = p_l+p_h;
-  GEN_OCL_GET_FLOAT_WORD(is,t);
-  GEN_OCL_SET_FLOAT_WORD(t,is&0xffff8000);
-  u = t*lg2_h;
-  v = (p_l-(t-p_h))*lg2+t*lg2_l;
-  z = u+v;
-  w = v-(z-u);
-  t  = z*z;
-  t1  = z - t*(P1+t*(P2+t*(P3+t*(P4+t*P5))));
-  r  = (z*t1)/(t1-two)-(w+z*w);
-  z  = one-(r-z);
-  GEN_OCL_GET_FLOAT_WORD(j,z);
-  /* Scale by 2**n by adding n to the exponent field of z. */
-  j += (n<<23);
-  if((j>>23)<=0) z = __gen_ocl_scalbnf(z,n);	/* subnormal output */
-  else GEN_OCL_SET_FLOAT_WORD(z,j);
-  return sn*z;
-}
-
-
-/* hypot(x, y): sqrt(x*x + y*y) evaluated on operands rescaled by a common
- * power of two, so that squaring them neither overflows nor underflows
- * needlessly. */
-INLINE_OVERLOADABLE float hypot(float x, float y) {
-  /* Non-finite operands: an infinity wins, otherwise the NaN is propagated. */
-  if (!(isfinite (x) && isfinite (y))) {
-    if (isinf (x) || isinf (y))
-      return INFINITY;
-    return x + y;
-  }
-  const float abs_x = __gen_ocl_fabs (x);
-  const float abs_y = __gen_ocl_fabs (y);
-  /* 0 <= smaller <= larger. */
-  const float larger = max(abs_x, abs_y);
-  const float smaller = min(abs_x, abs_y);
-  /* larger = scaled_hi * 2^shift and smaller = scaled_lo * 2^shift. */
-  int shift;
-  const float scaled_hi = frexp (larger, &shift);
-  const float scaled_lo = ldexp (smaller, - shift);
-  const float scaled_norm = __gen_ocl_sqrt (scaled_hi * scaled_hi + scaled_lo * scaled_lo);
-  return ldexp (scaled_norm, shift);
-}
-
-/* Shared body of the fract() overloads (one per address space of p).
- * Stores floor(x) through p and returns x - floor(x), capped at
- * 0x1.FFFFFep-1F. NaN is stored and returned unchanged; for an infinite x
- * the result is a zero whose sign follows x. */
-#define FRACT_BODY \
-  if (isnan(x)) { \
-    *p = x; \
-    return x; \
-  } \
-  *p = __gen_ocl_internal_floor(x); \
-  if (isinf(x)) { \
-    if (x > 0) \
-      return +0.; \
-    return -0.; \
-  } \
-  return __gen_ocl_internal_fmin(x - *p, 0x1.FFFFFep-1F);
-INLINE_OVERLOADABLE float fract(float x, global float *p) { FRACT_BODY; }
-INLINE_OVERLOADABLE float fract(float x, local float *p) { FRACT_BODY; }
-INLINE_OVERLOADABLE float fract(float x, private float *p) { FRACT_BODY; }
-#undef FRACT_BODY
-
-/* Shared body of the remquo() overloads (one per address space of quo).
- * Works on the raw bit patterns of |x| (hx) and |y| (hy):
- *   - magnitudes below 0x00800000 (denormals) are treated as zero;
- *   - y == 0, x infinite/NaN or y NaN -> *quo = 0, returns NAN;
- *   - y infinite or x == 0            -> *quo = 0, returns x;
- *   - |x| == |y|                      -> *quo = 1 or -1, returns a signed zero;
- *   - otherwise a shift-and-subtract loop produces the quotient bits q and
- *     the remainder; the code after the fixup label bumps q (and subtracts
- *     y) when the remainder exceeds half of |y|, or equals it with q odd,
- *     then stores the low 7 bits of q, negated when x and y differ in sign.
- * NOTE(review): the second `hx==hy` test below can never be true, because
- * the same condition has already returned just above it. */
-#define BODY \
-  float Zero[2]; \
-  int n,hx,hy,hz,ix,iy,sx,i,sy; \
-  uint q,sxy; \
-  Zero[0] = 0.0;Zero[1] = -0.0; \
-  GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_GET_FLOAT_WORD(hy,y); \
-  sxy = (hx ^ hy) & 0x80000000;sx = hx&0x80000000;sy = hy&0x80000000; \
-  hx ^=sx; hy &= 0x7fffffff; \
-  if (hx < 0x00800000)hx = 0;if (hy < 0x00800000)hy = 0; \
-  if(hy==0||hx>=0x7f800000||hy>0x7f800000){ \
-    *quo = 0;return NAN; \
-  } \
-  if( hy == 0x7F800000 || hx == 0 ) { \
-    *quo = 0;return x; \
-  } \
-  if( hx == hy ) { \
-    *quo = (x == y) ? 1 : -1; \
-    return sx ? -0.0 : 0.0; \
-  } \
-  if(hx<hy) { \
-    q = 0; \
-    goto fixup; \
-  } else if(hx==hy) { \
-    *quo = (sxy ? -1 : 1); \
-    return Zero[(uint)sx>>31]; \
-  } \
-  ix = (hx>>23)-127; \
-  iy = (hy>>23)-127; \
-  hx = 0x00800000|(0x007fffff&hx); \
-  hy = 0x00800000|(0x007fffff&hy); \
-  n = ix - iy; \
-  q = 0; \
-  while(n--) { \
-    hz=hx-hy; \
-    if(hz<0) hx = hx << 1; \
-    else {hx = hz << 1; q++;} \
-    q <<= 1; \
-  } \
-  hz=hx-hy; \
-  if(hz>=0) {hx=hz;q++;} \
-  if(hx==0) { \
-    q &= 0x0000007f; \
-    *quo = (sxy ? -q : q); \
-    return Zero[(uint)sx>>31]; \
-  } \
-  while(hx<0x00800000) { \
-    hx <<= 1;iy -= 1; \
-  } \
-  if(iy>= -126) { \
-    hx = ((hx-0x00800000)|((iy+127)<<23)); \
-  } else {\
-    n = -126 - iy; \
-    hx >>= n; \
-  } \
-fixup: \
-  GEN_OCL_SET_FLOAT_WORD(x,hx); \
-  if(hx<0x00800000){ \
-    GEN_OCL_GET_FLOAT_WORD(hy,y); \
-    hy &= 0x7fffffff; \
-    if(hx+hx > hy ||(hx+hx==hy && (q & 1)))q++; \
-    x = 0; \
-  }else{ \
-    y = __gen_ocl_fabs(y); \
-    if (y < 0x1p-125f) { \
-      if (x+x>y || (x+x==y && (q & 1))) { \
-        q++;x-=y; \
-      } \
-    }else if (x>0.5f*y || (x==0.5f*y && (q & 1))) { \
-      q++;x-=y; \
-    } \
-    GEN_OCL_GET_FLOAT_WORD(hx,x);GEN_OCL_SET_FLOAT_WORD(x,hx^sx); \
-  } \
-  int sign = sx==sy?0:1; \
-  q &= 0x0000007f; \
-  *quo = (sign ? -q : q); \
-  return x;
-
-INLINE_OVERLOADABLE float remquo(float x, float y, global int *quo) {
-	BODY;
-}
-INLINE_OVERLOADABLE float remquo(float x, float y, local int *quo) { BODY; }
-INLINE_OVERLOADABLE float remquo(float x, float y, private int *quo) { BODY; }
-#undef BODY
-/* native_divide(x, y): plain x / y. */
-INLINE_OVERLOADABLE float native_divide(float x, float y) {
-  const float quotient = x / y;
-  return quotient;
-}
-/* pown(x, n): x raised to the integer power n. 0**0 is defined as 1 here;
- * everything else is delegated to powr().
- * NOTE(review): powr() is only specified for x >= 0 in OpenCL -- confirm
- * that this powr handles negative x before relying on pown for it. */
-INLINE_OVERLOADABLE float pown(float x, int n) {
-  const bool zero_pow_zero = (x == 0) && (n == 0);
-  if (zero_pow_zero)
-    return 1;
-  return powr(x, n);
-}
-
-/* internal_rootn(x, n, isFastpath): the n-th root of x, evaluated as
- * pow(|x|, 1/n) with the sign restored for negative x and odd n.
- * isFastpath selects __gen_ocl_pow over __gen_ocl_internal_pow. */
-INLINE_OVERLOADABLE float internal_rootn(float x, int n, const bool isFastpath)
-{
-  if (n == 0)
-    return NAN;
-  const int n_is_odd = n & 1;
-  /* rootn(x, n) is NaN for x < 0 and even n. */
-  if (x < 0 && !n_is_odd)
-    return NAN;
-  if (x == 0.0) {
-    if (n > 0) {
-      /* +-0 for odd n > 0, +0 for even n > 0. */
-      if (n_is_odd)
-        return x;
-      return 0.0f;
-    }
-    /* +-inf for odd n < 0, +inf for even n < 0. */
-    if (n_is_odd)
-      return __gen_ocl_internal_copysign(INFINITY, x);
-    return INFINITY;
-  }
-  const float magnitude = __gen_ocl_fabs(x);
-  const int negate = (x < 0.0f && n_is_odd) ? 1 : 0;
-  float root;
-  if (isFastpath)
-    root = __gen_ocl_pow(magnitude, 1.f/n);
-  else
-    root = __gen_ocl_internal_pow(magnitude, 1.f/n);
-  return negate ? -root : root;
-}
-
-/* rootn(x, n): n-th root of x through the non-fast path of internal_rootn. */
-INLINE_OVERLOADABLE float rootn(float x, int n) {
-  const bool use_fast_path = 0;
-  return internal_rootn(x, n, use_fast_path);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Geometric functions (see 6.11.5 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-/* dot(p0, p1): sum of component-wise products, accumulated left to right. */
-INLINE_OVERLOADABLE float dot(float p0, float p1) {
-  return p0 * p1;
-}
-INLINE_OVERLOADABLE float dot(float2 p0, float2 p1) {
-  return p0.s0 * p1.s0 + p0.s1 * p1.s1;
-}
-INLINE_OVERLOADABLE float dot(float3 p0, float3 p1) {
-  return p0.s0 * p1.s0 + p0.s1 * p1.s1 + p0.s2 * p1.s2;
-}
-INLINE_OVERLOADABLE float dot(float4 p0, float4 p1) {
-  return p0.s0 * p1.s0 + p0.s1 * p1.s1 + p0.s2 * p1.s2 + p0.s3 * p1.s3;
-}
-/* length(x): Euclidean norm. The scalar case is just |x|. */
-INLINE_OVERLOADABLE float length(float x) {
-  return __gen_ocl_fabs(x);
-}
-/* Shared tail of the vector overloads. m is the largest component
- * magnitude: 0 and infinity short-circuit, otherwise the vector is divided
- * by max(m, 1) before the dot product and the norm is scaled back by it. */
-#define LENGTH_BODY \
-  if(m == 0) \
-    return 0; \
-  if(isinf(m)) \
-    return INFINITY; \
-  m = (m < 1) ? 1 : m; \
-  x = x / m; \
-  return m * sqrt(dot(x,x));
-INLINE_OVERLOADABLE float length(float2 x) {
-  float m = max(__gen_ocl_fabs(x.s0), __gen_ocl_fabs(x.s1));
-  LENGTH_BODY;
-}
-INLINE_OVERLOADABLE float length(float3 x) {
-  const float tail = max(__gen_ocl_fabs(x.s1), __gen_ocl_fabs(x.s2));
-  float m = max(__gen_ocl_fabs(x.s0), tail);
-  LENGTH_BODY;
-}
-INLINE_OVERLOADABLE float length(float4 x) {
-  const float tail23 = max(__gen_ocl_fabs(x.s2), __gen_ocl_fabs(x.s3));
-  const float tail = max(__gen_ocl_fabs(x.s1), tail23);
-  float m = max(__gen_ocl_fabs(x.s0), tail);
-  LENGTH_BODY;
-}
-#undef LENGTH_BODY
-/* distance(x, y): length of the difference x - y. */
-INLINE_OVERLOADABLE float distance(float x, float y) {
-  return length(x - y);
-}
-INLINE_OVERLOADABLE float distance(float2 x, float2 y) {
-  return length(x - y);
-}
-INLINE_OVERLOADABLE float distance(float3 x, float3 y) {
-  return length(x - y);
-}
-INLINE_OVERLOADABLE float distance(float4 x, float4 y) {
-  return length(x - y);
-}
-/* normalize(x) for a scalar: NaN for NaN, 0 for either zero, otherwise +1 or
- * -1 according to the sign bit of x.
- * Both +0 and -0 map to 0, matching the vector overloads, which return 0
- * whenever the length is 0. Testing the raw bits against 0 only would let
- * -0 (0x80000000) fall through and come back as -1. */
-INLINE_OVERLOADABLE float normalize(float x) {
-  union { float f; unsigned u; } u;
-  u.f = x;
-  if((u.u & 0x7fffffffu) == 0) /* +0 or -0 */
-    return 0.f;
-  if(isnan(x))
-    return NAN;
-  return (u.u & 0x80000000u) ? -1.f : 1.f;
-}
-/* normalize(x) for vectors: x divided by its length, or 0 when the length
- * is 0. */
-INLINE_OVERLOADABLE float2 normalize(float2 x) {
-  const float len = length(x);
-  if(len != 0)
-    return x / len;
-  return 0;
-}
-INLINE_OVERLOADABLE float3 normalize(float3 x) {
-  const float len = length(x);
-  if(len != 0)
-    return x / len;
-  return 0;
-}
-INLINE_OVERLOADABLE float4 normalize(float4 x) {
-  const float len = length(x);
-  if(len != 0)
-    return x / len;
-  return 0;
-}
-
-/* fast_length(x): sqrt(dot(x, x)) without any rescaling; |x| for scalars. */
-INLINE_OVERLOADABLE float fast_length(float x) {
-  return __gen_ocl_fabs(x);
-}
-INLINE_OVERLOADABLE float fast_length(float2 x) {
-  const float squared = dot(x, x);
-  return sqrt(squared);
-}
-INLINE_OVERLOADABLE float fast_length(float3 x) {
-  const float squared = dot(x, x);
-  return sqrt(squared);
-}
-INLINE_OVERLOADABLE float fast_length(float4 x) {
-  const float squared = dot(x, x);
-  return sqrt(squared);
-}
-/* fast_distance(x, y): fast_length(x - y), as the OpenCL specification
- * defines it. It must use fast_length rather than length so that the fast_*
- * family stays on the cheap sqrt(dot()) path. */
-INLINE_OVERLOADABLE float fast_distance(float x, float y) { return fast_length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float2 x, float2 y) { return fast_length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float3 x, float3 y) { return fast_length(x-y); }
-INLINE_OVERLOADABLE float fast_distance(float4 x, float4 y) { return fast_length(x-y); }
-/* fast_normalize(x): x * rsqrt(dot(x, x)) for vectors; for a scalar, 1, -1
- * or 0 depending on how x compares with 0. */
-INLINE_OVERLOADABLE float fast_normalize(float x) {
-  if (x > 0)
-    return 1.f;
-  if (x < 0)
-    return -1.f;
-  return 0.f;
-}
-INLINE_OVERLOADABLE float2 fast_normalize(float2 x) {
-  return x * rsqrt(dot(x, x));
-}
-INLINE_OVERLOADABLE float3 fast_normalize(float3 x) {
-  return x * rsqrt(dot(x, x));
-}
-INLINE_OVERLOADABLE float4 fast_normalize(float4 x) {
-  return x * rsqrt(dot(x, x));
-}
-
-/* cross(v0, v1): 3-component cross product written with swizzles. The
- * float4 overload uses the xyz parts only and sets the w component to 0. */
-INLINE_OVERLOADABLE float3 cross(float3 v0, float3 v1) {
-  return v0.yzx * v1.zxy - v0.zxy * v1.yzx;
-}
-INLINE_OVERLOADABLE float4 cross(float4 v0, float4 v1) {
-  return (float4)(v0.yzx * v1.zxy - v0.zxy * v1.yzx, 0.f);
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Vector loads and stores
-/////////////////////////////////////////////////////////////////////////////
-
-// These loads and stores use untyped reads and writes, so we can simply cast
-// the scalar pointer to a vector pointer. This is not C99 compliant because of
-// aliasing, but that is harmless here: TBAA is not activated in the compiler.
-/* vloadN for address space SPACE: reinterprets the DIM scalars starting at
- * p + DIM * offset as one TYPE##DIM value. */
-#define DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
-INLINE_OVERLOADABLE TYPE##DIM vload##DIM(size_t offset, const SPACE TYPE *p) { \
-  return *(SPACE TYPE##DIM *) (p + DIM * offset); \
-}
-
-/* vloadN + vstoreN pair; the load is the read-only variant above. */
-#define DECL_UNTYPED_RW_SPACE_N(TYPE, DIM, SPACE) \
-DECL_UNTYPED_RD_SPACE_N(TYPE, DIM, SPACE) \
-INLINE_OVERLOADABLE void vstore##DIM(TYPE##DIM v, size_t offset, SPACE TYPE *p) { \
-  *(SPACE TYPE##DIM *) (p + DIM * offset) = v; \
-}
-
-/* vload3: three consecutive scalars starting at p + 3 * offset, read one
- * element at a time. */
-#define DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
-INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
-  return (TYPE##3)(*(p + 3 * offset), *(p+ 3 * offset + 1), *(p + 3 * offset + 2));\
-}
-
-/* vstore3 (element by element) + the vload3 above. */
-#define DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
-INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
-  *(p + 3 * offset) = v.s0; \
-  *(p + 3 * offset + 1) = v.s1; \
-  *(p + 3 * offset + 2) = v.s2; \
-} \
-DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE)
-
-/* Every read-only vector width for one address space. */
-#define DECL_UNTYPED_RD_ALL_SPACE(TYPE, SPACE) \
-  DECL_UNTYPED_RD_SPACE_N(TYPE, 2, SPACE) \
-  DECL_UNTYPED_RDV3_SPACE(TYPE, SPACE) \
-  DECL_UNTYPED_RD_SPACE_N(TYPE, 4, SPACE) \
-  DECL_UNTYPED_RD_SPACE_N(TYPE, 8, SPACE) \
-  DECL_UNTYPED_RD_SPACE_N(TYPE, 16, SPACE)
-
-/* Every read/write vector width for one address space. */
-#define DECL_UNTYPED_RW_ALL_SPACE(TYPE, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 2, SPACE) \
-  DECL_UNTYPED_V3_SPACE(TYPE, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 4, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 8, SPACE) \
-  DECL_UNTYPED_RW_SPACE_N(TYPE, 16, SPACE)
-
-/* All address spaces for one TYPE; __constant only gets the loads. */
-#define DECL_UNTYPED_RW_ALL(TYPE) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __global) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __local) \
-  DECL_UNTYPED_RD_ALL_SPACE(TYPE, __constant) \
-  DECL_UNTYPED_RW_ALL_SPACE(TYPE, __private)
-
-/* Element-wise vector loads for one TYPE and address space.
- * vload2 / vload3 read the scalars one by one; each wider load is assembled
- * from two loads of half the width. Passing 2*offset to the half-width load
- * (whose own stride is half as large) keeps the base at N*offset, and the
- * second half starts N/2 elements later (p+2, p+4, p+8). */
-#define DECL_BYTE_RD_SPACE(TYPE, SPACE) \
-INLINE_OVERLOADABLE TYPE##2 vload2(size_t offset, const SPACE TYPE *p) { \
-  return (TYPE##2)(*(p+2*offset), *(p+2*offset+1)); \
-} \
-INLINE_OVERLOADABLE TYPE##3 vload3(size_t offset, const SPACE TYPE *p) { \
-  return (TYPE##3)(*(p+3*offset), *(p+3*offset+1), *(p+3*offset+2)); \
-} \
-INLINE_OVERLOADABLE TYPE##4 vload4(size_t offset, const SPACE TYPE *p) { \
-  return (TYPE##4)(vload2(2*offset, p), vload2(2*offset, p+2)); \
-} \
-INLINE_OVERLOADABLE TYPE##8 vload8(size_t offset, const SPACE TYPE *p) { \
-  return (TYPE##8)(vload4(2*offset, p), vload4(2*offset, p+4)); \
-} \
-INLINE_OVERLOADABLE TYPE##16 vload16(size_t offset, const SPACE TYPE *p) { \
-  return (TYPE##16)(vload8(2*offset, p), vload8(2*offset, p+8)); \
-}
-
-/* Element-wise vector stores, mirroring the loads above: vstore2 / vstore3
- * write scalar by scalar and each wider store splits v into its .lo and .hi
- * halves. */
-#define DECL_BYTE_WR_SPACE(TYPE, SPACE) \
-INLINE_OVERLOADABLE void vstore2(TYPE##2 v, size_t offset, SPACE TYPE *p) {\
-  *(p + 2 * offset) = v.s0; \
-  *(p + 2 * offset + 1) = v.s1; \
-} \
-INLINE_OVERLOADABLE void vstore3(TYPE##3 v, size_t offset, SPACE TYPE *p) {\
-  *(p + 3 * offset) = v.s0; \
-  *(p + 3 * offset + 1) = v.s1; \
-  *(p + 3 * offset + 2) = v.s2; \
-} \
-INLINE_OVERLOADABLE void vstore4(TYPE##4 v, size_t offset, SPACE TYPE *p) { \
-  vstore2(v.lo, 2*offset, p); \
-  vstore2(v.hi, 2*offset, p+2); \
-} \
-INLINE_OVERLOADABLE void vstore8(TYPE##8 v, size_t offset, SPACE TYPE *p) { \
-  vstore4(v.lo, 2*offset, p); \
-  vstore4(v.hi, 2*offset, p+4); \
-} \
-INLINE_OVERLOADABLE void vstore16(TYPE##16 v, size_t offset, SPACE TYPE *p) { \
-  vstore8(v.lo, 2*offset, p); \
-  vstore8(v.hi, 2*offset, p+8); \
-}
-
-/* Loads for all four address spaces; stores for all but __constant. */
-#define DECL_BYTE_RW_ALL(TYPE) \
-  DECL_BYTE_RD_SPACE(TYPE, __global) \
-  DECL_BYTE_RD_SPACE(TYPE, __local) \
-  DECL_BYTE_RD_SPACE(TYPE, __private) \
-  DECL_BYTE_RD_SPACE(TYPE, __constant) \
-  DECL_BYTE_WR_SPACE(TYPE, __global) \
-  DECL_BYTE_WR_SPACE(TYPE, __local) \
-  DECL_BYTE_WR_SPACE(TYPE, __private)
-
-/* Instantiate vloadN / vstoreN: 8- and 16-bit types use the element-wise
- * implementation, every wider type the pointer-cast implementation. */
-DECL_BYTE_RW_ALL(char)
-DECL_BYTE_RW_ALL(uchar)
-DECL_BYTE_RW_ALL(short)
-DECL_BYTE_RW_ALL(ushort)
-DECL_UNTYPED_RW_ALL(int)
-DECL_UNTYPED_RW_ALL(uint)
-DECL_UNTYPED_RW_ALL(long)
-DECL_UNTYPED_RW_ALL(ulong)
-DECL_UNTYPED_RW_ALL(float)
-DECL_UNTYPED_RW_ALL(double)
-
-/* The generator macros are not needed past this point. */
-#undef DECL_UNTYPED_RW_ALL
-#undef DECL_UNTYPED_RW_ALL_SPACE
-#undef DECL_UNTYPED_RD_ALL_SPACE
-#undef DECL_UNTYPED_RW_SPACE_N
-#undef DECL_UNTYPED_RD_SPACE_N
-#undef DECL_UNTYPED_V3_SPACE
-#undef DECL_UNTYPED_RDV3_SPACE
-#undef DECL_BYTE_RD_SPACE
-#undef DECL_BYTE_WR_SPACE
-#undef DECL_BYTE_RW_ALL
-
-PURE CONST float __gen_ocl_f16to32(short h);
-PURE CONST short __gen_ocl_f32to16(float f);
-
-/* float -> half bit pattern, adjusted upward (the _rtp store variant).
- * Converts with __gen_ocl_f32to16, converts the result back (con) and, when
- * con ended up below f, moves the bit pattern by one: +1 when signbit(f) is
- * 0, -1 when it is 1.
- * NOTE(review): relies on scalar signbit() returning exactly 0 or 1. */
-INLINE_OVERLOADABLE short f32to16_rtp(float f) {
-  short s = __gen_ocl_f32to16(f);
-  float con = __gen_ocl_f16to32(s);
-  //if(isinf(con)) return s;
-  if (f > con)
-    return s - signbit(f) * 2 + 1;
-  else
-    return s;
-}
-
-/* float -> half bit pattern, adjusted downward (the _rtn store variant).
- * Converts with __gen_ocl_f32to16, converts the result back (con) and, when
- * con ended up above f, moves the bit pattern by one: -1 when signbit(f) is
- * 0, +1 when it is 1.
- * NOTE(review): relies on scalar signbit() returning exactly 0 or 1. */
-INLINE_OVERLOADABLE short f32to16_rtn(float f) {
-  short s = __gen_ocl_f32to16(f);
-  float con = __gen_ocl_f16to32(s);
-  //if(isinf(con)) return s;
-  if (con > f)
-    return s + signbit(f) * 2 - 1;
-  else
-    return s;
-}
-
-/* float -> half bit pattern, adjusted toward zero (the _rtz store variant).
- * Converts with __gen_ocl_f32to16, converts the result back (con) and, when
- * con lies further from zero than f on f's side (above a non-negative f or
- * below a negative f), decrements the bit pattern by one. */
-INLINE_OVERLOADABLE short f32to16_rtz(float f) {
-  short s = __gen_ocl_f32to16(f);
-  float con = __gen_ocl_f16to32(s);
-  //if(isinf(con)) return s;
-  if (((con > f) && !signbit(f)) ||
-      ((con < f) && signbit(f)))
-    return s - 1;
-  else
-    return s;
-}
-
-/* vload_half* for one address space.
- * vload_half reads the 16-bit pattern at p + offset and widens it with
- * __gen_ocl_f16to32. vload_half2 / vload_half3 read consecutive elements
- * starting at offset*2 / offset*3; vloada_half3 uses a stride of 4 elements
- * instead of 3. The 4-, 8- and 16-wide loads are each assembled from two
- * loads of half the width. */
-#define DECL_HALF_LD_SPACE(SPACE) \
-INLINE_OVERLOADABLE float vload_half(size_t offset, const SPACE half *p) { \
-  return __gen_ocl_f16to32(*(SPACE short *)(p + offset)); \
-} \
-INLINE_OVERLOADABLE float2 vload_half2(size_t offset, const SPACE half *p) { \
-  return (float2)(vload_half(offset*2, p), \
-                  vload_half(offset*2 + 1, p)); \
-} \
-INLINE_OVERLOADABLE float3 vload_half3(size_t offset, const SPACE half *p) { \
-  return (float3)(vload_half(offset*3, p), \
-                  vload_half(offset*3 + 1, p), \
-                  vload_half(offset*3 + 2, p)); \
-} \
-INLINE_OVERLOADABLE float3 vloada_half3(size_t offset, const SPACE half *p) { \
-  return (float3)(vload_half(offset*4, p), \
-                  vload_half(offset*4 + 1, p), \
-                  vload_half(offset*4 + 2, p)); \
-} \
-INLINE_OVERLOADABLE float4 vload_half4(size_t offset, const SPACE half *p) { \
-  return (float4)(vload_half2(offset*2, p), \
-                  vload_half2(offset*2 + 1, p)); \
-} \
-INLINE_OVERLOADABLE float8 vload_half8(size_t offset, const SPACE half *p) { \
-  return (float8)(vload_half4(offset*2, p), \
-                  vload_half4(offset*2 + 1, p)); \
-} \
-INLINE_OVERLOADABLE float16 vload_half16(size_t offset, const SPACE half *p) { \
-  return (float16)(vload_half8(offset*2, p), \
-                   vload_half8(offset*2 + 1, p)); \
-}
-
-/* vstore_half* / vstorea_half* for one address space and one rounding
- * suffix ROUND. FUNC converts a float to the 16-bit pattern that is written
- * at p + offset. Vector stores are split into stores of half the width (or
- * into scalar stores for the 2- and 3-wide cases). Each vstorea_* variant
- * forwards to the matching vstore_*, except vstorea_half3, which uses a
- * stride of 4 elements instead of 3. */
-#define DECL_HALF_ST_SPACE_ROUND(SPACE, ROUND, FUNC) \
-INLINE_OVERLOADABLE void vstore_half##ROUND(float data, size_t offset, SPACE half *p) { \
-  *(SPACE short *)(p + offset) = FUNC(data); \
-} \
-INLINE_OVERLOADABLE void vstorea_half##ROUND(float data, size_t offset, SPACE half *p) { \
-  vstore_half##ROUND(data, offset, p); \
-} \
-INLINE_OVERLOADABLE void vstore_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
-  vstore_half##ROUND(data.lo, offset*2, p); \
-  vstore_half##ROUND(data.hi, offset*2 + 1, p); \
-} \
-INLINE_OVERLOADABLE void vstorea_half2##ROUND(float2 data, size_t offset, SPACE half *p) { \
-  vstore_half2##ROUND(data, offset, p); \
-} \
-INLINE_OVERLOADABLE void vstore_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
-  vstore_half##ROUND(data.s0, offset*3, p); \
-  vstore_half##ROUND(data.s1, offset*3 + 1, p); \
-  vstore_half##ROUND(data.s2, offset*3 + 2, p); \
-} \
-INLINE_OVERLOADABLE void vstorea_half3##ROUND(float3 data, size_t offset, SPACE half *p) { \
-  vstore_half##ROUND(data.s0, offset*4, p); \
-  vstore_half##ROUND(data.s1, offset*4 + 1, p); \
-  vstore_half##ROUND(data.s2, offset*4 + 2, p); \
-} \
-INLINE_OVERLOADABLE void vstore_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
-  vstore_half2##ROUND(data.lo, offset*2, p); \
-  vstore_half2##ROUND(data.hi, offset*2 + 1, p); \
-} \
-INLINE_OVERLOADABLE void vstorea_half4##ROUND(float4 data, size_t offset, SPACE half *p) { \
-  vstore_half4##ROUND(data, offset, p); \
-} \
-INLINE_OVERLOADABLE void vstore_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
-  vstore_half4##ROUND(data.lo, offset*2, p); \
-  vstore_half4##ROUND(data.hi, offset*2 + 1, p); \
-} \
-INLINE_OVERLOADABLE void vstorea_half8##ROUND(float8 data, size_t offset, SPACE half *p) { \
-  vstore_half8##ROUND(data, offset, p); \
-} \
-INLINE_OVERLOADABLE void vstore_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
-  vstore_half8##ROUND(data.lo, offset*2, p); \
-  vstore_half8##ROUND(data.hi, offset*2 + 1, p); \
-} \
-INLINE_OVERLOADABLE void vstorea_half16##ROUND(float16 data, size_t offset, SPACE half *p) { \
-  vstore_half16##ROUND(data, offset, p); \
-}
-
-/* All half stores for one address space: the unsuffixed and _rte variants
- * use the plain __gen_ocl_f32to16 conversion, the _rtz / _rtp / _rtn
- * variants the software-adjusted conversions.
- * The last line deliberately carries no line continuation, so the macro
- * does not depend on the following line being blank. */
-#define DECL_HALF_ST_SPACE(SPACE) \
-  DECL_HALF_ST_SPACE_ROUND(SPACE,  , __gen_ocl_f32to16) \
-  DECL_HALF_ST_SPACE_ROUND(SPACE, _rte, __gen_ocl_f32to16) \
-  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtz, f32to16_rtz) \
-  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtp, f32to16_rtp) \
-  DECL_HALF_ST_SPACE_ROUND(SPACE, _rtn, f32to16_rtn)
-
-/* Half loads exist for every address space; half stores for all but
- * __constant. */
-DECL_HALF_LD_SPACE(__global)
-DECL_HALF_LD_SPACE(__local)
-DECL_HALF_LD_SPACE(__constant)
-DECL_HALF_LD_SPACE(__private)
-
-DECL_HALF_ST_SPACE(__global)
-DECL_HALF_ST_SPACE(__local)
-DECL_HALF_ST_SPACE(__private)
-
-//#undef DECL_UNTYPED_RW_ALL_SPACE
-#undef DECL_HALF_LD_SPACE
-#undef DECL_HALF_ST_SPACE
-#undef DECL_HALF_ST_SPACE_ROUND
-
-/* The aligned loads share the implementation of the unaligned ones for
- * every width listed here; vloada_half3 is a real function defined by
- * DECL_HALF_LD_SPACE and is therefore not aliased. */
-#define vloada_half vload_half
-#define vloada_half2 vload_half2
-#define vloada_half4 vload_half4
-#define vloada_half8 vload_half8
-#define vloada_half16 vload_half16
-
-// XXX workaround ptx profile
-// Map the standard math names onto this library's implementations. Most go
-// to a __gen_ocl_internal_* function; exp2 goes to native_exp2 and fma to mad.
-// NOTE(review): exp2 -> native_exp2 and fma -> mad presumably trade accuracy
-// for speed -- confirm this is acceptable for the precision required.
-#define fabs __gen_ocl_internal_fabs
-#define trunc __gen_ocl_internal_trunc
-#define round __gen_ocl_internal_round
-#define floor __gen_ocl_internal_floor
-#define ceil __gen_ocl_internal_ceil
-#define log __gen_ocl_internal_log
-#define log2 __gen_ocl_internal_log2
-#define log10 __gen_ocl_internal_log10
-#define exp __gen_ocl_internal_exp
-#define exp2 native_exp2
-#define exp10 __gen_ocl_internal_exp10
-#define expm1 __gen_ocl_internal_expm1
-#define fmin __gen_ocl_internal_fmin
-#define fmax __gen_ocl_internal_fmax
-#define fma mad
-#define fdim __gen_ocl_internal_fdim
-#define maxmag __gen_ocl_internal_maxmag
-#define minmag __gen_ocl_internal_minmag
-
-/////////////////////////////////////////////////////////////////////////////
-// Miscellaneous Vector Functions (see 6.11.12 of OCL 1.1 spec)
-/////////////////////////////////////////////////////////////////////////////
-/* One result lane of shuffle(): y.s<LANE> receives the element of x that
- * mask.s<LANE> selects, with the index wrapped to vec_step(x) - 1.
- * Expects x, y and mask to be in scope at the point of expansion. */
-#define SHUFFLE_LANE(TYPE, LANE) \
-    y.s##LANE = ((TYPE *) &x)[mask.s##LANE & (vec_step(x) - 1)];
-
-/* shuffle() producing 2 lanes from a source vector of type XTYPE. */
-#define DEC2(TYPE, XTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##2 shuffle(XTYPE x, MASKTYPE##2 mask) { \
-    TYPE##2 y; \
-    SHUFFLE_LANE(TYPE, 0) \
-    SHUFFLE_LANE(TYPE, 1) \
-    return y; \
-  }
-
-/* shuffle() producing 4 lanes. */
-#define DEC4(TYPE, XTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##4 shuffle(XTYPE x, MASKTYPE##4 mask) { \
-    TYPE##4 y; \
-    SHUFFLE_LANE(TYPE, 0) \
-    SHUFFLE_LANE(TYPE, 1) \
-    SHUFFLE_LANE(TYPE, 2) \
-    SHUFFLE_LANE(TYPE, 3) \
-    return y; \
-  }
-
-/* shuffle() producing 8 lanes. */
-#define DEC8(TYPE, XTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##8 shuffle(XTYPE x, MASKTYPE##8 mask) { \
-    TYPE##8 y; \
-    SHUFFLE_LANE(TYPE, 0) \
-    SHUFFLE_LANE(TYPE, 1) \
-    SHUFFLE_LANE(TYPE, 2) \
-    SHUFFLE_LANE(TYPE, 3) \
-    SHUFFLE_LANE(TYPE, 4) \
-    SHUFFLE_LANE(TYPE, 5) \
-    SHUFFLE_LANE(TYPE, 6) \
-    SHUFFLE_LANE(TYPE, 7) \
-    return y; \
-  }
-
-/* shuffle() producing 16 lanes. */
-#define DEC16(TYPE, XTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##16 shuffle(XTYPE x, MASKTYPE##16 mask) { \
-    TYPE##16 y; \
-    SHUFFLE_LANE(TYPE, 0) \
-    SHUFFLE_LANE(TYPE, 1) \
-    SHUFFLE_LANE(TYPE, 2) \
-    SHUFFLE_LANE(TYPE, 3) \
-    SHUFFLE_LANE(TYPE, 4) \
-    SHUFFLE_LANE(TYPE, 5) \
-    SHUFFLE_LANE(TYPE, 6) \
-    SHUFFLE_LANE(TYPE, 7) \
-    SHUFFLE_LANE(TYPE, 8) \
-    SHUFFLE_LANE(TYPE, 9) \
-    SHUFFLE_LANE(TYPE, a) \
-    SHUFFLE_LANE(TYPE, b) \
-    SHUFFLE_LANE(TYPE, c) \
-    SHUFFLE_LANE(TYPE, d) \
-    SHUFFLE_LANE(TYPE, e) \
-    SHUFFLE_LANE(TYPE, f) \
-    return y; \
-  }
-
-/* Every (source width, result width) combination for one mask element type. */
-#define DEFMASK(TYPE, MASKTYPE) \
-  DEC2(TYPE, TYPE##2, MASKTYPE) \
-  DEC2(TYPE, TYPE##4, MASKTYPE) \
-  DEC2(TYPE, TYPE##8, MASKTYPE) \
-  DEC2(TYPE, TYPE##16, MASKTYPE) \
-  DEC4(TYPE, TYPE##2, MASKTYPE) \
-  DEC4(TYPE, TYPE##4, MASKTYPE) \
-  DEC4(TYPE, TYPE##8, MASKTYPE) \
-  DEC4(TYPE, TYPE##16, MASKTYPE) \
-  DEC8(TYPE, TYPE##2, MASKTYPE) \
-  DEC8(TYPE, TYPE##4, MASKTYPE) \
-  DEC8(TYPE, TYPE##8, MASKTYPE) \
-  DEC8(TYPE, TYPE##16, MASKTYPE) \
-  DEC16(TYPE, TYPE##2, MASKTYPE) \
-  DEC16(TYPE, TYPE##4, MASKTYPE) \
-  DEC16(TYPE, TYPE##8, MASKTYPE) \
-  DEC16(TYPE, TYPE##16, MASKTYPE)
-
-/* Every unsigned mask element type for one data TYPE. */
-#define DEF(TYPE) \
-  DEFMASK(TYPE, uchar) \
-  DEFMASK(TYPE, ushort) \
-  DEFMASK(TYPE, uint) \
-  DEFMASK(TYPE, ulong)
-
-DEF(char)
-DEF(uchar)
-DEF(short)
-DEF(ushort)
-DEF(int)
-DEF(uint)
-DEF(float)
-DEF(long)
-DEF(ulong)
-#undef DEF
-#undef DEFMASK
-#undef DEC2
-#undef DEC4
-#undef DEC8
-#undef DEC16
-#undef SHUFFLE_LANE
-
-// shuffle2() with a 2-wide result: concatenate x and y into one vector of
-// type TEMPTYPE and let shuffle() pick the lanes.
-#define DEC2(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##2 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##2 mask) { \
-    TEMPTYPE joined = (TEMPTYPE)(x, y); \
-    return shuffle(joined, mask); \
-  }
-
-// shuffle2() for two 16-wide sources with a 2-wide result. The sources are
-// not concatenated here; each result lane reads x or y directly.
-// Only the 5 low bits of a mask element are significant for 16-wide sources
-// (OpenCL spec, shuffle2): bit 4 selects y over x, bits 0-3 select the lane.
-// Higher bits are ignored, so a mask value of 32 reads x[0], not y[0].
-#define DEC2X(TYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##2 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##2 mask) { \
-    TYPE##2 z; \
-    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
-    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
-    return z; \
-  }
-
-// shuffle2() with a 4-wide result: concatenate x and y into one vector of
-// type TEMPTYPE and let shuffle() pick the lanes.
-#define DEC4(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##4 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##4 mask) { \
-    TEMPTYPE joined = (TEMPTYPE)(x, y); \
-    return shuffle(joined, mask); \
-  }
-
-// shuffle2() for two 16-wide sources with a 4-wide result. The sources are
-// not concatenated here; each result lane reads x or y directly.
-// Only the 5 low bits of a mask element are significant for 16-wide sources
-// (OpenCL spec, shuffle2): bit 4 selects y over x, bits 0-3 select the lane.
-// Higher bits are ignored, so a mask value of 32 reads x[0], not y[0].
-#define DEC4X(TYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##4 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##4 mask) { \
-    TYPE##4 z; \
-    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
-    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
-    z.s2 = (mask.s2 & 16) ? ((TYPE *)&y)[mask.s2 & 15] : ((TYPE *)&x)[mask.s2 & 15]; \
-    z.s3 = (mask.s3 & 16) ? ((TYPE *)&y)[mask.s3 & 15] : ((TYPE *)&x)[mask.s3 & 15]; \
-    return z; \
-  }
-
-// shuffle2() with an 8-wide result: concatenate x and y into one vector of
-// type TEMPTYPE and let shuffle() pick the lanes.
-#define DEC8(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##8 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##8 mask) { \
-    TEMPTYPE joined = (TEMPTYPE)(x, y); \
-    return shuffle(joined, mask); \
-  }
-
-// shuffle2() for two 16-wide sources with an 8-wide result. The sources are
-// not concatenated here; each result lane reads x or y directly.
-// Only the 5 low bits of a mask element are significant for 16-wide sources
-// (OpenCL spec, shuffle2): bit 4 selects y over x, bits 0-3 select the lane.
-// Higher bits are ignored, so a mask value of 32 reads x[0], not y[0].
-#define DEC8X(TYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##8 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##8 mask) { \
-    TYPE##8 z; \
-    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
-    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
-    z.s2 = (mask.s2 & 16) ? ((TYPE *)&y)[mask.s2 & 15] : ((TYPE *)&x)[mask.s2 & 15]; \
-    z.s3 = (mask.s3 & 16) ? ((TYPE *)&y)[mask.s3 & 15] : ((TYPE *)&x)[mask.s3 & 15]; \
-    z.s4 = (mask.s4 & 16) ? ((TYPE *)&y)[mask.s4 & 15] : ((TYPE *)&x)[mask.s4 & 15]; \
-    z.s5 = (mask.s5 & 16) ? ((TYPE *)&y)[mask.s5 & 15] : ((TYPE *)&x)[mask.s5 & 15]; \
-    z.s6 = (mask.s6 & 16) ? ((TYPE *)&y)[mask.s6 & 15] : ((TYPE *)&x)[mask.s6 & 15]; \
-    z.s7 = (mask.s7 & 16) ? ((TYPE *)&y)[mask.s7 & 15] : ((TYPE *)&x)[mask.s7 & 15]; \
-    return z; \
-  }
-
-// shuffle2() with a 16-wide result: concatenate x and y into one vector of
-// type TEMPTYPE and let shuffle() pick the lanes.
-#define DEC16(TYPE, ARGTYPE, TEMPTYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##16 shuffle2(ARGTYPE x, ARGTYPE y, MASKTYPE##16 mask) { \
-    TEMPTYPE joined = (TEMPTYPE)(x, y); \
-    return shuffle(joined, mask); \
-  }
-
-// shuffle2() for two 16-wide sources with a 16-wide result. The sources are
-// not concatenated here; each result lane reads x or y directly.
-// Only the 5 low bits of a mask element are significant for 16-wide sources
-// (OpenCL spec, shuffle2): bit 4 selects y over x, bits 0-3 select the lane.
-// Higher bits are ignored, so a mask value of 32 reads x[0], not y[0].
-#define DEC16X(TYPE, MASKTYPE) \
-  INLINE_OVERLOADABLE TYPE##16 shuffle2(TYPE##16 x, TYPE##16 y, MASKTYPE##16 mask) { \
-    TYPE##16 z; \
-    z.s0 = (mask.s0 & 16) ? ((TYPE *)&y)[mask.s0 & 15] : ((TYPE *)&x)[mask.s0 & 15]; \
-    z.s1 = (mask.s1 & 16) ? ((TYPE *)&y)[mask.s1 & 15] : ((TYPE *)&x)[mask.s1 & 15]; \
-    z.s2 = (mask.s2 & 16) ? ((TYPE *)&y)[mask.s2 & 15] : ((TYPE *)&x)[mask.s2 & 15]; \
-    z.s3 = (mask.s3 & 16) ? ((TYPE *)&y)[mask.s3 & 15] : ((TYPE *)&x)[mask.s3 & 15]; \
-    z.s4 = (mask.s4 & 16) ? ((TYPE *)&y)[mask.s4 & 15] : ((TYPE *)&x)[mask.s4 & 15]; \
-    z.s5 = (mask.s5 & 16) ? ((TYPE *)&y)[mask.s5 & 15] : ((TYPE *)&x)[mask.s5 & 15]; \
-    z.s6 = (mask.s6 & 16) ? ((TYPE *)&y)[mask.s6 & 15] : ((TYPE *)&x)[mask.s6 & 15]; \
-    z.s7 = (mask.s7 & 16) ? ((TYPE *)&y)[mask.s7 & 15] : ((TYPE *)&x)[mask.s7 & 15]; \
-    z.s8 = (mask.s8 & 16) ? ((TYPE *)&y)[mask.s8 & 15] : ((TYPE *)&x)[mask.s8 & 15]; \
-    z.s9 = (mask.s9 & 16) ? ((TYPE *)&y)[mask.s9 & 15] : ((TYPE *)&x)[mask.s9 & 15]; \
-    z.sa = (mask.sa & 16) ? ((TYPE *)&y)[mask.sa & 15] : ((TYPE *)&x)[mask.sa & 15]; \
-    z.sb = (mask.sb & 16) ? ((TYPE *)&y)[mask.sb & 15] : ((TYPE *)&x)[mask.sb & 15]; \
-    z.sc = (mask.sc & 16) ? ((TYPE *)&y)[mask.sc & 15] : ((TYPE *)&x)[mask.sc & 15]; \
-    z.sd = (mask.sd & 16) ? ((TYPE *)&y)[mask.sd & 15] : ((TYPE *)&x)[mask.sd & 15]; \
-    z.se = (mask.se & 16) ? ((TYPE *)&y)[mask.se & 15] : ((TYPE *)&x)[mask.se & 15]; \
-    z.sf = (mask.sf & 16) ? ((TYPE *)&y)[mask.sf & 15] : ((TYPE *)&x)[mask.sf & 15]; \
-    return z; \
-  }
-
-#define DEFMASK(TYPE, MASKTYPE) \
-  DEC2(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
-  DEC2(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
-  DEC2(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
-  DEC2X(TYPE, MASKTYPE) \
-  DEC4(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
-  DEC4(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
-  DEC4(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
-  DEC4X(TYPE, MASKTYPE) \
-  DEC8(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
-  DEC8(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
-  DEC8(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
-  DEC8X(TYPE, MASKTYPE) \
-  DEC16(TYPE, TYPE##2, TYPE##4, MASKTYPE) \
-  DEC16(TYPE, TYPE##4, TYPE##8, MASKTYPE) \
-  DEC16(TYPE, TYPE##8, TYPE##16, MASKTYPE) \
-  DEC16X(TYPE, MASKTYPE)
-
-#define DEF(TYPE) \
-  DEFMASK(TYPE, uchar) \
-  DEFMASK(TYPE, ushort) \
-  DEFMASK(TYPE, uint) \
-  DEFMASK(TYPE, ulong)
-
-DEF(char)
-DEF(uchar)
-DEF(short)
-DEF(ushort)
-DEF(int)
-DEF(uint)
-DEF(float)
-DEF(long)
-DEF(ulong)
-#undef DEF
-#undef DEFMASK
-#undef DEC2
-#undef DEC2X
-#undef DEC4
-#undef DEC4X
-#undef DEC8
-#undef DEC8X
-#undef DEC16
-#undef DEC16X
-/////////////////////////////////////////////////////////////////////////////
-// Synchronization functions
-/////////////////////////////////////////////////////////////////////////////
-// Bit flags accepted by barrier() and the fence functions; they may be OR-ed.
-#define CLK_LOCAL_MEM_FENCE  (1 << 0)
-#define CLK_GLOBAL_MEM_FENCE (1 << 1)
-
-// Barrier builtins, declared here without a body.
-void __gen_ocl_barrier_local(void);
-void __gen_ocl_barrier_global(void);
-void __gen_ocl_barrier_local_and_global(void);
-
-typedef uint cl_mem_fence_flags;
-void barrier(cl_mem_fence_flags flags);
-
-// The three fence functions below have empty bodies: flags is ignored and no
-// code is emitted for them.
-// NOTE(review): confirm that a no-op fence is sufficient for the memory
-// ordering guarantees expected on the target hardware.
-INLINE void mem_fence(cl_mem_fence_flags flags) {
-}
-INLINE void read_mem_fence(cl_mem_fence_flags flags) {
-}
-INLINE void write_mem_fence(cl_mem_fence_flags flags) {
-}
-
-/////////////////////////////////////////////////////////////////////////////
-// Async Copies and prefetch
-/////////////////////////////////////////////////////////////////////////////
-// Shared body of the async copy functions. Expects dst, src and num to be in
-// scope. Each work-item starts at its linear local id and copies one element
-// per round, stepping by the work-group size; after the full rounds it copies
-// one more element if its position is still below num. Always returns 0.
-#define BODY(SRC_STRIDE, DST_STRIDE) \
-  const uint wg_items = get_local_size(2) * get_local_size(1) * get_local_size(0); \
-  uint pos = (get_local_id(2) * get_local_size(1) + get_local_id(1)) \
-             * get_local_size(0) + get_local_id(0); \
-  uint rounds = num / wg_items; \
-  while (rounds != 0) { \
-    dst[pos * DST_STRIDE] = src[pos * SRC_STRIDE]; \
-    pos += wg_items; \
-    rounds -= 1; \
-  } \
-  if (pos < num) \
-    dst[pos * DST_STRIDE] = src[pos * SRC_STRIDE]; \
-  return 0;
-
-// Define the four async copy entry points for one element type: plain and
-// strided, each in the global->local and local->global direction.
-// The copy is performed inline by BODY; the incoming event argument is not
-// used and the returned event is always 0.
-#define DEFN(TYPE) \
-INLINE_OVERLOADABLE event_t async_work_group_copy(local TYPE *dst, \
-                                                  const global TYPE *src, \
-                                                  size_t num, \
-                                                  event_t event) { \
-  BODY(1, 1); \
-} \
-INLINE_OVERLOADABLE event_t async_work_group_copy(global TYPE *dst, \
-                                                  const local TYPE *src, \
-                                                  size_t num, \
-                                                  event_t event) { \
-  BODY(1, 1); \
-} \
-INLINE_OVERLOADABLE event_t async_work_group_strided_copy(local TYPE *dst, \
-                                                          const global TYPE *src, \
-                                                          size_t num, \
-                                                          size_t src_stride, \
-                                                          event_t event) { \
-  BODY(src_stride, 1); \
-} \
-INLINE_OVERLOADABLE event_t async_work_group_strided_copy(global TYPE *dst, \
-                                                          const local TYPE *src, \
-                                                          size_t num, \
-                                                          size_t dst_stride, \
-                                                          event_t event) { \
-  BODY(1, dst_stride); \
-}
-#define DEF(TYPE) \
-  DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16);
-DEF(char)
-DEF(uchar)
-DEF(short)
-DEF(ushort)
-DEF(int)
-DEF(uint)
-DEF(long)
-DEF(ulong)
-DEF(float)
-DEF(double)
-#undef BODY
-#undef DEFN
-#undef DEF
-
-// Wait for async copies by issuing a barrier with both the local and the
-// global fence flags. num_events and event_list are not inspected.
-INLINE void wait_group_events (int num_events, event_t *event_list) {
-  const cl_mem_fence_flags both_fences = CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE;
-  barrier(both_fences);
-}
-
-// prefetch() overloads: the body is empty, so both arguments are ignored.
-#define DEFN(TYPE) \
-INLINE_OVERLOADABLE void prefetch(const global TYPE *p, size_t num) { }
-// One overload per scalar type and per vector width 2, 3, 4, 8 and 16.
-#define DEF(TYPE) \
-DEFN(TYPE); DEFN(TYPE##2); DEFN(TYPE##3); DEFN(TYPE##4); DEFN(TYPE##8); DEFN(TYPE##16)
-DEF(char);
-DEF(uchar);
-DEF(short);
-DEF(ushort);
-DEF(int);
-DEF(uint);
-DEF(long);
-DEF(ulong);
-DEF(float);
-#undef DEFN
-#undef DEF
-
-/////////////////////////////////////////////////////////////////////////////
-// Atomic functions
-/////////////////////////////////////////////////////////////////////////////
-OVERLOADABLE uint __gen_ocl_atomic_add(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_add(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_sub(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_sub(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_and(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_and(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_or(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_or(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xor(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xor(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xchg(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_xchg(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_inc(__global uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_inc(__local uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_dec(__global uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_dec(__local uint *p);
-OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__global uint *p, uint cmp, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_cmpxchg(__local uint *p, uint cmp, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imin(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imin(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imax(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_imax(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umin(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umin(__local uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umax(__global uint *p, uint val);
-OVERLOADABLE uint __gen_ocl_atomic_umax(__local uint *p, uint val);
-
-// Define atomic_<NAME>(p, val) for one type and one address space by
-// forwarding to the uint builtin __gen_ocl_<PREFIX><NAME>. The pointer is cast
-// to (SPACE uint *), which drops the volatile qualifier, and the builtin's
-// result is cast back to TYPE.
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
-    return (TYPE)__gen_ocl_##PREFIX##NAME((SPACE uint *)p, val);            \
-  }
-
-#define DECL_ATOMIC_OP_TYPE(NAME, TYPE, PREFIX) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global, PREFIX) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local, PREFIX)
-
-#define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint, atomic_)        \
-  DECL_ATOMIC_OP_TYPE(NAME, int, atomic_)
-
-DECL_ATOMIC_OP(add)
-DECL_ATOMIC_OP(sub)
-DECL_ATOMIC_OP(and)
-DECL_ATOMIC_OP(or)
-DECL_ATOMIC_OP(xor)
-DECL_ATOMIC_OP(xchg)
-DECL_ATOMIC_OP_TYPE(min, int, atomic_i)
-DECL_ATOMIC_OP_TYPE(max, int, atomic_i)
-DECL_ATOMIC_OP_TYPE(min, uint, atomic_u)
-DECL_ATOMIC_OP_TYPE(max, uint, atomic_u)
-
-#undef DECL_ATOMIC_OP_SPACE
-
-// Float variant: the value is reinterpreted as uint bits with as_uint(), passed
-// to the uint builtin, and the result is reinterpreted with as_float().
-// The body hard-codes as_float, so TYPE is expected to be float.
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE, PREFIX)                        \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE val) { \
-    return as_float(__gen_ocl_##PREFIX##NAME((SPACE uint *)p, as_uint(val))); \
-  }
-DECL_ATOMIC_OP_SPACE(xchg, float, __global, atomic_)
-DECL_ATOMIC_OP_SPACE(xchg, float, __local, atomic_)
-
-#undef DECL_ATOMIC_OP
-#undef DECL_ATOMIC_OP_TYPE
-#undef DECL_ATOMIC_OP_SPACE
-
-// Operand-less variant (used for inc and dec): atomic_<NAME>(p) forwards to
-// __gen_ocl_atomic_<NAME> on the pointer cast to (SPACE uint *) and casts the
-// result back to TYPE.
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE) \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p) { \
-    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p); \
-  }
-
-#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
-
-#define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint) \
-  DECL_ATOMIC_OP_TYPE(NAME, int)
-
-DECL_ATOMIC_OP(inc)
-DECL_ATOMIC_OP(dec)
-
-#undef DECL_ATOMIC_OP
-#undef DECL_ATOMIC_OP_TYPE
-#undef DECL_ATOMIC_OP_SPACE
-
-// Two-operand variant (used for cmpxchg): atomic_<NAME>(p, cmp, val) casts
-// both operands to uint, forwards to __gen_ocl_atomic_<NAME> on the pointer
-// cast to (SPACE uint *), and casts the result back to TYPE.
-#define DECL_ATOMIC_OP_SPACE(NAME, TYPE, SPACE)  \
-  INLINE_OVERLOADABLE TYPE atomic_##NAME (volatile SPACE TYPE *p, TYPE cmp, TYPE val) { \
-    return (TYPE)__gen_ocl_atomic_##NAME((SPACE uint *)p, (uint)cmp, (uint)val); \
-  }
-
-#define DECL_ATOMIC_OP_TYPE(NAME, TYPE) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __global) \
-  DECL_ATOMIC_OP_SPACE(NAME, TYPE, __local)
-
-#define DECL_ATOMIC_OP(NAME) \
-  DECL_ATOMIC_OP_TYPE(NAME, uint) \
-  DECL_ATOMIC_OP_TYPE(NAME, int)
-
-DECL_ATOMIC_OP(cmpxchg)
-
-#undef DECL_ATOMIC_OP
-#undef DECL_ATOMIC_OP_TYPE
-#undef DECL_ATOMIC_OP_SPACE
-
-// XXX for conformance test
-// The following atom_xxx APIs come from the OpenCL 1.0 spec.
-// The conformance test suite still tests them, so alias them to atomic_xxx.
-#define atom_add atomic_add
-#define atom_sub atomic_sub
-#define atom_and atomic_and
-#define atom_or atomic_or
-#define atom_xor atomic_xor
-#define atom_xchg atomic_xchg
-#define atom_min atomic_min
-#define atom_max atomic_max
-#define atom_inc atomic_inc
-#define atom_dec atomic_dec
-#define atom_cmpxchg atomic_cmpxchg
-
-/////////////////////////////////////////////////////////////////////////////
-// Force the compilation to SIMD8 or SIMD16
-/////////////////////////////////////////////////////////////////////////////
-
-int __gen_ocl_force_simd8(void);
-int __gen_ocl_force_simd16(void);
-
-#define NULL ((void*)0)
-
-// ##BEGIN_COMMON_DEFINES##
-// ##END_COMMON_DEFINES##
-
-/////////////////////////////////////////////////////////////////////////////
-// Image access functions
-/////////////////////////////////////////////////////////////////////////////
-
-// 1D read
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, uint sampler_offset);
-
-// 2D & 1D Array read
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, uint sampler_offset);
-
-// 3D & 2D Array read
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE int4 __gen_ocl_read_imagei(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE uint4 __gen_ocl_read_imageui(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, float u, float v, float w, uint sampler_offset);
-OVERLOADABLE float4 __gen_ocl_read_imagef(uint surface_id, sampler_t sampler, int u, int v, int w, uint sampler_offset);
-
-// 1D write
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, float4 color);
-
-// 2D & 1D Array write
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, float4 color);
-
-// 3D & 2D Array write
-OVERLOADABLE void __gen_ocl_write_imagei(uint surface_id, int u, int v, int w, int4 color);
-OVERLOADABLE void __gen_ocl_write_imageui(uint surface_id, int u, int v, int w, uint4 color);
-OVERLOADABLE void __gen_ocl_write_imagef(uint surface_id, int u, int v, int w, float4 color);
-
-int __gen_ocl_get_image_width(uint surface_id);
-int __gen_ocl_get_image_height(uint surface_id);
-int __gen_ocl_get_image_channel_data_type(uint surface_id);
-int __gen_ocl_get_image_channel_order(uint surface_id);
-int __gen_ocl_get_image_depth(uint surface_id);
-/* The printf function. */
-/* From LLVM 3.4 on, C string literals all live in the constant address space. */
-#if 100*__clang_major__ + __clang_minor__ < 304
-int __gen_ocl_printf_stub(const char * format, ...);
-#else
-int __gen_ocl_printf_stub(constant char * format, ...);
-#endif
-#define printf __gen_ocl_printf_stub
-
-// 2D 3D Image Common Macro
-#ifdef GEN7_SAMPLER_CLAMP_BORDER_WORKAROUND
-#define GEN_FIX_1 1
-#else
-#define GEN_FIX_1 0
-#endif
-
-#define GET_IMAGE(cl_image, surface_id) \
-    uint surface_id = (uint)cl_image
-// Turn a float array coordinate into a valid index for a 1D image array:
-// round it with rint(), then clamp it to [0, depth - 1], where depth is the
-// value reported by __gen_ocl_get_image_depth for the image's surface.
-INLINE_OVERLOADABLE float __gen_compute_array_index(const float index, image1d_array_t image)
-{
-  GET_IMAGE(image, sid);
-  const float last_layer = (float)__gen_ocl_get_image_depth(sid) - 1.f;
-  return clamp(rint(index), 0.f, last_layer);
-}
-
-// Turn a float array coordinate into a valid index for a 2D image array:
-// round it with rint(), then clamp it to [0, depth - 1], where depth is the
-// value reported by __gen_ocl_get_image_depth for the image's surface.
-INLINE_OVERLOADABLE float __gen_compute_array_index(float index, image2d_array_t image)
-{
-  GET_IMAGE(image, sid);
-  const float last_layer = (float)__gen_ocl_get_image_depth(sid) - 1.f;
-  return clamp(rint(index), 0.f, last_layer);
-}
-
-// Clamp an integer array coordinate of a 1D image array to [0, depth - 1],
-// where depth is the value reported by __gen_ocl_get_image_depth.
-INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image1d_array_t image)
-{
-  GET_IMAGE(image, sid);
-  const int last_layer = __gen_ocl_get_image_depth(sid) - 1;
-  return clamp(index, 0, last_layer);
-}
-
-// Clamp an integer array coordinate of a 2D image array to [0, depth - 1],
-// where depth is the value reported by __gen_ocl_get_image_depth.
-INLINE_OVERLOADABLE int __gen_compute_array_index(int index, image2d_array_t image)
-{
-  GET_IMAGE(image, sid);
-  const int last_layer = __gen_ocl_get_image_depth(sid) - 1;
-  return clamp(index, 0, last_layer);
-}
-
-// Define read_image<suffix>(image, sampler, coord) for integer coordinates.
-// GET_IMAGE_ARRAY_SIZE and the EXPEND_READ_COORD* helpers are redefined per
-// image dimensionality before each instantiation; for array images they
-// declare and consume the local `ai` (the clamped array index).
-// When int_clamping_fix is set and the sampler uses CLK_ADDRESS_CLAMP with
-// CLK_FILTER_NEAREST, the integer-coordinate builtin is called and the trailing
-// sampler_offset argument comes from EXPEND_READ_COORD. Otherwise the
-// coordinates are converted to float and sampler_offset is 0.
-// The `n` parameter is not used in the body.
-#define DECL_READ_IMAGE0(int_clamping_fix,                                   \
-                        image_type, type, suffix, coord_type, n)             \
-  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               const sampler_t sampler,      \
-                                               coord_type coord)             \
-  {                                                                          \
-    GET_IMAGE(cl_image, surface_id);                                         \
-    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai);                          \
-    if (int_clamping_fix &&                                                  \
-        ((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP) &&             \
-        ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST))               \
-            return   __gen_ocl_read_image ##suffix(                          \
-                        EXPEND_READ_COORD(surface_id, sampler, coord));      \
-    return  __gen_ocl_read_image ##suffix(                                   \
-                    EXPEND_READ_COORDF(surface_id, sampler, coord), 0);      \
-  }
-
-// Define read_image<suffix>(image, sampler, coord) for float coordinates.
-// Only samplers with CLK_ADDRESS_CLAMP and CLK_FILTER_NEAREST take the
-// workaround path:
-//  - float_coord_rounding_fix: for unnormalized coordinates, apply
-//    FIXUP_FLOAT_COORD to a copy of the coordinate;
-//  - int_clamping_fix: denormalize the copy if the sampler uses normalized
-//    coordinates, then read through the integer-coordinate builtin via
-//    EXPEND_READ_COORDI (which maps negative coordinates to -1).
-// Everything else falls through to the float builtin with sampler_offset 0.
-// NOTE(review): intCoord is declared as coord_type (a float type here), so the
-// conversion to int happens inside EXPEND_READ_COORDI, not at the assignment.
-// The `n` parameter is not used in the body.
-#define DECL_READ_IMAGE1(float_coord_rounding_fix, int_clamping_fix,         \
-                        image_type, type, suffix, coord_type, n)             \
-  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               const sampler_t sampler,      \
-                                               coord_type coord)             \
-  {                                                                          \
-    GET_IMAGE(cl_image, surface_id);                                         \
-    GET_IMAGE_ARRAY_SIZE(cl_image, coord, float, ai)                         \
-    coord_type tmpCoord = coord;                                             \
-    if (float_coord_rounding_fix | int_clamping_fix) {                       \
-      if (((sampler & __CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)              \
-          && ((sampler & __CLK_FILTER_MASK) == CLK_FILTER_NEAREST)) {        \
-        if (float_coord_rounding_fix                                         \
-            && ((sampler & CLK_NORMALIZED_COORDS_TRUE) == 0)) {              \
-          FIXUP_FLOAT_COORD(tmpCoord);                                       \
-        }                                                                    \
-        if (int_clamping_fix) {                                              \
-            coord_type intCoord;                                             \
-            if (sampler & CLK_NORMALIZED_COORDS_TRUE) {                      \
-              DENORMALIZE_COORD(surface_id, intCoord, tmpCoord);             \
-            } else                                                           \
-              intCoord = tmpCoord;                                           \
-            return   __gen_ocl_read_image ##suffix(                          \
-                       EXPEND_READ_COORDI(surface_id, sampler, intCoord));\
-       }                                                                     \
-      }                                                                      \
-    }                                                                        \
-    return  __gen_ocl_read_image ##suffix(                                   \
-                        EXPEND_READ_COORDF(surface_id, sampler, tmpCoord), 0);\
-  }
-
-// Define the sampler-less read_image<suffix>(image, coord) for integer
-// coordinates. It reads through the float-coordinate builtin with a fixed
-// sampler value (CLK_NORMALIZED_COORDS_FALSE, CLK_ADDRESS_NONE,
-// CLK_FILTER_NEAREST) and sampler_offset 0.
-// The coordinate is passed as `(float)coord`; once EXPEND_READ_COORDF
-// substitutes it, the cast applies to each selected component (coord.s0, ...).
-// For array images GET_IMAGE_ARRAY_SIZE declares `ai` as int here.
-// The `n` parameter is not used in the body.
-#define DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, coord_type, n)   \
-  INLINE_OVERLOADABLE type read_image ##suffix(image_type cl_image,          \
-                                               coord_type coord)             \
-  {                                                                          \
-    GET_IMAGE(cl_image, surface_id);                                         \
-    GET_IMAGE_ARRAY_SIZE(cl_image, coord, int, ai)                           \
-    return __gen_ocl_read_image ##suffix(                                    \
-           EXPEND_READ_COORDF(surface_id,                                    \
-                             CLK_NORMALIZED_COORDS_FALSE                     \
-                             | CLK_ADDRESS_NONE                              \
-                             | CLK_FILTER_NEAREST, (float)coord), 0);        \
-  }
-
-// Define write_image<suffix>(image, coord, color) by forwarding to the
-// matching __gen_ocl_write_image builtin.
-// The image parameter must stay named cl_image: the array-image variants of
-// EXPEND_WRITE_COORD refer to it by name to clamp the array index.
-#define DECL_WRITE_IMAGE(image_type, type, suffix, coord_type) \
-  INLINE_OVERLOADABLE void write_image ##suffix(image_type cl_image, coord_type coord, type color)\
-  {\
-    GET_IMAGE(cl_image, surface_id);\
-    __gen_ocl_write_image ##suffix(EXPEND_WRITE_COORD(surface_id, coord, color));\
-  }
-
-// Define the queries shared by every image type: width, channel data type and
-// channel order, each forwarded to the matching __gen_ocl_get_image_* builtin.
-#define DECL_IMAGE_INFO_COMMON(image_type) \
-  INLINE_OVERLOADABLE int get_image_width(image_type image) \
-  { \
-    GET_IMAGE(image, sid); \
-    return __gen_ocl_get_image_width(sid); \
-  } \
-  INLINE_OVERLOADABLE int get_image_channel_data_type(image_type image) \
-  { \
-    GET_IMAGE(image, sid); \
-    return __gen_ocl_get_image_channel_data_type(sid); \
-  } \
-  INLINE_OVERLOADABLE int get_image_channel_order(image_type image) \
-  { \
-    GET_IMAGE(image, sid); \
-    return __gen_ocl_get_image_channel_order(sid); \
-  }
-
-// 1D
-#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix)                       \
-  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int, 1)               \
-  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float, 1)  \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int, 1)                        \
-  DECL_WRITE_IMAGE(image_type, type, suffix, int)                                    \
-  DECL_WRITE_IMAGE(image_type, type, suffix, float)
-
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord < 0 ? -1 : coord), 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord = srcCoord * __gen_ocl_get_image_width(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord, color
-#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
-
-// 1D variant: if the coordinate lies strictly between -2^-20 and 0, move it
-// further below zero by 2^-9. Invoked from DECL_READ_IMAGE1 for unnormalized
-// coordinates when the float rounding fix is enabled.
-// Both constants carry the f suffix so the arithmetic stays in single precision.
-#define FIXUP_FLOAT_COORD(tmpCoord)                            \
-  {                                                            \
-    if (tmpCoord < 0 && tmpCoord > -0x1p-20f)                  \
-      tmpCoord += -0x1p-9f;                                    \
-  }
-
-DECL_IMAGE(GEN_FIX_1, image1d_t, int4, i)
-DECL_IMAGE(GEN_FIX_1, image1d_t, uint4, ui)
-DECL_IMAGE(0, image1d_t, float4, f)
-DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, int4, i)
-DECL_IMAGE(GEN_FIX_1, image1d_buffer_t, uint4, ui)
-DECL_IMAGE(0, image1d_buffer_t, float4, f)
-
-// 1D Info
-DECL_IMAGE_INFO_COMMON(image1d_t)
-DECL_IMAGE_INFO_COMMON(image1d_buffer_t)
-
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef DECL_IMAGE
-// End of 1D
-
-#define DECL_IMAGE(int_clamping_fix, image_type, type, suffix, n)                       \
-  DECL_READ_IMAGE0(int_clamping_fix, image_type, type, suffix, int ##n, n)              \
-  DECL_READ_IMAGE1(GEN_FIX_1, int_clamping_fix, image_type, type, suffix, float ##n, n) \
-  DECL_READ_IMAGE_NOSAMPLER(image_type, type, suffix, int ##n, n)                       \
-  DECL_WRITE_IMAGE(image_type, type, suffix, int ## n)                                  \
-  DECL_WRITE_IMAGE(image_type, type, suffix, float ## n)
-// 2D
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), \
-                                               (int)(coord.s1 < 0 ? -1 : coord.s1), 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
-                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, color
-
-// 2D variant: for each of s0 and s1, if the component lies strictly between
-// -2^-20 and 0, move it further below zero by 2^-9. Invoked from
-// DECL_READ_IMAGE1 for unnormalized coordinates when the float rounding fix is
-// enabled.
-// All constants carry the f suffix so both components are adjusted with the
-// same single-precision arithmetic.
-#define FIXUP_FLOAT_COORD(tmpCoord)                            \
-  {                                                            \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
-      tmpCoord.s0 += -0x1p-9f;                                 \
-    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)            \
-      tmpCoord.s1 += -0x1p-9f;                                 \
-  }
-
-DECL_IMAGE(GEN_FIX_1, image2d_t, int4, i, 2)
-DECL_IMAGE(GEN_FIX_1, image2d_t, uint4, ui, 2)
-DECL_IMAGE(0, image2d_t, float4, f, 2)
-
-// 1D Array
-#undef GET_IMAGE_ARRAY_SIZE
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, (int)0, ai, 2
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)ai
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int)(coord.s0 < 0 ? -1 : coord.s0), 0, (int)ai, 2
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, __gen_compute_array_index(coord.s1, cl_image), color
-#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
-  coord_type ai = __gen_compute_array_index(coord.s1, image);
-
-// 1D array variant: only s0 is adjusted (s1 is the array coordinate). If s0
-// lies strictly between -2^-20 and 0, move it further below zero by 2^-9.
-// Both constants carry the f suffix so the arithmetic stays in single precision.
-#define FIXUP_FLOAT_COORD(tmpCoord)                            \
-  {                                                            \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)            \
-      tmpCoord.s0 += -0x1p-9f;                                 \
-  }
-
-DECL_IMAGE(GEN_FIX_1, image1d_array_t, int4, i, 2)
-DECL_IMAGE(GEN_FIX_1, image1d_array_t, uint4, ui, 2)
-DECL_IMAGE(0, image1d_array_t, float4, f, 2)
-
-// 2D Info
-DECL_IMAGE_INFO_COMMON(image2d_t)
-// Return the height reported by __gen_ocl_get_image_height for a 2D image.
-INLINE_OVERLOADABLE int get_image_height(image2d_t image)
-{
-  GET_IMAGE(image, sid);
-  const int rows = __gen_ocl_get_image_height(sid);
-  return rows;
-}
-// Return (width, height) of a 2D image as an int2.
-INLINE_OVERLOADABLE int2 get_image_dim(image2d_t image)
-{
-  return (int2)(get_image_width(image), get_image_height(image));
-}
-
-// 1D Array info
-DECL_IMAGE_INFO_COMMON(image1d_array_t)
-// Return the array size of a 1D image array, taken from the value reported by
-// __gen_ocl_get_image_depth for its surface.
-INLINE_OVERLOADABLE size_t get_image_array_size(image1d_array_t image)
-{
-  GET_IMAGE(image, sid);
-  const int layers = __gen_ocl_get_image_depth(sid);
-  return layers;
-}
-
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDI
-#undef EXPEND_READ_COORDF
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef GET_IMAGE_ARRAY_SIZE
-// End of 2D and 1D Array
-
-// 3D
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, coord.s2, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)coord.s2
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
-                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)(coord.s2 < 0 ? -1 : coord.s2), 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
-                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id); \
-                                                  dstCoord.z = srcCoord.z * __gen_ocl_get_image_depth(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, coord.s2, color
-
-// 3D variant: for each of s0, s1 and s2, if the component lies strictly
-// between -2^-20 and 0, move it further below zero by 2^-9.
-// All constants carry the f suffix so the comparisons and additions stay in
-// single precision, matching the other FIXUP_FLOAT_COORD variants.
-#define FIXUP_FLOAT_COORD(tmpCoord)                             \
-  {                                                             \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)             \
-      tmpCoord.s0 += -0x1p-9f;                                  \
-    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)             \
-      tmpCoord.s1 += -0x1p-9f;                                  \
-    if (tmpCoord.s2 < 0 && tmpCoord.s2 > -0x1p-20f)             \
-      tmpCoord.s2 += -0x1p-9f;                                  \
-  }
-#define GET_IMAGE_ARRAY_SIZE(a,b,c,d)
-
-DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 4)
-DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 4)
-DECL_IMAGE(0, image3d_t, float4, f, 4)
-
-DECL_IMAGE(GEN_FIX_1, image3d_t, int4, i, 3)
-DECL_IMAGE(GEN_FIX_1, image3d_t, uint4, ui, 3)
-DECL_IMAGE(0, image3d_t, float4, f, 3)
-
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef GET_IMAGE_ARRAY_SIZE
-
-#define EXPEND_READ_COORD(id, sampler, coord) id, sampler, coord.s0, coord.s1, ai, 1
-#define EXPEND_READ_COORDF(id, sampler, coord) id, sampler, (float)coord.s0, (float)coord.s1, (float)ai
-#define EXPEND_READ_COORDI(id, sampler, coord) id, sampler, (int) (coord.s0 < 0 ? -1 : coord.s0), \
-                                               (int)(coord.s1 < 0 ? -1 : coord.s1), (int)ai, 1
-#define DENORMALIZE_COORD(id, dstCoord, srcCoord) dstCoord.x = srcCoord.x * __gen_ocl_get_image_width(id); \
-                                                  dstCoord.y = srcCoord.y * __gen_ocl_get_image_height(id);
-#define EXPEND_WRITE_COORD(id, coord, color) id, coord.s0, coord.s1, __gen_compute_array_index(coord.s2, cl_image), color
-
-// 2D array variant: only s0 and s1 are adjusted (s2 is the array coordinate).
-// If a component lies strictly between -2^-20 and 0, move it further below
-// zero by 2^-9.
-// All constants carry the f suffix so the comparisons and additions stay in
-// single precision, matching the other FIXUP_FLOAT_COORD variants.
-#define FIXUP_FLOAT_COORD(tmpCoord)                             \
-  {                                                             \
-    if (tmpCoord.s0 < 0 && tmpCoord.s0 > -0x1p-20f)             \
-      tmpCoord.s0 += -0x1p-9f;                                  \
-    if (tmpCoord.s1 < 0 && tmpCoord.s1 > -0x1p-20f)             \
-      tmpCoord.s1 += -0x1p-9f;                                  \
-  }
-#define GET_IMAGE_ARRAY_SIZE(image, coord, coord_type, ai) \
-  coord_type ai = __gen_compute_array_index(coord.s2, image);
-
-// 2D Array
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 4)
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 4)
-DECL_IMAGE(0, image2d_array_t, float4, f, 4)
-
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, int4, i, 3)
-DECL_IMAGE(GEN_FIX_1, image2d_array_t, uint4, ui, 3)
-DECL_IMAGE(0, image2d_array_t, float4, f, 3)
-
-// 3D Info
-DECL_IMAGE_INFO_COMMON(image3d_t)
-INLINE_OVERLOADABLE int get_image_height(image3d_t image)
-{
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_height(surface_id);
-}
-INLINE_OVERLOADABLE int get_image_depth(image3d_t image)
-{
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_depth(surface_id);
-}
-INLINE_OVERLOADABLE int4 get_image_dim(image3d_t image)
-{
-  return (int4){get_image_width(image), get_image_height(image), get_image_depth(image), 0};
-}
-
-// 2D Array Info
-DECL_IMAGE_INFO_COMMON(image2d_array_t)
-INLINE_OVERLOADABLE int get_image_height(image2d_array_t image)
-{
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_height(surface_id);
-}
-INLINE_OVERLOADABLE int2 get_image_dim(image2d_array_t image)
-{
-  return (int2){get_image_width(image), get_image_height(image)};
-}
-INLINE_OVERLOADABLE size_t get_image_array_size(image2d_array_t image)
-{
-  GET_IMAGE(image, surface_id);
-  return __gen_ocl_get_image_depth(surface_id);
-}
-
-#undef EXPEND_READ_COORD
-#undef EXPEND_READ_COORDF
-#undef EXPEND_READ_COORDI
-#undef DENORMALIZE_COORD
-#undef EXPEND_WRITE_COORD
-#undef FIXUP_FLOAT_COORD
-#undef GET_IMAGE_ARRAY_SIZE
-// End of 3D and 2D Array
-
-#undef DECL_IMAGE
-#undef DECL_READ_IMAGE
-#undef DECL_READ_IMAGE_NOSAMPLER
-#undef DECL_WRITE_IMAGE
-#undef GEN_FIX_1
-// End of Image
-
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_acosh (float x)
-{
-    return native_log(x + native_sqrt(x + 1) * native_sqrt(x - 1));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_asinh (float x)
-{
-    return native_log(x + native_sqrt(x * x + 1));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_atanh (float x)
-{
-    return 0.5f * native_log((1 + x) / (1 - x));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cbrt (float x)
-{
-    return __gen_ocl_pow(x, 0.3333333333f);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cos (float x)
-{
-    return native_cos(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cosh (float x)
-{
-    return (1 + native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_cospi (float x)
-{
-    return __gen_ocl_cos(x * M_PI_F);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp (float x)
-{
-    return native_exp(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_exp10 (float x)
-{
-    return native_exp10(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_expm1 (float x)
-{
-    return __gen_ocl_pow(M_E_F, x) - 1;
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_fmod (float x, float y)
-{
-    return x-y*__gen_ocl_rndz(x/y);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_hypot (float x, float y)
-{
-    return __gen_ocl_sqrt(x*x + y*y);
-}
-
-INLINE_OVERLOADABLE int __gen_ocl_internal_fastpath_ilogb (float x)
-{
-    return __gen_ocl_rndd(native_log2(x));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_ldexp (float x, int n)
-{
-    return __gen_ocl_pow(2, n) * x;
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log (float x)
-{
-    return native_log(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log2 (float x)
-{
-    return native_log2(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log10 (float x)
-{
-    return native_log10(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_log1p (float x)
-{
-    return native_log(x + 1);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_logb (float x)
-{
-    return __gen_ocl_rndd(native_log2(x));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_remainder (float x, float y)
-{
-    return x-y*__gen_ocl_rnde(x/y);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_rootn(float x, int n)
-{
-  return internal_rootn(x, n, 1);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sin (float x)
-{
-    return native_sin(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __global float *cosval)
-{
-    *cosval = native_cos(x);
-    return native_sin(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __local float *cosval)
-{
-    *cosval = native_cos(x);
-    return native_sin(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sincos (float x, __private float *cosval)
-{
-    *cosval = native_cos(x);
-    return native_sin(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinh (float x)
-{
-    return (1 - native_exp(-2 * x)) / (2 * native_exp(-x));
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_sinpi (float x)
-{
-    return __gen_ocl_sin(x * M_PI_F);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tan (float x)
-{
-    return native_tan(x);
-}
-
-INLINE_OVERLOADABLE float __gen_ocl_internal_fastpath_tanh (float x)
-{
-    float y = native_exp(-2 * x);
-    return (1 - y) / (1 + y);
-}
-
-#pragma OPENCL EXTENSION cl_khr_fp64 : disable
-
-#undef DECL_IMAGE
-#undef DECL_READ_IMAGE
-#undef DECL_READ_IMAGE_NOSAMPLER
-#undef DECL_WRITE_IMAGE
-
-#undef GET_IMAGE
-// ##BEGIN_VECTOR##
-// ##END_VECTOR##
-
-#undef INLINE_OVERLOADABLE
-#undef PURE
-#undef CONST
-#undef OVERLOADABLE
-#undef INLINE
-
-#endif /* __GEN_OCL_STDLIB_H__ */
diff --git a/backend/src/sys/alloc.cpp b/backend/src/sys/alloc.cpp
index 2db95c9..08dc7b1 100644
--- a/backend/src/sys/alloc.cpp
+++ b/backend/src/sys/alloc.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/alloc.hpp b/backend/src/sys/alloc.hpp
index 8fcb3a7..6ee4e69 100644
--- a/backend/src/sys/alloc.hpp
+++ b/backend/src/sys/alloc.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -73,13 +73,15 @@ public: \
   void* operator new(size_t size) { \
     return gbe::alignedMalloc(size, GBE_DEFAULT_ALIGNMENT); \
   } \
+  void  operator delete(void* ptr) { return gbe::alignedFree(ptr); } \
   void* operator new[](size_t size) { \
    return gbe::alignedMalloc(size, GBE_DEFAULT_ALIGNMENT); \
   } \
+  void  operator delete[](void* ptr) { return gbe::alignedFree(ptr); } \
   void* operator new(size_t size, void *p) { return p; } \
+  void  operator delete(void* ptr, void *p) {/*do nothing*/} \
   void* operator new[](size_t size, void *p) { return p; } \
-  void  operator delete(void* ptr) { return gbe::alignedFree(ptr); } \
-  void  operator delete[](void* ptr) { return gbe::alignedFree(ptr); }
+  void  operator delete[](void* ptr, void *p) { /*do nothing*/ }
 
 /*! Macros to handle allocation position */
 #define GBE_NEW(T,...) \
diff --git a/backend/src/sys/assert.cpp b/backend/src/sys/assert.cpp
index 52178a1..1431fbc 100644
--- a/backend/src/sys/assert.cpp
+++ b/backend/src/sys/assert.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/assert.hpp b/backend/src/sys/assert.hpp
index 553e391..03a8577 100644
--- a/backend/src/sys/assert.hpp
+++ b/backend/src/sys/assert.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/atomic.hpp b/backend/src/sys/atomic.hpp
index 3684ae9..693f6ab 100644
--- a/backend/src/sys/atomic.hpp
+++ b/backend/src/sys/atomic.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/cvar.cpp b/backend/src/sys/cvar.cpp
index 1ee2c98..a4332fd 100644
--- a/backend/src/sys/cvar.cpp
+++ b/backend/src/sys/cvar.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/cvar.hpp b/backend/src/sys/cvar.hpp
index 7350a3e..a9e8415 100644
--- a/backend/src/sys/cvar.hpp
+++ b/backend/src/sys/cvar.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/exception.hpp b/backend/src/sys/exception.hpp
index d74ca0d..8b8dc7a 100644
--- a/backend/src/sys/exception.hpp
+++ b/backend/src/sys/exception.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/fixed_array.hpp b/backend/src/sys/fixed_array.hpp
index d84c350..f534092 100644
--- a/backend/src/sys/fixed_array.hpp
+++ b/backend/src/sys/fixed_array.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/hash_map.hpp b/backend/src/sys/hash_map.hpp
index fb1d1ef..e153cf3 100644
--- a/backend/src/sys/hash_map.hpp
+++ b/backend/src/sys/hash_map.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/intrinsics.hpp b/backend/src/sys/intrinsics.hpp
index 2e25dc7..0e8e8af 100644
--- a/backend/src/sys/intrinsics.hpp
+++ b/backend/src/sys/intrinsics.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/list.hpp b/backend/src/sys/list.hpp
index 51b9c39..d63a64d 100644
--- a/backend/src/sys/list.hpp
+++ b/backend/src/sys/list.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/map.hpp b/backend/src/sys/map.hpp
index 1c72400..87ad71a 100644
--- a/backend/src/sys/map.hpp
+++ b/backend/src/sys/map.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/mutex.cpp b/backend/src/sys/mutex.cpp
index 9640150..ac14c28 100644
--- a/backend/src/sys/mutex.cpp
+++ b/backend/src/sys/mutex.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/mutex.hpp b/backend/src/sys/mutex.hpp
index 1a462b0..8feaffe 100644
--- a/backend/src/sys/mutex.hpp
+++ b/backend/src/sys/mutex.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/platform.cpp b/backend/src/sys/platform.cpp
index 95768ee..cfd4987 100644
--- a/backend/src/sys/platform.cpp
+++ b/backend/src/sys/platform.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/platform.hpp b/backend/src/sys/platform.hpp
index b8a2841..e7aeee6 100644
--- a/backend/src/sys/platform.hpp
+++ b/backend/src/sys/platform.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/set.hpp b/backend/src/sys/set.hpp
index db68807..7c810cd 100644
--- a/backend/src/sys/set.hpp
+++ b/backend/src/sys/set.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/sys/vector.hpp b/backend/src/sys/vector.hpp
index dc89991..fe961be 100644
--- a/backend/src/sys/vector.hpp
+++ b/backend/src/sys/vector.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/backend/src/update.sh b/backend/src/update.sh
deleted file mode 100755
index 0e5f8c0..0000000
--- a/backend/src/update.sh
+++ /dev/null
@@ -1,3 +0,0 @@
-#! /bin/sh -e
-./update_as.sh
-./update_convert.sh
diff --git a/backend/src/update_as.sh b/backend/src/update_as.sh
deleted file mode 100755
index c68e789..0000000
--- a/backend/src/update_as.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#! /bin/sh -e
-
-AS_HEADER=ocl_as.h
-
-exec >$AS_HEADER.tmp
-echo "// This file is autogenerated by gen_as.sh."
-echo "// Don't modify it manually."
-./gen_as.sh
-exec >&2
-
-mv $AS_HEADER.tmp $AS_HEADER
diff --git a/backend/src/update_blob_ocl_header.py b/backend/src/update_blob_ocl_header.py
deleted file mode 100755
index 50f2501..0000000
--- a/backend/src/update_blob_ocl_header.py
+++ /dev/null
@@ -1,65 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (C) 2012 Intel Corporation
-#
-# This library is free software; you can redistribute it and/or
-# modify it under the terms of the GNU Lesser General Public
-# License as published by the Free Software Foundation; either
-# version 2 of the License, or (at your option) any later version.
-#
-# This library is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# Lesser General Public License for more details.
-#
-# You should have received a copy of the GNU Lesser General Public
-# License along with this library. If not, see <http://www.gnu.org/licenses/>.
-#
-# Author: Zhigang Gong <zhigang.gong at linux.intel.com>
-#/
-import sys
-import os
-
-if len(sys.argv) != 3:
-    print "Invalid argument {0}".format(sys.argv)
-    print "use {0} tmpl_file_name output_file_name".format(sys.argv[0])
-    raise
-
-def safeUnlink(filename):
-    try:
-        os.remove(filename)
-    except OSError:
-        pass
-
-header_segments = [ "vector", "as", "convert", "common_defines"]
-blobFileName = sys.argv[2]
-blobTempName = sys.argv[2] + '.tmp'
-safeUnlink(blobFileName)
-tmplFile = open(sys.argv[1], 'r')
-blob = open(blobTempName, 'w')
-path = os.path.dirname(sys.argv[1])
-if path == '':
-    path = '.'
-
-matched_header = ""
-for tline in tmplFile:
-    if matched_header == "":
-        blob.write(tline)
-        for header in header_segments:
-            if tline.strip() == '// ##BEGIN_{0}##'.format(header.upper()) :
-                hFile = open(path + '/ocl_' + header + '.h', 'r')
-                lineNr = 0
-                for hline in hFile:
-                    if lineNr >= 2:  #ignore the 2 lines of comment at the top of file.
-                        blob.write(hline)
-                    lineNr += 1
-                hFile.close()
-                matched_header = header
-    else:
-        if tline.strip() == '// ##END_{0}##'.format(matched_header.upper()) :
-            blob.write(tline)
-            matched_header = "";
-
-tmplFile.close()
-blob.close()
-os.rename(blobTempName, blobFileName)
diff --git a/backend/src/update_convert.sh b/backend/src/update_convert.sh
deleted file mode 100755
index 3c47917..0000000
--- a/backend/src/update_convert.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#! /bin/sh -e
-
-CONVERT_HEADER=ocl_convert.h
-
-
-exec >$CONVERT_HEADER.tmp
-echo "// This file is autogenerated by gen_convert.sh."
-echo "// Don't modify it manually."
-./gen_convert.sh
-exec >&2
-
-mv $CONVERT_HEADER.tmp $CONVERT_HEADER
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index d96a2e0..0a959c8 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -10,8 +10,13 @@ set (benchmark_sources
   ../utests/utest.cpp
   ../utests/utest_file_map.cpp
   ../utests/utest_helper.cpp
+  ../utests/vload_bench.cpp
   enqueue_copy_buf.cpp)
 
+
+SET(CMAKE_CXX_FLAGS "-DBUILD_BENCHMARK ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DBUILD_BENCHMARK ${CMAKE_C_FLAGS}")
+
 ADD_LIBRARY(benchmarks SHARED ${ADDMATHFUNC} ${benchmark_sources})
 
 #TARGET_LINK_LIBRARIES(benchmarks cl m ${OPENGL_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/benchmark/benchmark_run.cpp b/benchmark/benchmark_run.cpp
index b29ccc3..01748ce 100644
--- a/benchmark/benchmark_run.cpp
+++ b/benchmark/benchmark_run.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -98,7 +98,7 @@ int main(int argc, char *argv[])
 
       case 'n':
         try {
-          UTest::runAllNoIssue();
+          UTest::runAllBenchMark();
         }
         catch (Exception e){
           std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
diff --git a/docs/Beignet.mdwn b/docs/Beignet.mdwn
index 7e5b730..fea56b8 100644
--- a/docs/Beignet.mdwn
+++ b/docs/Beignet.mdwn
@@ -46,8 +46,6 @@ There are some severe OpenCL related regression in clang 3.4 version.
 * If you want to try Clang/LLVM 3.4, you need to disable terminfo:
 --disable-terminfo. It's a llvm 3.4 bug.
 
-Please be noted that the code was compiled on GCC 4.6, GCC 4.7 and GCC 4.8. Since the code
-uses really recent C++11 features, you may expect problems with older compilers.
 
 How to build and install
 ------------------------
@@ -66,6 +64,12 @@ Basically, from the root directory of the project
 
 `> cmake ../ # to configure`
 
+Please note that the code has been compiled with GCC 4.6, GCC 4.7, GCC 4.8, Clang 3.5 and
+ICC 14.0.3. Since the code uses fairly recent C++11 features, you may expect problems with
+older compilers. The default compiler should be GCC; if you want to choose the compiler manually,
+you need to configure it as below:
+`> cmake -DCOMPILER=[GCC|CLANG|ICC] ../`
+
 CMake will check the dependencies and will complain if it does not find them.
 
 `> make`
@@ -115,15 +119,22 @@ will run all the unit tests one after the others
 will only run `some_unit_test0` and `some_unit_test1` tests
 
 On all supported target platform, the pass rate should be 100%. If it is not, you may
-need to refer the "Known Issues" section.
+need to refer to the "Known Issues" section. Please note that `. setenv.sh` is only
+required to run the unit test cases. For all other OpenCL applications, don't execute that
+command.
+
+Normally, beignet needs to run under an X server environment as a normal user. If there is no X server,
+beignet provides two alternative ways to run:
+* Run as root without X.
+* Enable the drm render nodes by passing drm.rnodes=1 to the kernel boot args, then you can run beignet as non-root and without X.
 
 Supported Targets
 -----------------
 
  * 3rd Generation Intel Core Processors
  * Intel “Bay Trail” platforms with Intel HD Graphics
- * 4th Generation Intel Core Processors, need kernel patch currently, see below
-   for details:
+ * 4th Generation Intel Core Processors, need kernel patch currently, see the "Known Issues" section.
+ * 5th Generation Intel Core Processors "Broadwell".
 
 Known Issues
 ------------
@@ -150,8 +161,8 @@ Known Issues
   `# echo 0 > /sys/module/i915/parameters/enable_cmd_parser`
 
 * Some unit test cases, maybe 20 to 30, fail on 4th Generation (HSW) platform.
-  The 4th Generation Intel Core Processors's support requires some Linux kernel
-  modification. You need to apply the patch at:
+  _The 4th Generation Intel Core Processors's support requires some Linux kernel
+  modification_. You need to apply the patch at:  
   [https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support](https://01.org/zh/beignet/downloads/linux-kernel-patch-hsw-support)
 
 * Precision issue.
@@ -175,12 +186,12 @@ is also good which is about 99%. There are still some remains work items listed
 most of them are extension support and performance related.
 
 - Performance tuning. There are some major optimizations need to be done,
-  Peephole optimization, convert to structured BBs and leverage Gen's structured
-  instructions, and optimize the extreme slow software based sin/cos/... math
-  functions due to the native math instruction lack of necessary precision.
-  And all the code is inlined which will increase the icache miss rate
-  significantly. And many other things which are specified partially in
-  [[here|Beignet/Backend/TODO]].
+  Peephole optimization, further tuning the structurized BB transformation to
+  support more patterns such as self loops/while loops, and optimizing the slow
+  software based sin/cos/... math functions, which exist because the native math
+  instructions lack the necessary precision. Also, all the code is inlined, which
+  increases the icache miss rate significantly. Many other items are partially
+  specified [[here|Beignet/Backend/TODO]].
 
 - Complete cl\_khr\_gl\_sharing support. We lack of some APIs implementation such
   as clCreateFromGLBuffer,clCreateFromGLRenderbuffer,clGetGLObjectInfo... Currently,
@@ -194,9 +205,6 @@ most of them are extension support and performance related.
   (i.e. for each NDRangeKernels). This is really inefficient since some
   expensive pipe controls are issued for each batch buffer.
 
-- Valgrind reports some leaks in libdrm. It sounds like a false positive but it
-  has to be checked. Idem for LLVM. There is one leak here to check.
-
 More generally, everything in the run-time that triggers the "FATAL" macro means
 that something that must be supported is not implemented properly (either it
 does not comply with the standard or it is just missing)
@@ -204,22 +212,27 @@ does not comply with the standard or it is just missing)
 Project repository
 ------------------
 Right now, we host our project on fdo at:
-[http://cgit.freedesktop.org/beignet/](http://cgit.freedesktop.org/beignet/).
+[http://cgit.freedesktop.org/beignet/](http://cgit.freedesktop.org/beignet/).  
 And the intel 01.org:
 [https://01.org/beignet](https://01.org/beignet)
 
 The team
 --------
-Beignet project was created by Ben Segovia. Since 2013, Now we have a team in
-Intel China OTC graphics team continue to work on this project.
-The official contact for this project is: Zou Nanhai (<nanhai.zou at intel.com>).
+The Beignet project was created by Ben Segovia. Since 2013, the Intel China OTC graphics
+team has continued to work on this project. The official contact for this project is:  
+Zou Nanhai (<nanhai.zou at intel.com>).
 
 How to contribute
 -----------------
 You are always welcome to contribute to this project, just need to subscribe
 to the beignet mail list and send patches to it for review.
 The official mail list is as below:
-[http://lists.freedesktop.org/mailman/listinfo/beignet](http://lists.freedesktop.org/mailman/listinfo/beignet)
+[http://lists.freedesktop.org/mailman/listinfo/beignet](http://lists.freedesktop.org/mailman/listinfo/beignet)  
+The official bugzilla is at:
+[https://bugs.freedesktop.org/enter_bug.cgi?product=Beignet](https://bugs.freedesktop.org/enter_bug.cgi?product=Beignet)  
+You are welcome to submit beignet bugs. When doing so, please specify the exact platform
+information, such as BYT/IVB/HSW/BDW, and GT1/GT2/GT3. You can easily get this information
+by running beignet's unit tests.
 
 Documents for OpenCL application developers
 -------------------------------------------
diff --git a/docs/Beignet/Backend/TODO.mdwn b/docs/Beignet/Backend/TODO.mdwn
index 501c508..4dc8593 100644
--- a/docs/Beignet/Backend/TODO.mdwn
+++ b/docs/Beignet/Backend/TODO.mdwn
@@ -24,9 +24,6 @@ The code is defined in `src/llvm`.  We used the SPIR and the OpenCL profile
 to compile the code. Therefore, a good part of the job is already done. However,
 many things must be implemented:
 
-- Better resolving of the PHI functions. Today, we always generate MOV
-  instructions at the end of each basic block . They can be easily optimized.
-
 - From LLVM 3.3, we use SPIR IR. We need to use the compiler defined type to
   represent sampler\_t/image2d\_t/image1d\_t/....
 
@@ -34,25 +31,14 @@ many things must be implemented:
   compatible for different clang versions. And may contribute what we have done in
   the ocl\_stdlib.h to libclc if possible.
 
-- Optimize math functions. If the native math instructions don't compy with the
-  OCL spec, we use pure software style to implement those math instructions which
-  is extremely slow, for example. The cos and sin for HD4000 platform are very slow.
-  For some applications which may not need such a high accurate results. We may
-  provide a mechanism to use native\_xxx functions instead of the extremely slow
-  version.
+- Optimize math functions.
 
 Gen IR
 ------
 
 The code is defined in `src/ir`. Main things to do are:
 
-- Convert unstructured BBs to structured format, and leverage Gen's structured
-  instruction such as if/else/endif to encoding those BBs. Then we can save many
-  instructions which are used to maintain software pcips and predications.
-
-- Implement those llvm.memset/llvm.memcpy more efficiently. Currently, we lower
-  them as normal memcpy at llvm module level and not considering the intrinsics
-  all have a constant data length.
+- Support structurized while loop and self loop BBs.
 
 - Finishing the handling of function arguments (see the [[IR
   description|gen_ir]] for more details)
@@ -66,7 +52,8 @@ The code is defined in `src/ir`. Main things to do are:
 - Implement fast path for small local variables. When the kernel only defines
   a small local array/variable, there will be a good chance to allocate the local
   array/variable in register space rather than system memory. This will reduce a
-  lot of memory load/stroe from the system memory.
+  lot of memory load/store from the system memory. After custom loop unrolling,
+  this optimization is not very important for most cases now.
 
 Backend
 -------
@@ -84,10 +71,10 @@ The code is defined in `src/backend`. Main things to do are:
 - Reduce the macro instructions in gen\_context. The macro instructions added in
   gen\_context will not get a chance to do post register allocation scheduling.
 
-- leverage the structured if/endif for branching processing.
-
 - Peephole optimization. There are many chances to do further peephole optimization.
 
+- Implement a better framework to do backend instructions optimizations.
+
 General plumbing
 ----------------
 
@@ -104,7 +91,3 @@ and writes) are not properly decoded yet.
 
 All of those code should be improved and cleaned up are tracked with "XXX"
 comments in the code.
-
-Parts of the code leaks memory when exceptions are used. There are some pointers
-to track and replace with std::unique\_ptr. Note that we also add a custom memory
-debugger that nicely complements (i.e. it is fast) Valgrind.
diff --git a/docs/Beignet/Backend/compiler_backend.mdwn b/docs/Beignet/Backend/compiler_backend.mdwn
index 3c489b2..30b2aba 100644
--- a/docs/Beignet/Backend/compiler_backend.mdwn
+++ b/docs/Beignet/Backend/compiler_backend.mdwn
@@ -100,8 +100,12 @@ do smarter scratch memory allocation to reduce scratch memory requirement.
 Instruction scheduling
 ----------------------
 
-Intra-basic block instruction scheduling is relatively simple. It is implemented
-but has known bug, we need further effort to fix it.
+Pre register allocation instruction scheduling is not implemented. Although it
+may reduce register pressure, it may also increase register dependencies. We
+need to think about a trade-off mechanism to do this optimization.  
+Post register allocation scheduling has been implemented, and could get about
+8% performance improvement. However, the cycle data are based on experiments and
+are not accurate. We may need to tweak it when we get more information.
 
 Instruction encoding
 --------------------
diff --git a/docs/Beignet/Backend/mixed_buffer_pointer.mdwn b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
index f43ab7e..8e1a6f4 100644
--- a/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
+++ b/docs/Beignet/Backend/mixed_buffer_pointer.mdwn
@@ -37,10 +37,35 @@ Therefore the following code is valid:
 }
 </code>
 
-As one may see, the load done in the last line actually mixes pointers from both
-source src0 and src1. This typically makes the use of binding table indices
-pretty hard. In we use binding table 0 for dst, 1 for src0 and 2 for src1 (for
+As one may see, the load done in the last line actually mixes pointers from
+both sources src0 and src1. The pointer "from" in the last line is a so-called
+mixed buffer pointer. This typically makes the use of binding table indices
+pretty hard. If we use binding table 0 for dst, 1 for src0 and 2 for src1 (for
 example), we are not able to express the load in the last line with one send
-only. The pointer "from" in the last line is so called a mixed buffer pointer.
+only.
+
+To support this kind of usage, we analyze the def-use chain of all
+load/store instructions to find all of the referenced memory objects.
+In the above example, src0 is assigned a unique binding table index (like 2),
+src1 is assigned another binding table index (like 3), and the load instruction
+above will be translated into two dataport messages with the binding table
+indices of src0 and src1.
+
+Here we take advantage of out-of-bound behaviour of Gen. The dataport messages
+we use will return zero value if it is an out-of-bound read. And if it is
+out-of-bound write, it will be skipped.
+
+To make use of the out-of-bound check, we always use absolute graphics virtual
+addresses to represent pointers. As the surfaces do not overlap with each other,
+the addresses coming from src0 will be separated from addresses from src1.
+So, right before a dataport message is generated, we subtract the surface's
+base address from the absolute graphics virtual address of the pointer. In the
+above example, the first dataport message will read from src0's surface, and thus
+use the binding table index of src0. So we first subtract src0's base address
+from the pointer, then use this relative address as the address of the message.
+You can see that addresses not coming from src0 will follow the out-of-bound
+behaviour (that is, they are filled with zero). Only addresses from src0 will get
+valid data. Next, we can do a similar thing for the second message. After that, we
+can easily sum them up to get the final result. For store operations, we follow
+the same kind of logic, but as it is a dataport write, we do not need an extra addition.
 
-(To be updated)
diff --git a/docs/NEWS.mdwn b/docs/NEWS.mdwn
index 1adb48a..0231d27 100644
--- a/docs/NEWS.mdwn
+++ b/docs/NEWS.mdwn
@@ -1,7 +1,10 @@
 # News
 
+## Nov 14, 2014
+[Beignet 1.0.0](https://01.org/beignet/downloads/beignet-1.0.0-2014-11-14) is released. This is a major release. Please see the release notes for more information.
+
 ## Sep 15, 2014
-[Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15) is released. This is a bug-fix release.
+[Beignet 0.9.3](https://01.org/zh/beignet/downloads/beignet-0.9.3-2014-09-15-0) is released. This is a bug-fix release.
 
 ## July 17, 2014
 [Beignet 0.9.2](https://01.org/zh/beignet/downloads/beignet-0.9.2-2014-07-17) is released. This is a bug-fix release.
diff --git a/include/CL/cl_intel.h b/include/CL/cl_intel.h
index f2fe9d4..28bcb62 100644
--- a/include/CL/cl_intel.h
+++ b/include/CL/cl_intel.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/kernels/compiler_assignment_operation_in_if.cl b/kernels/compiler_assignment_operation_in_if.cl
new file mode 100644
index 0000000..d05a663
--- /dev/null
+++ b/kernels/compiler_assignment_operation_in_if.cl
@@ -0,0 +1,12 @@
+__kernel
+void compiler_assignment_operation_in_if(__global int *dst){
+  int gidx = (int)get_global_id(0);
+
+  int3 d1 = (int3) (gidx, gidx-1, gidx-3);
+  int k = gidx % 5;
+  if (k == 1){
+    d1.x = d1.y;
+  }
+  global int * addr = dst + gidx;
+  *addr = d1.x;
+}
diff --git a/kernels/compiler_box_blur_float_ref.bmp b/kernels/compiler_box_blur_float_ref.bmp
new file mode 100644
index 0000000..d5454dd
Binary files /dev/null and b/kernels/compiler_box_blur_float_ref.bmp differ
diff --git a/kernels/compiler_box_blur_image.cl b/kernels/compiler_box_blur_image.cl
index 42f463b..69fa229 100644
--- a/kernels/compiler_box_blur_image.cl
+++ b/kernels/compiler_box_blur_image.cl
@@ -6,13 +6,15 @@ __kernel void compiler_box_blur_image(__read_only image2d_t src,
                             CLK_FILTER_NEAREST;
   const int2 coord = (int2)(get_global_id(0), get_global_id(1));
   int2 offset;
-  float4 sum = 0;
+  uint4 sum = 0;
 
   for (offset.y = -1; offset.y <= 1; offset.y++) {
     for (offset.x = -1; offset.x <= 1; offset.x++) {
-      sum +=  read_imagef(src, sampler, coord + offset);
+      sum +=  read_imageui(src, sampler, coord + offset);
     }
   }
 
-  write_imagef(dst, coord, (1.0f/9.0f)*sum);
+  uint4 result = sum / 9;
+
+  write_imageui(dst, coord, result);
 }
diff --git a/kernels/compiler_box_blur_ref.bmp b/kernels/compiler_box_blur_ref.bmp
new file mode 100644
index 0000000..d5454dd
Binary files /dev/null and b/kernels/compiler_box_blur_ref.bmp differ
diff --git a/kernels/compiler_bswap.cl b/kernels/compiler_bswap.cl
new file mode 100644
index 0000000..9ef0e6b
--- /dev/null
+++ b/kernels/compiler_bswap.cl
@@ -0,0 +1,12 @@
+#define TEST_TYPE(TYPE, LENGTH)                                       \
+kernel void compiler_bswap_##TYPE(global TYPE * src, global TYPE * dst){ \
+   dst[get_global_id(0)]= __builtin_bswap##LENGTH(src[get_global_id(0)]); \
+}
+
+
+TEST_TYPE(short, 16)
+TEST_TYPE(ushort, 16)
+TEST_TYPE(int, 32)
+TEST_TYPE(uint, 32)
+
+#undef TEST_TYPE
diff --git a/kernels/compiler_clod_function_call.cl b/kernels/compiler_clod_function_call.cl
new file mode 100644
index 0000000..ecfac46
--- /dev/null
+++ b/kernels/compiler_clod_function_call.cl
@@ -0,0 +1,91 @@
+typedef float2 vec2;
+typedef float3 vec3;
+typedef float4 vec4;
+
+#define sin native_sin
+#define cos native_cos
+#define tan native_tan
+#define normalize fast_normalize
+#define length fast_length
+#define mod fmod
+
+vec3 reflect(vec3 I, vec3 N) {
+  return I - 2.0f * dot(N, I) * N;
+}
+
+uint pack_fp4(float4 u4) {
+  uint u;
+  u = (((uint) u4.x)) |
+      (((uint) u4.y) << 8) |
+      (((uint) u4.z) << 16);
+  return u;
+}
+
+#define OUTPUT do {\
+  const vec4 final = 255.f * max(min(gl_FragColor, (vec4)(1.f)), (vec4)(0.f)); \
+  dst[get_global_id(0) + get_global_id(1) * w] = pack_fp4(final); \
+} while (0)
+
+#define time 1.f
+
+float f(vec3 o)
+{
+    float a=(sin(o.x)+o.y*.25f)*.35f;
+    o=(vec3)(cos(a)*o.x-sin(a)*o.y,sin(a)*o.x+cos(a)*o.y,o.z);
+    return dot(cos(o)*cos(o),(vec3)(1.f))-1.2f;
+}
+
+// XXX front end does not inline this function
+vec3 s(vec3 o,vec3 d)
+{
+    float t=0.0f;
+    float dt = 0.2f;
+    float nh = 0.0f;
+    float lh = 0.0f;
+    for(int i=0;i<50;i++)
+    {
+        nh = f(o+d*t);
+        if(nh>0.0f) { lh=nh; t+=dt; }
+    }
+
+    if( nh>0.0f ) return (vec3)(.93f,.94f,.85f);
+
+    t = t - dt*nh/(nh-lh);
+
+    vec3 exyy=(vec3)(0.1f,0.0f,0.0f);
+    vec3 eyxy=(vec3)(0.0f,0.1f,0.0f);
+    vec3 eyyx=(vec3)(0.0f,0.0f,0.1f);
+    vec3 p=o+d*t;
+    vec3 n=-normalize((vec3)(f(p+exyy),f(p+eyxy),f(p+eyyx))+(vec3)((sin(p*75.f)))*.01f);
+
+    return (vec3)(mix( ((max(-dot(n,(vec3)(.577f)),0.f) + 0.125f*max(-dot(n,(vec3)(-.707f,-.707f,0.f)),0.f)))*(mod
+    (length(p.xy)*20.f,2.f)<1.0f?(vec3)(.71f,.85f,.25f):(vec3)(.79f,.93f,.4f))
+                           ,(vec3)(.93f,.94f,.85f), (vec3)(pow(t/9.f,5.f)) ) );
+}
+
+#if 0
+// XXX vector type in the function arguments not supported yet
+__kernel void compiler_clod(__global uint *dst, vec2 resolution, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
+    vec2 p;
+    p.x = -1.0f + 2.0f * gl_FragCoord.x / resolution.x;
+    p.y = -1.0f + 2.0f * gl_FragCoord.y / resolution.y;
+    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
+    OUTPUT;
+}
+#else
+__kernel void compiler_clod(__global uint *dst, float resx, float resy, int w)
+{
+    vec2 gl_FragCoord = (vec2)(get_global_id(0), get_global_id(1));
+    //vec2 p = -1.0f + 2.0f * gl_FragCoord.xy / resolution.xy;
+    vec2 p;
+    p.x = -1.0f + 2.0f * gl_FragCoord.x / resx;
+    p.y = -1.0f + 2.0f * gl_FragCoord.y / resy;
+    vec4 gl_FragColor=(vec4)(s((vec3)(sin(time*1.5f)*.5f,cos(time)*.5f,time), normalize((vec3)(p.xy,1.0f))),1.0f);
+    OUTPUT;
+}
+
+#endif
+
diff --git a/kernels/compiler_overflow.cl b/kernels/compiler_overflow.cl
new file mode 100644
index 0000000..af751b7
--- /dev/null
+++ b/kernels/compiler_overflow.cl
@@ -0,0 +1,45 @@
+#define COMPILER_OVERFLOW_ADD(TYPE, FUNC) \
+    kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst)   \
+{                                                \
+  __global TYPE* A = &src0[get_global_id(0)];    \
+  __global TYPE* B = &src1[get_global_id(0)];    \
+  __global TYPE* C = &dst[get_global_id(0)];    \
+  *C = *A + *B;    \
+  TYPE carry = -convert_##TYPE(*C < *B); \
+               \
+  (*C).y += carry.x;  \
+  carry.y += ((*C).y < carry.x); \
+  (*C).z += carry.y;   \
+    \
+  carry.z += ((*C).z < carry.y); \
+  (*C).w += carry.z; \
+  carry.w += ((*C).w < carry.z); \
+}
+
+
+COMPILER_OVERFLOW_ADD(ulong4, add)
+COMPILER_OVERFLOW_ADD(uint4, add)
+COMPILER_OVERFLOW_ADD(ushort4, add)
+COMPILER_OVERFLOW_ADD(uchar4, add)
+
+#define COMPILER_OVERFLOW_SUB(TYPE, FUNC) \
+    kernel void compiler_overflow_##TYPE##_##FUNC (global TYPE* src0, global TYPE* src1, global TYPE* dst)   \
+{                                                \
+  __global TYPE* A = &src0[get_global_id(0)];    \
+  __global TYPE* B = &src1[get_global_id(0)];    \
+  __global TYPE* C = &dst[get_global_id(0)];    \
+  TYPE borrow; \
+  unsigned result; \
+  size_t num = sizeof(*A)/sizeof((*A)[0]); \
+  for (uint i = 0; i < num; i++ ) {\
+     borrow[i] = __builtin_usub_overflow((*A)[i], (*B)[i], &result); \
+     (*C)[i] = result;  \
+   }\
+\
+  for (uint i = 0; i < num-1; i++ ) {\
+    borrow[i+1] += (*C)[i+1] < borrow[i];(*C)[i+1] -= borrow[i]; \
+  }\
+\
+}
+
+COMPILER_OVERFLOW_SUB(uint4, sub)
diff --git a/kernels/compiler_popcount.cl b/kernels/compiler_popcount.cl
new file mode 100644
index 0000000..1636118
--- /dev/null
+++ b/kernels/compiler_popcount.cl
@@ -0,0 +1,16 @@
+#define TEST_TYPE(TYPE)                                       \
+kernel void test_##TYPE(global TYPE *src, global TYPE *dst) { \
+  int i = get_global_id(0);                                   \
+  dst[i] = popcount(src[i]);                                  \
+}
+
+TEST_TYPE(char)
+TEST_TYPE(uchar)
+TEST_TYPE(short)
+TEST_TYPE(ushort)
+TEST_TYPE(int)
+TEST_TYPE(uint)
+TEST_TYPE(long)
+TEST_TYPE(ulong)
+
+#undef TEST_TYPE
diff --git a/kernels/compiler_time_stamp.cl b/kernels/compiler_time_stamp.cl
new file mode 100644
index 0000000..f66da58
--- /dev/null
+++ b/kernels/compiler_time_stamp.cl
@@ -0,0 +1,28 @@
+__kernel void
+compiler_time_stamp(__global int *src, __global int *dst)
+{
+  int i;
+  int final[16];
+  struct time_stamp t1, t2, t3;
+  t1 = __gen_ocl_get_timestamp();
+  for (i = 0; i < 16; ++i) {
+    int array[16], j;
+    for (j = 0; j < 16; ++j)
+      array[j] = get_global_id(0);
+    for (j = 0; j < src[0]; ++j)
+      array[j] = 1+src[j];
+    final[i] = array[i];
+    if(i == 7)
+      t2 = __gen_ocl_get_timestamp();
+  }
+  t3 = __gen_ocl_get_timestamp();
+  // currently printf does not support long type.
+  // printf("tmEvt %d %d %d  tmDiff %lu %lu\n", t3-t1, t2-t1);
+
+  // time_stamp.event maybe not zero, then the time diff is not accurate,
+  // because a time event occurs before the time stamp.
+  printf("tmEvt %d %d %d  tmDiff %u %u\n", t1.event, t2.event, t3.event,
+        (uint)(t3.tick-t1.tick), (uint)(t2.tick-t1.tick));
+
+  dst[get_global_id(0)] = final[get_global_id(0)];
+}
diff --git a/kernels/include/runtime_compile_link_inc.h b/kernels/include/runtime_compile_link_inc.h
index 9b66850..4011278 100644
--- a/kernels/include/runtime_compile_link_inc.h
+++ b/kernels/include/runtime_compile_link_inc.h
@@ -1,4 +1,4 @@
-int greater(long x, long y)
+inline int greater(long x, long y)
 {
   return x > y ;
 }
diff --git a/kernels/runtime_use_host_ptr_buffer.cl b/kernels/runtime_use_host_ptr_buffer.cl
new file mode 100644
index 0000000..dbaadf8
--- /dev/null
+++ b/kernels/runtime_use_host_ptr_buffer.cl
@@ -0,0 +1,6 @@
+__kernel void
+runtime_use_host_ptr_buffer(__global int* buf)
+{
+  int id = (int)get_global_id(0);
+  buf[id] = buf[id] / 2;
+}
diff --git a/kernels/sample.bmp b/kernels/sample.bmp
new file mode 100644
index 0000000..4e54bae
Binary files /dev/null and b/kernels/sample.bmp differ
diff --git a/kernels/set_kernel_arg.cl b/kernels/set_kernel_arg.cl
new file mode 100644
index 0000000..71cf521
--- /dev/null
+++ b/kernels/set_kernel_arg.cl
@@ -0,0 +1,20 @@
+__kernel void
+set_kernel_arg(__global unsigned int *dst, float3 src)
+{
+  size_t gid = get_global_id(0);
+
+  switch (gid%3)
+  {
+    case 0:
+        dst[gid] = src.x;
+      break;
+    case 1:
+        dst[gid] = src.y;
+      break;
+    case 2:
+        dst[gid] = src.z;
+      break;
+    default:
+      break;
+  }
+}
diff --git a/kernels/test_fill_image_1d_array.cl b/kernels/test_fill_image_1d_array.cl
new file mode 100644
index 0000000..ee742f4
--- /dev/null
+++ b/kernels/test_fill_image_1d_array.cl
@@ -0,0 +1,11 @@
+__kernel void
+test_fill_image_1d_array(__write_only image1d_array_t dst)
+{
+  int coordx;
+  int coordy;
+  coordx = (int)get_global_id(0);
+  coordy = (int)get_global_id(1);
+  uint4 color4 = {0, 1, 2 ,3};
+  if (coordy < 7)
+    write_imageui(dst, (int2)(coordx, coordy), color4);
+}
diff --git a/kernels/test_fill_image_2d_array.cl b/kernels/test_fill_image_2d_array.cl
new file mode 100644
index 0000000..e756010
--- /dev/null
+++ b/kernels/test_fill_image_2d_array.cl
@@ -0,0 +1,13 @@
+__kernel void
+test_fill_image_2d_array(__write_only image2d_array_t dst)
+{
+  int coordx;
+  int coordy;
+  int coordz;
+  coordx = (int)get_global_id(0);
+  coordy = (int)get_global_id(1);
+  coordz = (int)get_global_id(2);
+  uint4 color4 = {0, 1, 2 ,3};
+  if (coordz < 7)
+    write_imageui(dst, (int3)(coordx, coordy, coordz), color4);
+}
diff --git a/kernels/test_printf.cl b/kernels/test_printf.cl
index 84bb478..0a59e88 100644
--- a/kernels/test_printf.cl
+++ b/kernels/test_printf.cl
@@ -4,35 +4,43 @@ test_printf(void)
   int x = (int)get_global_id(0);
   int y = (int)get_global_id(1);
   int z = (int)get_global_id(2);
+  int g0 = (int)get_global_size(0);
+  int g1 = (int)get_global_size(1);
   uint a = 'x';
   float f = 5.0f;
   int3 vec;
+  ulong cc = 1004294967296;
   vec.x = x;
   vec.y = y;
   vec.z = z;
 
   if (x == 0 && y == 0 && z == 0) {
     printf("--- Welcome to the printf test of %s ---\n", "Intel Beignet");
-
     printf("### output a char is %c\n", a);
+    printf("@@@ A long value is %ld\n", cc);
   }
 
-  if (x % 15 == 0)
-    if (y % 3 == 0)
-      if (z % 7 == 0)
-        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
-                vec, get_global_size(0), get_global_size(1), get_global_size(2));
+  for(int i = 0; i < g0/2; i++)
+    for(int j = 0; j < g1/2; j++)
+      if(x == 0 && y == 0 && z == 0)
+        printf("loops: i = %d, j = %d\n", i, j);
 
-  if (x == 1)
+  if (x == 0) {
     if (y == 0) {
       if (z % 2 == 0)
-          printf("#### output a float is %f\n", f);
+          printf("!!! output a float is %f\n", f);
       else
-          printf("#### output a float to int is %d\n", f);
+          printf("!!! output a float to int is %d\n", f);
     }
+  }
+
+  if (x % 15 == 0)
+    if (y % 3 == 0)
+      if (z % 7 == 0)
+        printf("######## global_id(x, y, z) = %v3d, global_size(d0, d1, d3) = (%d, %d, %d)\n",
+                vec, get_global_size(0), get_global_size(1), get_global_size(2));
 
   if (x == 0 && y == 0 && z == 0) {
     printf("--- End to the printf test ---\n");
   }
-
 }
diff --git a/kernels/vload_bench.cl b/kernels/vload_bench.cl
new file mode 100644
index 0000000..c906c75
--- /dev/null
+++ b/kernels/vload_bench.cl
@@ -0,0 +1,33 @@
+#define VLOAD_BENCH(T, N, M) \
+__kernel void \
+vload_bench_##M ##T ##N(__global T* src, __global uint* dst, uint offset) \
+{ \
+  int id = (int)get_global_id(0); \
+  uint ##N srcV = 0; \
+  for(int i = 0; i < M; i++) \
+  { \
+    srcV += convert_uint ##N(vload ##N(id + (i & 0xFFFF), src + offset)); \
+  } \
+  vstore ##N(srcV, id, dst);\
+  /*if (id < 16)*/ \
+  /*printf("id %d %d %d\n", id, srcV.s0, srcV.s1);*/ \
+}
+
+#define VLOAD_BENCH_ALL_VECTOR(T, N_ITERATIONS) \
+               VLOAD_BENCH(T, 2, N_ITERATIONS)  \
+               VLOAD_BENCH(T, 3, N_ITERATIONS)  \
+               VLOAD_BENCH(T, 4, N_ITERATIONS)  \
+               VLOAD_BENCH(T, 8, N_ITERATIONS)  \
+               VLOAD_BENCH(T, 16, N_ITERATIONS)
+
+#define VLOAD_BENCH_ALL_TYPES(N_ITERATIONS)     \
+   VLOAD_BENCH_ALL_VECTOR(uchar, N_ITERATIONS)  \
+   VLOAD_BENCH_ALL_VECTOR(char, N_ITERATIONS)   \
+   VLOAD_BENCH_ALL_VECTOR(ushort, N_ITERATIONS) \
+   VLOAD_BENCH_ALL_VECTOR(short, N_ITERATIONS)  \
+   VLOAD_BENCH_ALL_VECTOR(uint, N_ITERATIONS)   \
+   VLOAD_BENCH_ALL_VECTOR(int, N_ITERATIONS)    \
+   VLOAD_BENCH_ALL_VECTOR(float, N_ITERATIONS)
+
+VLOAD_BENCH_ALL_TYPES(1)
+VLOAD_BENCH_ALL_TYPES(10000)
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index ce16a8c..7182bad 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -42,7 +42,8 @@ set (KERNEL_STR_FILES)
 set (KERNEL_NAMES cl_internal_copy_buf_align4
 cl_internal_copy_buf_align16 cl_internal_copy_buf_unalign_same_offset
 cl_internal_copy_buf_unalign_dst_offset cl_internal_copy_buf_unalign_src_offset
-cl_internal_copy_buf_rect cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
+cl_internal_copy_buf_rect cl_internal_copy_buf_rect_align4
+cl_internal_copy_image_1d_to_1d cl_internal_copy_image_2d_to_2d
 cl_internal_copy_image_3d_to_2d cl_internal_copy_image_2d_to_3d cl_internal_copy_image_3d_to_3d
 cl_internal_copy_image_2d_to_buffer cl_internal_copy_image_3d_to_buffer
 cl_internal_copy_buffer_to_image_2d cl_internal_copy_buffer_to_image_3d
@@ -108,10 +109,22 @@ SET(CMAKE_CXX_FLAGS "-DHAS_OCLIcd ${CMAKE_CXX_FLAGS}")
 SET(CMAKE_C_FLAGS "-DHAS_OCLIcd ${CMAKE_C_FLAGS}")
 endif (OCLIcd_FOUND)
 
+if (DRM_INTEL_USERPTR)
+SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_USERPTR)
+
+set(GIT_SHA1 "git_sha1.h")
+add_custom_target(${GIT_SHA1} ALL
+  COMMAND chmod +x ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh
+  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/git_sha1.sh ${CMAKE_CURRENT_SOURCE_DIR} ${GIT_SHA1}
+)
+
 SET(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-Bsymbolic,--allow-shlib-undefined")
 
 link_directories (${LLVM_LIBRARY_DIR} ${DRM_LIBDIR})
 add_library(cl SHARED ${OPENCL_SRC})
+ADD_DEPENDENCIES(cl ${GIT_SHA1})
 target_link_libraries(
                       cl
                       ${X11_LIBRARIES}
diff --git a/src/cl_alloc.c b/src/cl_alloc.c
index 93d2e6a..e532569 100644
--- a/src/cl_alloc.c
+++ b/src/cl_alloc.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_alloc.h b/src/cl_alloc.h
index 9b463ed..433ffc6 100644
--- a/src/cl_alloc.h
+++ b/src/cl_alloc.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_api.c b/src/cl_api.c
index 630511f..972c687 100644
--- a/src/cl_api.c
+++ b/src/cl_api.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -90,10 +90,10 @@ handle_events(cl_command_queue queue, cl_int num, const cl_event *wait_list,
 }
 
 /* The following code checking overlap is from Appendix of openCL spec 1.1 */
-inline cl_bool check_copy_overlap(const size_t src_offset[3],
-                                  const size_t dst_offset[3],
-                                  const size_t region[3],
-                                  size_t row_pitch, size_t slice_pitch)
+cl_bool check_copy_overlap(const size_t src_offset[3],
+                           const size_t dst_offset[3],
+                           const size_t region[3],
+                           size_t row_pitch, size_t slice_pitch)
 {
   const size_t src_min[] = {src_offset[0], src_offset[1], src_offset[2]};
   const size_t src_max[] = {src_offset[0] + region[0],
@@ -1903,7 +1903,7 @@ clEnqueueFillBuffer(cl_command_queue   command_queue,
     goto error;
   }
 
-  if (offset < 0 || offset + size > buffer->size) {
+  if (offset + size > buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -1985,11 +1985,11 @@ clEnqueueCopyBuffer(cl_command_queue     command_queue,
     goto error;
   }
 
-  if (src_offset < 0 || src_offset + cb > src_buffer->size) {
+  if (src_offset + cb > src_buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
-  if (dst_offset < 0 || dst_offset + cb > dst_buffer->size) {
+  if (dst_offset + cb > dst_buffer->size) {
     err = CL_INVALID_VALUE;
     goto error;
   }
@@ -2653,6 +2653,8 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
   data->size        = size;
   data->ptr         = ptr;
   data->unsync_map  = 1;
+  if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
+    data->write_map = 1;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_BUFFER) == CL_ENQUEUE_EXECUTE_IMM) {
@@ -2663,9 +2665,13 @@ clEnqueueMapBuffer(cl_command_queue  command_queue,
     ptr = data->ptr;
     if(event) cl_event_set_status(*event, CL_COMPLETE);
   } else {
-    if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
-      err = CL_MAP_FAILURE;
-      goto error;
+    if (buffer->is_userptr)
+      ptr = buffer->host_ptr;
+    else {
+      if ((ptr = cl_mem_map_gtt_unsync(buffer)) == NULL) {
+        err = CL_MAP_FAILURE;
+        goto error;
+      }
     }
   }
   err = _cl_map_mem(buffer, ptr, &mem_ptr, offset, size, NULL, NULL);
@@ -2735,6 +2741,8 @@ clEnqueueMapImage(cl_command_queue   command_queue,
   data->region[0]   = region[0];  data->region[1] = region[1];  data->region[2] = region[2];
   data->ptr         = ptr;
   data->unsync_map  = 1;
+  if (map_flags & (CL_MAP_WRITE | CL_MAP_WRITE_INVALIDATE_REGION))
+    data->write_map = 1;
 
   if(handle_events(command_queue, num_events_in_wait_list, event_wait_list,
                    event, data, CL_COMMAND_MAP_IMAGE) == CL_ENQUEUE_EXECUTE_IMM) {
@@ -3203,7 +3211,7 @@ clMapBufferIntel(cl_mem mem, cl_int *errcode_ret)
   void *ptr = NULL;
   cl_int err = CL_SUCCESS;
   CHECK_MEM (mem);
-  ptr = cl_mem_map(mem);
+  ptr = cl_mem_map(mem, 1);
 error:
   if (errcode_ret)
     *errcode_ret = err;
diff --git a/src/cl_command_queue.c b/src/cl_command_queue.c
index 0be37a7..12530d7 100644
--- a/src/cl_command_queue.c
+++ b/src/cl_command_queue.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -149,7 +149,7 @@ cl_command_queue_bind_image(cl_command_queue queue, cl_kernel k)
       cl_gpgpu_bind_image(gpgpu, k->images[i].idx + BTI_MAX_IMAGE_NUM, image->base.bo, image->offset,
                           image->intel_fmt, image->image_type,
                           image->w, image->h, image->depth,
-                          image->row_pitch, image->tiling);
+                          image->row_pitch, (cl_gpgpu_tiling)image->tiling);
   }
   return CL_SUCCESS;
 }
@@ -336,7 +336,7 @@ cl_fulsim_read_all_surfaces(cl_command_queue queue, cl_kernel k)
     assert(mem->bo);
     chunk_n = cl_buffer_get_size(mem->bo) / chunk_sz;
     chunk_remainder = cl_buffer_get_size(mem->bo) % chunk_sz;
-    to = cl_mem_map(mem);
+    to = cl_mem_map(mem, 1);
     for (j = 0; j < chunk_n; ++j) {
       char name[256];
       sprintf(name, "dump%03i.bmp", curr);
@@ -410,7 +410,7 @@ cl_command_queue_ND_range(cl_command_queue queue,
   }
 #endif /* USE_FULSIM */
 
-  if (ver == 7 || ver == 75)
+  if (ver == 7 || ver == 75 || ver == 8)
     TRY (cl_command_queue_ND_range_gen7, queue, k, work_dim, global_wk_off, global_wk_sz, local_wk_sz);
   else
     FATAL ("Unknown Gen Device");
diff --git a/src/cl_command_queue.h b/src/cl_command_queue.h
index bd70f25..7ec1b6f 100644
--- a/src/cl_command_queue.h
+++ b/src/cl_command_queue.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_command_queue_gen7.c b/src/cl_command_queue_gen7.c
index 330f0f9..ba015ca 100644
--- a/src/cl_command_queue_gen7.c
+++ b/src/cl_command_queue_gen7.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -109,7 +109,11 @@ cl_upload_constant_buffer(cl_command_queue queue, cl_kernel ker)
   gbe_program prog = ker->program->opaque;
   const int32_t arg_n = interp_kernel_get_arg_num(ker->opaque);
   size_t global_const_size = interp_program_get_global_constant_size(prog);
-  aligned_size = raw_size = global_const_size;
+  raw_size = global_const_size;
+  // Surface state needs 4-byte alignment, and each constant argument's buffer
+  // size is already aligned to 4 bytes at allocation, so aligning the global
+  // constant size to 4 keeps the final aligned_size 4-byte aligned.
+  aligned_size =  ALIGN(raw_size, 4);
   /* Reserve 8 bytes to get rid of 0 address */
   if(global_const_size == 0) aligned_size = 8;
 
@@ -271,6 +275,14 @@ cl_bind_printf(cl_gpgpu gpgpu, cl_kernel ker, void* printf_info, int printf_num,
   value = GBE_CURBE_PRINTF_BUF_POINTER;
   offset = interp_kernel_get_curbe_offset(ker->opaque, value, 0);
   buf_size = interp_get_printf_sizeof_size(printf_info) * global_sz;
+  /* A printf may sit inside a loop whose trip count cannot be determined by
+     static analysis, so we make the data buffer as big as we can. Out-of-bounds
+     printf info will be discarded. */
+  if (buf_size < 1*1024)
+    buf_size = 1*1024*1024;
+  else
+    buf_size = 4*1024*1024; //at most.
+
   if (offset > 0) {
     if (cl_gpgpu_set_printf_buffer(gpgpu, 1, buf_size, offset, interp_get_printf_buf_bti(printf_info)) != 0)
       return -1;
@@ -309,7 +321,10 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   kernel.use_slm = interp_kernel_use_slm(ker->opaque);
 
   /* Compute the number of HW threads we need */
-  TRY (cl_kernel_work_group_sz, ker, local_wk_sz, 3, &local_sz);
+  if(UNLIKELY((err = cl_kernel_work_group_sz(ker, local_wk_sz, 3, &local_sz)) != CL_SUCCESS)) { /* parenthesized: store the status first, then compare it */
+    fprintf(stderr, "Beignet: Work group size exceeds Kernel's work group size.\n");
+    return err; /* the real CL error code, not the 0/1 result of the comparison */
+  }
   kernel.thread_n = thread_n = (local_sz + simd_sz - 1) / simd_sz;
   kernel.curbe_sz = cst_sz;
 
@@ -387,8 +402,7 @@ cl_command_queue_ND_range_gen7(cl_command_queue queue,
   return CL_SUCCESS;
 
 error:
-  fprintf(stderr, "error occured. \n");
-  exit(-1);
+  /* Only internal command/buffer errors reach here, so return CL_OUT_OF_RESOURCES. */
   return CL_OUT_OF_RESOURCES;
 }
 
diff --git a/src/cl_context.c b/src/cl_context.c
index 152faf3..0f08e6a 100644
--- a/src/cl_context.c
+++ b/src/cl_context.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_context.h b/src/cl_context.h
index 75afbf6..38ad2fd 100644
--- a/src/cl_context.h
+++ b/src/cl_context.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,9 +20,9 @@
 #ifndef __CL_CONTEXT_H__
 #define __CL_CONTEXT_H__
 
+#include "CL/cl.h"
 #include "cl_internals.h"
 #include "cl_driver.h"
-#include "CL/cl.h"
 #include "cl_khr_icd.h"
 
 #include <stdint.h>
@@ -47,6 +47,7 @@ enum _cl_internal_ker_type {
   CL_ENQUEUE_COPY_BUFFER_UNALIGN_DST_OFFSET,
   CL_ENQUEUE_COPY_BUFFER_UNALIGN_SRC_OFFSET,
   CL_ENQUEUE_COPY_BUFFER_RECT,
+  CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
   CL_ENQUEUE_COPY_IMAGE_1D_TO_1D,             //copy image 1d to image 1d
   CL_ENQUEUE_COPY_IMAGE_2D_TO_2D,             //copy image 2d to image 2d
   CL_ENQUEUE_COPY_IMAGE_3D_TO_2D,             //copy image 3d to image 2d
diff --git a/src/cl_device_data.h b/src/cl_device_data.h
index 28bd5f0..0d25ca4 100644
--- a/src/cl_device_data.h
+++ b/src/cl_device_data.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -155,7 +155,6 @@
 #define PCI_CHIP_HASWELL_CRW_E2      0x0D1E /* CRW GT2 */
 #define PCI_CHIP_HASWELL_CRW_E3      0x0D2E /* CRW GT3 */
 
-
 #define IS_HASWELL(devid) (  \
 	(devid) == PCI_CHIP_HASWELL_D1 || (devid) == PCI_CHIP_HASWELL_D2 || \
 	(devid) == PCI_CHIP_HASWELL_D3 || (devid) == PCI_CHIP_HASWELL_S1 || \
@@ -190,5 +189,46 @@
 
 #define IS_GEN75(devid)  IS_HASWELL(devid)
 
+/* BRW */
+#define PCI_CHIP_BROADWLL_M_GT1       0x1602 /* Intel(R) Broadwell Mobile - Halo (EDRAM) - GT1 */
+#define PCI_CHIP_BROADWLL_D_GT1       0x1606 /* Intel(R) Broadwell U-Processor - GT1 */
+#define PCI_CHIP_BROADWLL_S_GT1       0x160A /* Intel(R) Broadwell Server - GT1 */
+#define PCI_CHIP_BROADWLL_W_GT1       0x160D /* Intel(R) Broadwell Workstation - GT1 */
+#define PCI_CHIP_BROADWLL_U_GT1       0x160E /* Intel(R) Broadwell ULX - GT1 */
+#define PCI_CHIP_BROADWLL_M_GT2       0x1612 /* Intel(R) Broadwell Mobile - Halo (EDRAM) - GT2 */
+#define PCI_CHIP_BROADWLL_D_GT2       0x1616 /* Intel(R) Broadwell U-Processor - GT2 */
+#define PCI_CHIP_BROADWLL_S_GT2       0x161A /* Intel(R) Broadwell Server - GT2 */
+#define PCI_CHIP_BROADWLL_W_GT2       0x161D /* Intel(R) Broadwell Workstation - GT2 */
+#define PCI_CHIP_BROADWLL_U_GT2       0x161E /* Intel(R) Broadwell ULX - GT2 */
+#define PCI_CHIP_BROADWLL_M_GT3       0x1622 /* Intel(R) Broadwell Mobile - Halo (EDRAM) - GT3 */
+#define PCI_CHIP_BROADWLL_D_GT3       0x1626 /* Intel(R) Broadwell U-Processor - GT3 */
+#define PCI_CHIP_BROADWLL_S_GT3       0x162A /* Intel(R) Broadwell Server - GT3 */
+#define PCI_CHIP_BROADWLL_W_GT3       0x162D /* Intel(R) Broadwell Workstation - GT3 */
+#define PCI_CHIP_BROADWLL_U_GT3       0x162E /* Intel(R) Broadwell ULX - GT3 */
+
+#define IS_BRW_GT1(devid)  /* nonzero when devid is one of the Broadwell GT1 PCI ids */ \
+  ((devid) == PCI_CHIP_BROADWLL_M_GT1 || \
+   (devid) == PCI_CHIP_BROADWLL_D_GT1 || \
+   (devid) == PCI_CHIP_BROADWLL_S_GT1 || \
+   (devid) == PCI_CHIP_BROADWLL_W_GT1 || \
+   (devid) == PCI_CHIP_BROADWLL_U_GT1)
+
+#define IS_BRW_GT2(devid)  /* nonzero when devid is one of the Broadwell GT2 PCI ids */ \
+  ((devid) == PCI_CHIP_BROADWLL_M_GT2 || \
+   (devid) == PCI_CHIP_BROADWLL_D_GT2 || \
+   (devid) == PCI_CHIP_BROADWLL_S_GT2 || \
+   (devid) == PCI_CHIP_BROADWLL_W_GT2 || \
+   (devid) == PCI_CHIP_BROADWLL_U_GT2)
+
+#define IS_BRW_GT3(devid)  /* nonzero when devid is one of the Broadwell GT3 PCI ids */ \
+  ((devid) == PCI_CHIP_BROADWLL_M_GT3 || \
+   (devid) == PCI_CHIP_BROADWLL_D_GT3 || \
+   (devid) == PCI_CHIP_BROADWLL_S_GT3 || \
+   (devid) == PCI_CHIP_BROADWLL_W_GT3 || \
+   (devid) == PCI_CHIP_BROADWLL_U_GT3)
+
+#define IS_BROADWELL(devid) (IS_BRW_GT1(devid) || IS_BRW_GT2(devid) || IS_BRW_GT3(devid))
+#define IS_GEN8(devid)      IS_BROADWELL(devid)
+
 #endif /* __CL_DEVICE_DATA_H__ */
 
diff --git a/src/cl_device_id.c b/src/cl_device_id.c
index ee3f2b7..522c3c5 100644
--- a/src/cl_device_id.c
+++ b/src/cl_device_id.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -31,6 +31,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
+#include <stdlib.h>
 
 #ifndef CL_VERSION_1_2
 #define CL_DEVICE_BUILT_IN_KERNELS 0x103F
@@ -40,6 +41,7 @@ static struct _cl_device_id intel_ivb_gt2_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 16,
   .max_thread_per_unit = 8,
+  .sub_slice_count = 2,
   .max_work_item_sizes = {1024, 1024, 1024},
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
@@ -50,6 +52,7 @@ static struct _cl_device_id intel_ivb_gt1_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 6,
   .max_thread_per_unit = 6,
+  .sub_slice_count = 1,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
@@ -60,6 +63,7 @@ static struct _cl_device_id intel_baytrail_t_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 4,
   .max_thread_per_unit = 8,
+  .sub_slice_count = 1,
   .max_work_item_sizes = {512, 512, 512},
   .max_work_group_size = 512,
   .max_clock_frequency = 1000,
@@ -71,6 +75,7 @@ static struct _cl_device_id intel_hsw_gt1_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 10,
   .max_thread_per_unit = 7,
+  .sub_slice_count = 1,
   .max_work_item_sizes = {1024, 1024, 1024},
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
@@ -81,6 +86,7 @@ static struct _cl_device_id intel_hsw_gt2_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 20,
   .max_thread_per_unit = 7,
+  .sub_slice_count = 2,
   .max_work_item_sizes = {1024, 1024, 1024},
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
@@ -91,12 +97,48 @@ static struct _cl_device_id intel_hsw_gt3_device = {
   INIT_ICD(dispatch)
   .max_compute_unit = 40,
   .max_thread_per_unit = 7,
+  .sub_slice_count = 4,
   .max_work_item_sizes = {1024, 1024, 1024},
   .max_work_group_size = 1024,
   .max_clock_frequency = 1000,
 #include "cl_gen75_device.h"
 };
 
+/* XXX we clone HSW for BRW now */
+static struct _cl_device_id intel_brw_gt1_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 12,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 2,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_brw_gt2_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 24,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 3,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+static struct _cl_device_id intel_brw_gt3_device = {
+  INIT_ICD(dispatch)
+  .max_compute_unit = 48,
+  .max_thread_per_unit = 7,
+  .sub_slice_count = 6,
+  .max_work_item_sizes = {1024, 1024, 1024},
+  .max_work_group_size = 512,
+  .max_clock_frequency = 1000,
+#include "cl_gen75_device.h"
+};
+
+
 LOCAL cl_device_id
 cl_get_gt_device(void)
 {
@@ -286,6 +328,54 @@ baytrail_t_device_break:
       ret = &intel_baytrail_t_device;
       break;
 
+    case PCI_CHIP_BROADWLL_M_GT1:
+      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell Mobile GT1");
+    case PCI_CHIP_BROADWLL_D_GT1:
+      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell U-Processor GT1");
+    case PCI_CHIP_BROADWLL_S_GT1:
+      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell Server GT1");
+    case PCI_CHIP_BROADWLL_W_GT1:
+      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell Workstation GT1");
+    case PCI_CHIP_BROADWLL_U_GT1:
+      DECL_INFO_STRING(brw_gt1_break, intel_brw_gt1_device, name, "Intel(R) HD Graphics BroadWell ULX GT1");
+brw_gt1_break:
+      intel_brw_gt1_device.vendor_id = device_id;
+      intel_brw_gt1_device.platform = intel_platform;
+      ret = &intel_brw_gt1_device;
+      break;
+
+    case PCI_CHIP_BROADWLL_M_GT2:
+      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell Mobile GT2");
+    case PCI_CHIP_BROADWLL_D_GT2:
+      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell U-Processor GT2");
+    case PCI_CHIP_BROADWLL_S_GT2:
+      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell Server GT2");
+    case PCI_CHIP_BROADWLL_W_GT2:
+      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell Workstation GT2");
+    case PCI_CHIP_BROADWLL_U_GT2:
+      DECL_INFO_STRING(brw_gt2_break, intel_brw_gt2_device, name, "Intel(R) HD Graphics BroadWell ULX GT2");
+brw_gt2_break:
+      intel_brw_gt2_device.vendor_id = device_id;
+      intel_brw_gt2_device.platform = intel_platform;
+      ret = &intel_brw_gt2_device;
+      break;
+
+    case PCI_CHIP_BROADWLL_M_GT3:
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Mobile GT3");
+    case PCI_CHIP_BROADWLL_D_GT3:
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell U-Processor GT3");
+    case PCI_CHIP_BROADWLL_S_GT3:
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Server GT3");
+    case PCI_CHIP_BROADWLL_W_GT3:
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell Workstation GT3");
+    case PCI_CHIP_BROADWLL_U_GT3:
+      DECL_INFO_STRING(brw_gt3_break, intel_brw_gt3_device, name, "Intel(R) HD Graphics BroadWell ULX GT3");
+brw_gt3_break: /* shared tail for every GT3 id above */
+      intel_brw_gt3_device.vendor_id = device_id; /* note: the PCI device id is stored in the vendor_id field */
+      intel_brw_gt3_device.platform = intel_platform;
+      ret = &intel_brw_gt3_device;
+      break;
+
     case PCI_CHIP_SANDYBRIDGE_BRIDGE:
     case PCI_CHIP_SANDYBRIDGE_GT1:
     case PCI_CHIP_SANDYBRIDGE_GT2:
@@ -312,6 +402,26 @@ baytrail_t_device_break:
     }
   }
 
+#ifdef HAS_USERPTR
+  cl_driver dummy = cl_driver_new(NULL);
+  cl_buffer_mgr bufmgr = cl_driver_get_bufmgr(dummy);
+
+  const size_t sz = 4096;
+  void* host_ptr = NULL;
+  int err = posix_memalign(&host_ptr, 4096, sz);
+  if (err == 0) {
+    cl_buffer bo = cl_buffer_alloc_userptr(bufmgr, "CL memory object", host_ptr, sz, 0);
+    if (bo == NULL)
+      ret->host_unified_memory = CL_FALSE;
+    else
+      cl_buffer_unreference(bo);
+    free(host_ptr);
+  }
+  else
+    ret->host_unified_memory = CL_FALSE;
+  cl_driver_delete(dummy);
+#endif
+
   return ret;
 }
 
@@ -380,7 +490,10 @@ cl_get_device_info(cl_device_id     device,
                device != &intel_baytrail_t_device &&
                device != &intel_hsw_gt1_device &&
                device != &intel_hsw_gt2_device &&
-               device != &intel_hsw_gt3_device
+               device != &intel_hsw_gt3_device &&
+               device != &intel_brw_gt1_device &&
+               device != &intel_brw_gt2_device &&
+               device != &intel_brw_gt3_device
                ))
     return CL_INVALID_DEVICE;
 
@@ -482,7 +595,10 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
                device != &intel_baytrail_t_device &&
                device != &intel_hsw_gt1_device &&
                device != &intel_hsw_gt2_device &&
-               device != &intel_hsw_gt3_device))
+               device != &intel_hsw_gt3_device &&
+               device != &intel_brw_gt1_device &&
+               device != &intel_brw_gt2_device &&
+               device != &intel_brw_gt3_device))
     return CL_INVALID_DEVICE;
   if (ver == NULL)
     return CL_SUCCESS;
@@ -493,6 +609,9 @@ cl_device_get_version(cl_device_id device, cl_int *ver)
   } else if (device == &intel_hsw_gt1_device || device == &intel_hsw_gt2_device
         || device == &intel_hsw_gt3_device) {
     *ver = 75;
+  } else if (device == &intel_brw_gt1_device || device == &intel_brw_gt2_device
+        || device == &intel_brw_gt3_device) {
+    *ver = 8;
   } else
     return CL_INVALID_VALUE;
 
@@ -535,7 +654,7 @@ cl_check_builtin_kernel_dimension(cl_kernel kernel, cl_device_id device)
 LOCAL size_t
 cl_get_kernel_max_wg_sz(cl_kernel kernel)
 {
-  size_t work_group_size;
+  size_t work_group_size, thread_cnt;
   int simd_width = interp_kernel_get_simd_width(kernel->opaque);
   int vendor_id = kernel->program->ctx->device->vendor_id;
   if (!interp_kernel_use_slm(kernel->opaque)) {
@@ -544,9 +663,15 @@ cl_get_kernel_max_wg_sz(cl_kernel kernel)
     else
       work_group_size = kernel->program->ctx->device->max_compute_unit *
                         kernel->program->ctx->device->max_thread_per_unit * simd_width;
-  } else
-    work_group_size = kernel->program->ctx->device->max_work_group_size /
-                      (16 / simd_width);
+  } else {
+    thread_cnt = kernel->program->ctx->device->max_compute_unit *
+                 kernel->program->ctx->device->max_thread_per_unit / kernel->program->ctx->device->sub_slice_count;
+    if(thread_cnt > 64)
+      thread_cnt = 64;
+    work_group_size = thread_cnt * simd_width;
+    if(work_group_size > kernel->program->ctx->device->max_work_group_size)
+      work_group_size = kernel->program->ctx->device->max_work_group_size;
+  }
   return work_group_size;
 }
 
@@ -565,7 +690,10 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
                device != &intel_baytrail_t_device &&
                device != &intel_hsw_gt1_device &&
                device != &intel_hsw_gt2_device &&
-               device != &intel_hsw_gt3_device))
+               device != &intel_hsw_gt3_device &&
+               device != &intel_brw_gt1_device &&
+               device != &intel_brw_gt2_device &&
+               device != &intel_brw_gt3_device))
     return CL_INVALID_DEVICE;
 
   CHECK_KERNEL(kernel);
@@ -607,6 +735,7 @@ cl_get_kernel_workgroup_info(cl_kernel kernel,
 
         return CL_SUCCESS;
       }
+      return CL_SUCCESS;
     default:
       return CL_INVALID_VALUE;
   };
diff --git a/src/cl_device_id.h b/src/cl_device_id.h
index 31bce47..ee6a8e6 100644
--- a/src/cl_device_id.h
+++ b/src/cl_device_id.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -27,6 +27,7 @@ struct _cl_device_id {
   cl_uint  vendor_id;
   cl_uint  max_compute_unit;               // maximum EU number
   cl_uint  max_thread_per_unit;            // maximum EU threads per EU.
+  cl_uint  sub_slice_count;                // Device's sub slice count
   cl_uint  max_work_item_dimensions;       // should be 3.
   size_t   max_work_item_sizes[3];         // equal to maximum work group size.
   size_t   max_work_group_size;            // maximum work group size under simd16 mode.
@@ -59,7 +60,7 @@ struct _cl_device_id {
   size_t   image3d_max_width;
   size_t   image3d_max_height;
   size_t   image3d_max_depth;
-  cl_ulong image_mem_size;
+  size_t   image_mem_size;
   cl_uint  max_samplers;
   size_t   max_parameter_size;
   cl_uint  mem_base_addr_align;
diff --git a/src/cl_driver.cpp b/src/cl_driver.cpp
index 19ac4ae..03b980e 100644
--- a/src/cl_driver.cpp
+++ b/src/cl_driver.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_driver.h b/src/cl_driver.h
index 9cdba98..97ca559 100644
--- a/src/cl_driver.h
+++ b/src/cl_driver.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -81,6 +81,22 @@ typedef enum cl_llccc_cache_control {
   llccc_ucllc    = 0x3<<1
 } cl_llccc_cache_control;
 
+/* Target Cache control options for gen8 */
+typedef enum cl_target_cache_control {
+  tcc_ec_only    = 0x0<<3,
+  tcc_llc_only   = 0x1<<3,
+  tcc_llc_ec     = 0x2<<3,
+  tcc_llc_ec_l3  = 0x3<<3
+} cl_target_cache_control;
+
+/* Memory type LLC/ELLC Cache control options for gen8 */
+typedef enum cl_mtllc_cache_control {
+  mtllc_pte      = 0x0<<5,
+  mtllc_none     = 0x1<<5,
+  mtllc_wt       = 0x2<<5,
+  mtllc_wb       = 0x3<<5
+} cl_mtllc_cache_control;
+
 typedef enum gpu_command_status {
   command_queued    = 3,
   command_submitted = 2,
@@ -269,13 +285,15 @@ extern cl_gpgpu_walker_cb *cl_gpgpu_walker;
 typedef cl_buffer (cl_buffer_alloc_cb)(cl_buffer_mgr, const char*, size_t, size_t);
 extern cl_buffer_alloc_cb *cl_buffer_alloc;
 
+typedef cl_buffer (cl_buffer_alloc_userptr_cb)(cl_buffer_mgr, const char*, void *, size_t, unsigned long);
+extern cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr;
+
 /* Set a buffer's tiling mode */
 typedef cl_buffer (cl_buffer_set_tiling_cb)(cl_buffer, int tiling, size_t stride);
 extern cl_buffer_set_tiling_cb *cl_buffer_set_tiling;
 
 #include "cl_context.h"
 #include "cl_mem.h"
-typedef struct _cl_context *cl_context;
 
 typedef cl_buffer (cl_buffer_alloc_from_texture_cb)(cl_context, unsigned int, int, unsigned int,
                                                     struct _cl_mem_image *gl_image);
@@ -338,6 +356,10 @@ extern cl_buffer_unpin_cb *cl_buffer_unpin;
 typedef int (cl_buffer_subdata_cb)(cl_buffer, unsigned long, unsigned long, const void*);
 extern cl_buffer_subdata_cb *cl_buffer_subdata;
 
+/* Get data from buffer */
+typedef int (cl_buffer_get_subdata_cb)(cl_buffer, unsigned long, unsigned long, void*);
+extern cl_buffer_get_subdata_cb *cl_buffer_get_subdata;
+
 /* Wait for all pending rendering for this buffer to complete */
 typedef int (cl_buffer_wait_rendering_cb) (cl_buffer);
 extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
@@ -345,6 +367,9 @@ extern cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering;
 typedef int (cl_buffer_get_fd_cb)(cl_buffer, int *fd);
 extern cl_buffer_get_fd_cb *cl_buffer_get_fd;
 
+typedef int (cl_buffer_get_tiling_align_cb)(cl_context ctx, uint32_t tiling_mode, uint32_t dim);
+extern cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align;
+
 /* Get the device id */
 typedef int (cl_driver_get_device_id_cb)(void);
 extern cl_driver_get_device_id_cb *cl_driver_get_device_id;
diff --git a/src/cl_driver_defs.c b/src/cl_driver_defs.c
index 72f25d9..2b68539 100644
--- a/src/cl_driver_defs.c
+++ b/src/cl_driver_defs.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -29,6 +29,7 @@ LOCAL cl_driver_get_device_id_cb *cl_driver_get_device_id = NULL;
 
 /* Buffer */
 LOCAL cl_buffer_alloc_cb *cl_buffer_alloc = NULL;
+LOCAL cl_buffer_alloc_userptr_cb *cl_buffer_alloc_userptr = NULL;
 LOCAL cl_buffer_set_tiling_cb *cl_buffer_set_tiling = NULL;
 LOCAL cl_buffer_alloc_from_texture_cb *cl_buffer_alloc_from_texture = NULL;
 LOCAL cl_buffer_release_from_texture_cb *cl_buffer_release_from_texture = NULL;
@@ -44,10 +45,12 @@ LOCAL cl_buffer_get_size_cb *cl_buffer_get_size = NULL;
 LOCAL cl_buffer_pin_cb *cl_buffer_pin = NULL;
 LOCAL cl_buffer_unpin_cb *cl_buffer_unpin = NULL;
 LOCAL cl_buffer_subdata_cb *cl_buffer_subdata = NULL;
+LOCAL cl_buffer_get_subdata_cb *cl_buffer_get_subdata = NULL;
 LOCAL cl_buffer_wait_rendering_cb *cl_buffer_wait_rendering = NULL;
 LOCAL cl_buffer_get_buffer_from_libva_cb *cl_buffer_get_buffer_from_libva = NULL;
 LOCAL cl_buffer_get_image_from_libva_cb *cl_buffer_get_image_from_libva = NULL;
 LOCAL cl_buffer_get_fd_cb *cl_buffer_get_fd = NULL;
+LOCAL cl_buffer_get_tiling_align_cb *cl_buffer_get_tiling_align = NULL;
 
 /* cl_khr_gl_sharing */
 LOCAL cl_gl_acquire_texture_cb *cl_gl_acquire_texture = NULL;
diff --git a/src/cl_driver_type.h b/src/cl_driver_type.h
index 891a33c..c39d3f1 100644
--- a/src/cl_driver_type.h
+++ b/src/cl_driver_type.h
@@ -4,6 +4,8 @@
  * will allow us to make the use of a software performance simulator easier and
  * to minimize the code specific for the HW and for the simulator
  **************************************************************************/
+#ifndef __CL_DRIVER_TYPE_H__
+#define __CL_DRIVER_TYPE_H__
 
 /* Encapsulates command buffer / data buffer / kernels */
 typedef struct _cl_buffer *cl_buffer;
@@ -21,4 +23,5 @@ typedef struct _cl_gpgpu *cl_gpgpu;
 typedef struct _cl_gpgpu_event *cl_gpgpu_event;
 
 typedef struct _cl_context_prop *cl_context_prop;
-typedef struct _cl_sampler *cl_sampler;
+
+#endif
diff --git a/src/cl_enqueue.c b/src/cl_enqueue.c
index af118ad..5798e20 100644
--- a/src/cl_enqueue.c
+++ b/src/cl_enqueue.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -35,19 +35,13 @@ cl_int cl_enqueue_read_buffer(enqueue_data* data)
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
-  void* src_ptr;
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
-
-  if (!(src_ptr = cl_mem_map_auto(data->mem_obj))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  memcpy(data->ptr, (char*)src_ptr + data->offset + buffer->sub_offset, data->size);
-
-  err = cl_mem_unmap_auto(data->mem_obj);
-
-error:
+  if (!mem->is_userptr) {
+    if (cl_buffer_get_subdata(mem->bo, data->offset + buffer->sub_offset,
+			       data->size, data->ptr) != 0)
+      err = CL_MAP_FAILURE;
+  } else
+    memcpy(data->ptr, (char*)mem->host_ptr + data->offset + buffer->sub_offset, data->size);
   return err;
 }
 
@@ -66,7 +60,7 @@ cl_int cl_enqueue_read_buffer_rect(enqueue_data* data)
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  if (!(src_ptr = cl_mem_map_auto(mem))) {
+  if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
@@ -105,24 +99,13 @@ error:
 
 cl_int cl_enqueue_write_buffer(enqueue_data *data)
 {
-  cl_int err = CL_SUCCESS;
   cl_mem mem = data->mem_obj;
   assert(mem->type == CL_MEM_BUFFER_TYPE ||
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
-  void* dst_ptr;
-
-  if (!(dst_ptr = cl_mem_map_auto(data->mem_obj))) {
-    err = CL_MAP_FAILURE;
-    goto error;
-  }
-
-  memcpy((char*)dst_ptr + data->offset + buffer->sub_offset, data->const_ptr, data->size);
-
-  err = cl_mem_unmap_auto(data->mem_obj);
 
-error:
-  return err;
+  return cl_buffer_subdata(mem->bo, data->offset + buffer->sub_offset, data->size,
+			   data->const_ptr) == 0 ? CL_SUCCESS : CL_MAP_FAILURE; /* report a CL status, as cl_enqueue_read_buffer does, not the raw driver return value */
 }
 
 cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
@@ -140,7 +123,7 @@ cl_int cl_enqueue_write_buffer_rect(enqueue_data *data)
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  if (!(dst_ptr = cl_mem_map_auto(mem))) {
+  if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
@@ -188,7 +171,7 @@ cl_int cl_enqueue_read_image(enqueue_data *data)
   const size_t* origin = data->origin;
   const size_t* region = data->region;
 
-  if (!(src_ptr = cl_mem_map_auto(mem))) {
+  if (!(src_ptr = cl_mem_map_auto(mem, 0))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
@@ -231,7 +214,7 @@ cl_int cl_enqueue_write_image(enqueue_data *data)
   cl_mem mem = data->mem_obj;
   CHECK_IMAGE(mem, image);
 
-  if (!(dst_ptr = cl_mem_map_auto(mem))) {
+  if (!(dst_ptr = cl_mem_map_auto(mem, 1))) {
     err = CL_MAP_FAILURE;
     goto error;
   }
@@ -256,11 +239,15 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
          mem->type == CL_MEM_SUBBUFFER_TYPE);
   struct _cl_mem_buffer* buffer = (struct _cl_mem_buffer*)mem;
 
-  if(data->unsync_map == 1)
-    //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
-    ptr = cl_mem_map_gtt(mem);
-  else
-    ptr = cl_mem_map_auto(mem);
+  if (mem->is_userptr)
+    ptr = mem->host_ptr;
+  else {
+    if(data->unsync_map == 1)
+      //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
+      ptr = cl_mem_map_gtt(mem);
+    else
+      ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
+  }
 
   if (ptr == NULL) {
     err = CL_MAP_FAILURE;
@@ -268,7 +255,7 @@ cl_int cl_enqueue_map_buffer(enqueue_data *data)
   }
   data->ptr = ptr;
 
-  if(mem->flags & CL_MEM_USE_HOST_PTR) {
+  if((mem->flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr) {
     assert(mem->host_ptr);
     ptr = (char*)ptr + data->offset + buffer->sub_offset;
     memcpy(mem->host_ptr + data->offset + buffer->sub_offset, ptr, data->size);
@@ -290,7 +277,7 @@ cl_int cl_enqueue_map_image(enqueue_data *data)
     //because using unsync map in clEnqueueMapBuffer, so force use map_gtt here
     ptr = cl_mem_map_gtt(mem);
   else
-    ptr = cl_mem_map_auto(mem);
+    ptr = cl_mem_map_auto(mem, data->write_map ? 1 : 0);
 
   if (ptr == NULL) {
     err = CL_MAP_FAILURE;
@@ -353,7 +340,8 @@ cl_int cl_enqueue_unmap_mem_object(enqueue_data *data)
       assert(mapped_ptr >= memobj->host_ptr &&
         mapped_ptr + mapped_size <= memobj->host_ptr + memobj->size);
       /* Sync the data. */
-      memcpy(v_ptr, mapped_ptr, mapped_size);
+      if (!memobj->is_userptr)
+        memcpy(v_ptr, mapped_ptr, mapped_size);
     } else {
       CHECK_IMAGE(memobj, image);
 
@@ -412,7 +400,7 @@ cl_int cl_enqueue_native_kernel(enqueue_data *data)
       const cl_mem buffer = mem_list[i];
       CHECK_MEM(buffer);
 
-      *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer);
+      *((void **)args_mem_loc[i]) = cl_mem_map_auto(buffer, 0);
   }
   data->user_func(data->ptr);
 
diff --git a/src/cl_enqueue.h b/src/cl_enqueue.h
index a9b3601..09305af 100644
--- a/src/cl_enqueue.h
+++ b/src/cl_enqueue.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -65,6 +65,7 @@ typedef struct _enqueue_data {
   void *            ptr;              /* Ptr for write and return value */
   const cl_mem*     mem_list;         /* mem_list of clEnqueueNativeKernel */
   uint8_t           unsync_map;       /* Indicate the clEnqueueMapBuffer/Image is unsync map */
+  uint8_t           write_map;        /* Indicate whether the clEnqueueMapBuffer/Image map is write-enabled */
   void (*user_func)(void *);          /* pointer to a host-callable user function */
 } enqueue_data;
 
diff --git a/src/cl_event.c b/src/cl_event.c
index 99e60eb..e20342a 100644
--- a/src/cl_event.c
+++ b/src/cl_event.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -472,9 +472,8 @@ void cl_event_set_status(cl_event event, cl_int status)
     /* All user events complete, now wait enqueue events */
     ret = cl_event_wait_events(enqueue_cb->num_events, enqueue_cb->wait_list,
         enqueue_cb->event->queue);
-    ret = ret;
     assert(ret != CL_ENQUEUE_EXECUTE_DEFER);
-
+    ret = ~ret;
     cb = enqueue_cb;
     enqueue_cb = enqueue_cb->next;
 
diff --git a/src/cl_event.h b/src/cl_event.h
index cfe5ddd..0730530 100644
--- a/src/cl_event.h
+++ b/src/cl_event.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_extensions.h b/src/cl_extensions.h
index 52ee0a4..e6cdce8 100644
--- a/src/cl_extensions.h
+++ b/src/cl_extensions.h
@@ -92,8 +92,5 @@ typedef struct cl_extensions {
   char ext_str[256];
 } cl_extensions_t;
 
-struct _cl_platform_id;
-typedef struct _cl_platform_id * cl_platform_id;
-
 extern void
 cl_intel_platform_extension_init(cl_platform_id intel_platform);
diff --git a/src/cl_gbe_loader.cpp b/src/cl_gbe_loader.cpp
index 7da0475..c3454e8 100644
--- a/src/cl_gbe_loader.cpp
+++ b/src/cl_gbe_loader.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_gbe_loader.h b/src/cl_gbe_loader.h
index da9d034..6fa4c98 100644
--- a/src/cl_gbe_loader.h
+++ b/src/cl_gbe_loader.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_gen75_device.h b/src/cl_gen75_device.h
index 682ee06..d6743a4 100644
--- a/src/cl_gen75_device.h
+++ b/src/cl_gen75_device.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_gen7_device.h b/src/cl_gen7_device.h
index 69cc0b9..470531a 100644
--- a/src/cl_gen7_device.h
+++ b/src/cl_gen7_device.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_gl_api.c b/src/cl_gl_api.c
index 04dde5a..519aab6 100644
--- a/src/cl_gl_api.c
+++ b/src/cl_gl_api.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_gt_device.h b/src/cl_gt_device.h
index e2fcee3..37abfd2 100644
--- a/src/cl_gt_device.h
+++ b/src/cl_gt_device.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -54,14 +54,17 @@
 .max_samplers = 16,
 .mem_base_addr_align = sizeof(cl_long) * 16 * 8,
 .min_data_type_align_size = sizeof(cl_long) * 16,
-.single_fp_config = 0, /* XXX */
 .double_fp_config = 0,
 .global_mem_cache_type = CL_READ_WRITE_CACHE,
 .global_mem_size = 1024 * 1024 * 1024,
 .max_constant_buffer_size = 512 << 10,
 .max_constant_args = 8,
 .error_correction_support = CL_FALSE,
+#ifdef HAS_USERPTR
+.host_unified_memory = CL_TRUE,
+#else
 .host_unified_memory = CL_FALSE,
+#endif
 .profiling_timer_resolution = 80, /* ns */
 .endian_little = CL_TRUE,
 .available = CL_TRUE,
diff --git a/src/cl_image.c b/src/cl_image.c
index ced9789..9907f90 100644
--- a/src/cl_image.c
+++ b/src/cl_image.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_image.h b/src/cl_image.h
index 86cc76a..ae74509 100644
--- a/src/cl_image.h
+++ b/src/cl_image.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_internals.h b/src/cl_internals.h
index 693de1d..cb3fc23 100644
--- a/src/cl_internals.h
+++ b/src/cl_internals.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_kernel.c b/src/cl_kernel.c
index 55b707a..a869515 100644
--- a/src/cl_kernel.c
+++ b/src/cl_kernel.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_kernel.h b/src/cl_kernel.h
index 1ed90a5..140bbb1 100644
--- a/src/cl_kernel.h
+++ b/src/cl_kernel.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_khr_icd.c b/src/cl_khr_icd.c
index 50a0898..8715bbd 100644
--- a/src/cl_khr_icd.c
+++ b/src/cl_khr_icd.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_khr_icd.h b/src/cl_khr_icd.h
index 1e206b4..3985d80 100644
--- a/src/cl_khr_icd.h
+++ b/src/cl_khr_icd.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_mem.c b/src/cl_mem.c
index 81c4d64..0fbd304 100644
--- a/src/cl_mem.c
+++ b/src/cl_mem.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -33,6 +33,7 @@
 #include <assert.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 
 #define FIELD_SIZE(CASE,TYPE)               \
   case JOIN(CL_,CASE):                      \
@@ -223,6 +224,7 @@ cl_mem_allocate(enum cl_mem_type type,
                 cl_mem_flags flags,
                 size_t sz,
                 cl_int is_tiled,
+                void *host_ptr,
                 cl_int *errcode)
 {
   cl_buffer_mgr bufmgr = NULL;
@@ -251,6 +253,7 @@ cl_mem_allocate(enum cl_mem_type type,
   mem->ref_n = 1;
   mem->magic = CL_MAGIC_MEM_HEADER;
   mem->flags = flags;
+  mem->is_userptr = 0;
 
   if (sz != 0) {
     /* Pinning will require stricter alignment rules */
@@ -260,7 +263,28 @@ cl_mem_allocate(enum cl_mem_type type,
     /* Allocate space in memory */
     bufmgr = cl_context_get_bufmgr(ctx);
     assert(bufmgr);
+
+#ifdef HAS_USERPTR
+    if (ctx->device->host_unified_memory) {
+      /* currently only cl buf is supported, will add cl image support later */
+      if ((flags & CL_MEM_USE_HOST_PTR) && host_ptr != NULL) {
+        /* userptr does not support tiling */
+        if (!is_tiled) {
+          int page_size = getpagesize();
+          if ((((unsigned long)host_ptr | sz) & (page_size - 1)) == 0) {
+            mem->is_userptr = 1;
+            mem->bo = cl_buffer_alloc_userptr(bufmgr, "CL userptr memory object", host_ptr, sz, 0);
+          }
+        }
+      }
+    }
+
+    if (!mem->is_userptr)
+      mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+#else
     mem->bo = cl_buffer_alloc(bufmgr, "CL memory object", sz, alignment);
+#endif
+
     if (UNLIKELY(mem->bo == NULL)) {
       err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
       goto error;
@@ -387,12 +411,15 @@ cl_mem_new_buffer(cl_context ctx,
   sz = ALIGN(sz, 4);
 
   /* Create the buffer in video memory */
-  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, &err);
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, flags, sz, CL_FALSE, data, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
   /* Copy the data if required */
-  if (flags & CL_MEM_COPY_HOST_PTR || flags & CL_MEM_USE_HOST_PTR)
+  if (flags & CL_MEM_COPY_HOST_PTR)
+    cl_buffer_subdata(mem->bo, 0, sz, data);
+
+  if ((flags & CL_MEM_USE_HOST_PTR) && !mem->is_userptr)
     cl_buffer_subdata(mem->bo, 0, sz, data);
 
   if (flags & CL_MEM_USE_HOST_PTR || flags & CL_MEM_COPY_HOST_PTR)
@@ -571,8 +598,8 @@ void
 cl_mem_copy_image_to_image(const size_t *dst_origin,const size_t *src_origin, const size_t *region,
                            const struct _cl_mem_image *dst_image, const struct _cl_mem_image *src_image)
 {
-  char* dst= cl_mem_map_auto((cl_mem)dst_image);
-  char* src= cl_mem_map_auto((cl_mem)src_image);
+  char* dst= cl_mem_map_auto((cl_mem)dst_image, 1);
+  char* src= cl_mem_map_auto((cl_mem)src_image, 0);
   size_t dst_offset = dst_image->bpp * dst_origin[0] + dst_image->row_pitch * dst_origin[1] + dst_image->slice_pitch * dst_origin[2];
   size_t src_offset = src_image->bpp * src_origin[0] + src_image->row_pitch * src_origin[1] + src_image->slice_pitch * src_origin[2];
   dst= (char*)dst+ dst_offset;
@@ -601,7 +628,7 @@ cl_mem_copy_image(struct _cl_mem_image *image,
 		  size_t slice_pitch,
 		  void* host_ptr)
 {
-  char* dst_ptr = cl_mem_map_auto((cl_mem)image);
+  char* dst_ptr = cl_mem_map_auto((cl_mem)image, 1);
   size_t origin[3] = {0, 0, 0};
   size_t region[3] = {image->w, image->h, image->depth};
 
@@ -610,13 +637,6 @@ cl_mem_copy_image(struct _cl_mem_image *image,
   cl_mem_unmap_auto((cl_mem)image);
 }
 
-static const uint32_t tile_sz = 4096; /* 4KB per tile */
-static const uint32_t tilex_w = 512;  /* tileX width in bytes */
-static const uint32_t tilex_h = 8;    /* tileX height in number of rows */
-static const uint32_t tiley_w = 128;  /* tileY width in bytes */
-static const uint32_t tiley_h = 32;   /* tileY height in number of rows */
-static const uint32_t valign = 2;     /* vertical alignment is 2. */
-
 cl_image_tiling_t cl_get_default_tiling(void)
 {
   static int initialized = 0;
@@ -749,13 +769,13 @@ _cl_mem_new_image(cl_context ctx,
   /* Tiling requires to align both pitch and height */
   if (tiling == CL_NO_TILE) {
     aligned_pitch = w * bpp;
-    aligned_h  = ALIGN(h, valign);
+    aligned_h  = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
   } else if (tiling == CL_TILE_X) {
-    aligned_pitch = ALIGN(w * bpp, tilex_w);
-    aligned_h     = ALIGN(h, tilex_h);
+    aligned_pitch = ALIGN(w * bpp, cl_buffer_get_tiling_align(ctx, CL_TILE_X, 0));
+    aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_TILE_X, 1));
   } else if (tiling == CL_TILE_Y) {
-    aligned_pitch = ALIGN(w * bpp, tiley_w);
-    aligned_h     = ALIGN(h, tiley_h);
+    aligned_pitch = ALIGN(w * bpp, cl_buffer_get_tiling_align(ctx, CL_TILE_Y, 0));
+    aligned_h     = ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_TILE_Y, 1));
   }
 
   sz = aligned_pitch * aligned_h * depth;
@@ -769,7 +789,7 @@ _cl_mem_new_image(cl_context ctx,
     sz = aligned_pitch * aligned_h * depth;
   }
 
-  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, &err);
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, flags, sz, tiling != CL_NO_TILE, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
@@ -779,7 +799,7 @@ _cl_mem_new_image(cl_context ctx,
       image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
     aligned_slice_pitch = 0;
   else
-    aligned_slice_pitch = aligned_pitch * ALIGN(h, 2);
+    aligned_slice_pitch = aligned_pitch * ALIGN(h, cl_buffer_get_tiling_align(ctx, CL_NO_TILE, 1));
 
   cl_mem_image_init(cl_mem_image(mem), w, h, image_type, depth, *fmt,
                     intel_fmt, bpp, aligned_pitch, aligned_slice_pitch, tiling,
@@ -914,8 +934,8 @@ _cl_mem_new_image_from_buffer(cl_context ctx,
                     mem_buffer->base.size / bpp, 0, 0, 0, 0, NULL, errcode_ret);
   if (image == NULL)
     return NULL;
-  void *src = cl_mem_map(buffer);
-  void *dst = cl_mem_map(image);
+  void *src = cl_mem_map(buffer, 0);
+  void *dst = cl_mem_map(image, 1);
   //
   // FIXME, we could use copy buffer to image to do this on GPU latter.
   // currently the copy buffer to image function doesn't support 1D image.
@@ -1414,6 +1434,16 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   size_t global_off[] = {0,0,0};
   size_t global_sz[] = {1,1,1};
   size_t local_sz[] = {LOCAL_SZ_0,LOCAL_SZ_1,LOCAL_SZ_1};
+  // the src and dst mem rects are contiguous, so the copy degrades to a plain buffer copy
+  if((region[0] == dst_row_pitch) && (region[0] == src_row_pitch) &&
+  (region[1] * src_row_pitch == src_slice_pitch) && (region[1] * dst_row_pitch == dst_slice_pitch)){
+    cl_int src_offset = src_origin[2]*src_slice_pitch + src_origin[1]*src_row_pitch + src_origin[0];
+    cl_int dst_offset = dst_origin[2]*dst_slice_pitch + dst_origin[1]*dst_row_pitch + dst_origin[0];
+    cl_int size = region[0]*region[1]*region[2];
+    ret = cl_mem_copy(queue, src_buf, dst_buf,src_offset, dst_offset, size);
+    return ret;
+  }
+
   if(region[1] == 1) local_sz[1] = 1;
   if(region[2] == 1) local_sz[2] = 1;
   global_sz[0] = ((region[0] + local_sz[0] - 1) / local_sz[0]) * local_sz[0];
@@ -1426,18 +1456,33 @@ cl_mem_copy_buffer_rect(cl_command_queue queue, cl_mem src_buf, cl_mem dst_buf,
   assert(src_buf->ctx == dst_buf->ctx);
 
   /* setup the kernel and run. */
-  extern char cl_internal_copy_buf_rect_str[];
-  extern size_t cl_internal_copy_buf_rect_str_size;
-
-  ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
-      cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
+  size_t region0 = region[0];
+  if( (src_offset % 4== 0) && (dst_offset % 4== 0) && (src_row_pitch % 4== 0) && (dst_row_pitch % 4== 0)
+  && (src_slice_pitch % 4== 0) && (dst_slice_pitch % 4== 0) && (region0 % 4 == 0) ){
+    extern char cl_internal_copy_buf_rect_align4_str[];
+    extern size_t cl_internal_copy_buf_rect_align4_str_size;
+    region0 /= 4;
+    src_offset /= 4;
+    dst_offset /= 4;
+    src_row_pitch /= 4;
+    dst_row_pitch /= 4;
+    src_slice_pitch /= 4;
+    dst_slice_pitch /= 4;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT_ALIGN4,
+    cl_internal_copy_buf_rect_align4_str, (size_t)cl_internal_copy_buf_rect_align4_str_size, NULL);
+  }else{
+    extern char cl_internal_copy_buf_rect_str[];
+    extern size_t cl_internal_copy_buf_rect_str_size;
+    ker = cl_context_get_static_kernel_from_bin(queue->ctx, CL_ENQUEUE_COPY_BUFFER_RECT,
+    cl_internal_copy_buf_rect_str, (size_t)cl_internal_copy_buf_rect_str_size, NULL);
+  }
 
   if (!ker)
     return CL_OUT_OF_RESOURCES;
 
   cl_kernel_set_arg(ker, 0, sizeof(cl_mem), &src_buf);
   cl_kernel_set_arg(ker, 1, sizeof(cl_mem), &dst_buf);
-  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region[0]);
+  cl_kernel_set_arg(ker, 2, sizeof(cl_int), &region0);
   cl_kernel_set_arg(ker, 3, sizeof(cl_int), &region[1]);
   cl_kernel_set_arg(ker, 4, sizeof(cl_int), &region[2]);
   cl_kernel_set_arg(ker, 5, sizeof(cl_int), &src_offset);
@@ -1730,9 +1775,9 @@ cl_mem_copy_buffer_to_image(cl_command_queue queue, cl_mem buffer, struct _cl_me
 
 
 LOCAL void*
-cl_mem_map(cl_mem mem)
+cl_mem_map(cl_mem mem, int write)
 {
-  cl_buffer_map(mem->bo, 1);
+  cl_buffer_map(mem->bo, write);
   assert(cl_buffer_get_virtual(mem->bo));
   return cl_buffer_get_virtual(mem->bo);
 }
@@ -1769,12 +1814,12 @@ cl_mem_unmap_gtt(cl_mem mem)
 }
 
 LOCAL void*
-cl_mem_map_auto(cl_mem mem)
+cl_mem_map_auto(cl_mem mem, int write)
 {
   if (IS_IMAGE(mem) && cl_mem_image(mem)->tiling != CL_NO_TILE)
     return cl_mem_map_gtt(mem);
   else
-    return cl_mem_map(mem);
+    return cl_mem_map(mem, write);
 }
 
 LOCAL cl_int
@@ -1816,7 +1861,7 @@ LOCAL cl_mem cl_mem_new_libva_buffer(cl_context ctx,
   cl_int err = CL_SUCCESS;
   cl_mem mem = NULL;
 
-  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, &err);
+  mem = cl_mem_allocate(CL_MEM_BUFFER_TYPE, ctx, 0, 0, CL_FALSE, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
@@ -1857,7 +1902,7 @@ LOCAL cl_mem cl_mem_new_libva_image(cl_context ctx,
     goto error;
   }
 
-  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, &err);
+  mem = cl_mem_allocate(CL_MEM_IMAGE_TYPE, ctx, 0, 0, 0, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS) {
     err = CL_OUT_OF_HOST_MEMORY;
     goto error;
diff --git a/src/cl_mem.h b/src/cl_mem.h
index 3174c5c..ac1175d 100644
--- a/src/cl_mem.h
+++ b/src/cl_mem.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -25,6 +25,7 @@
 #include "CL/cl.h"
 #include "cl_khr_icd.h"
 #include <assert.h>
+#include <pthread.h>
 
 #ifndef CL_VERSION_1_2
 #define CL_MEM_OBJECT_IMAGE1D                       0x10F4
@@ -91,6 +92,7 @@ typedef  struct _cl_mem {
   int map_ref;              /* The mapped count. */
   uint8_t mapped_gtt;       /* This object has mapped gtt, for unmap. */
   cl_mem_dstr_cb *dstr_cb;  /* The destroy callback. */
+  uint8_t is_userptr;    /* Set when the bo is allocated via userptr on the host pointer */
 } _cl_mem;
 
 struct _cl_mem_image {
@@ -231,7 +233,7 @@ extern cl_int cl_mem_copy_buffer_to_image(cl_command_queue, cl_mem, struct _cl_m
                                           const size_t, const size_t *, const size_t *);
 
 /* Directly map a memory object */
-extern void *cl_mem_map(cl_mem);
+extern void *cl_mem_map(cl_mem, int);
 
 /* Unmap a memory object */
 extern cl_int cl_mem_unmap(cl_mem);
@@ -246,7 +248,7 @@ extern void *cl_mem_map_gtt_unsync(cl_mem);
 extern cl_int cl_mem_unmap_gtt(cl_mem);
 
 /* Directly map a memory object - tiled images are mapped in GTT mode */
-extern void *cl_mem_map_auto(cl_mem);
+extern void *cl_mem_map_auto(cl_mem, int);
 
 /* Unmap a memory object - tiled images are unmapped in GTT mode */
 extern cl_int cl_mem_unmap_auto(cl_mem);
@@ -261,6 +263,7 @@ cl_mem_allocate(enum cl_mem_type type,
                 cl_mem_flags flags,
                 size_t sz,
                 cl_int is_tiled,
+                void *host_ptr,
                 cl_int *errcode);
 
 void
diff --git a/src/cl_mem_gl.c b/src/cl_mem_gl.c
index 28d2ac6..be9eedf 100644
--- a/src/cl_mem_gl.c
+++ b/src/cl_mem_gl.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -63,7 +63,7 @@ cl_mem_new_gl_texture(cl_context ctx,
     goto error;
   }
 
-  mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, &err);
+  mem = cl_mem_allocate(CL_MEM_GL_IMAGE_TYPE, ctx, flags, 0, 0, NULL, &err);
   if (mem == NULL || err != CL_SUCCESS)
     goto error;
 
diff --git a/src/cl_platform_id.c b/src/cl_platform_id.c
index e7c8d6a..a97c00f 100644
--- a/src/cl_platform_id.c
+++ b/src/cl_platform_id.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_platform_id.h b/src/cl_platform_id.h
index c7c716e..7b78db1 100644
--- a/src/cl_platform_id.h
+++ b/src/cl_platform_id.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -20,12 +20,12 @@
 #ifndef __CL_PLATFORM_ID_H__
 #define __CL_PLATFORM_ID_H__
 
+#include "CL/cl.h"
 #include "cl_internals.h"
 #include "cl_extensions.h"
 #include "cl_khr_icd.h"
-#include "CL/cl.h"
-
 #include "src/OCLConfig.h"
+#include "src/git_sha1.h"
 
 struct _cl_platform_id {
   DEFINE_ICD(dispatch)
@@ -63,10 +63,19 @@ extern cl_int cl_get_platform_info(cl_platform_id    platform,
 #define _JOINT(x, y) _STR(x) "." _STR(y)
 #define _JOINT3(x, y, z) _STR(x) "." _STR(y) "." _STR(z)
 
+#ifdef BEIGNET_GIT_SHA1
+       #define BEIGNET_GIT_SHA1_STRING " (" BEIGNET_GIT_SHA1 ")"
+#else
+       #define BEIGNET_GIT_SHA1_STRING
+#endif
 
+#ifdef LIBCL_DRIVER_VERSION_PATCH
 #define LIBCL_DRIVER_VERSION_STRING _JOINT3(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR, LIBCL_DRIVER_VERSION_PATCH)
-#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
-#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING
+#else
+#define LIBCL_DRIVER_VERSION_STRING _JOINT(LIBCL_DRIVER_VERSION_MAJOR, LIBCL_DRIVER_VERSION_MINOR)
+#endif
+#define LIBCL_VERSION_STRING "OpenCL " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING BEIGNET_GIT_SHA1_STRING
+#define LIBCL_C_VERSION_STRING "OpenCL C " _JOINT(LIBCL_C_VERSION_MAJOR, LIBCL_C_VERSION_MINOR) " beignet " LIBCL_DRIVER_VERSION_STRING BEIGNET_GIT_SHA1_STRING
 
 #endif /* __CL_PLATFORM_ID_H__ */
 
diff --git a/src/cl_program.c b/src/cl_program.c
index 79dff34..fa67ef2 100644
--- a/src/cl_program.c
+++ b/src/cl_program.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -249,6 +249,14 @@ cl_program_create_from_binary(cl_context             ctx,
     program->source_type = FROM_LLVM;
   }
   else if (*program->binary == 0) {
+    program->opaque = interp_program_new_from_binary(program->ctx->device->vendor_id, program->binary, program->binary_sz);
+    if (UNLIKELY(program->opaque == NULL)) {
+      err = CL_INVALID_PROGRAM;
+      goto error;
+    }
+
+    /* Create all the kernels */
+    TRY (cl_program_load_gen_program, program);
     program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
   }
 
@@ -739,7 +747,11 @@ cl_program_compile(cl_program            p,
     /* Create all the kernels */
     p->source_type = FROM_LLVM;
     p->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+  }else if(p->source_type == FROM_BINARY){
+    err = CL_INVALID_OPERATION;
+    return err;
   }
+
   p->is_built = 1;
   p->build_status = CL_BUILD_SUCCESS;
   return CL_SUCCESS;
diff --git a/src/cl_program.h b/src/cl_program.h
index 6dea29a..3ab7acd 100644
--- a/src/cl_program.h
+++ b/src/cl_program.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_sampler.c b/src/cl_sampler.c
index d718256..45c1fdf 100644
--- a/src/cl_sampler.c
+++ b/src/cl_sampler.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_sampler.h b/src/cl_sampler.h
index 4785928..fc4b7e7 100644
--- a/src/cl_sampler.h
+++ b/src/cl_sampler.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_thread.c b/src/cl_thread.c
index 5713d70..0d99574 100644
--- a/src/cl_thread.c
+++ b/src/cl_thread.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -37,7 +37,6 @@ static int thread_array_num = 1;
 static int *thread_slot_map = NULL;
 static int thread_magic_num = 1;
 static pthread_mutex_t thread_queue_map_lock = PTHREAD_MUTEX_INITIALIZER;
-static pthread_key_t destroy_key;
 
 static __thread int thread_id = -1;
 static __thread int thread_magic = -1;
@@ -55,13 +54,6 @@ typedef struct _queue_thread_private {
   pthread_mutex_t thread_data_lock;
 } queue_thread_private;
 
-static void thread_data_destructor(void *dummy) {
-  pthread_mutex_lock(&thread_queue_map_lock);
-  thread_slot_map[thread_id] = 0;
-  pthread_mutex_unlock(&thread_queue_map_lock);
-  free(dummy);
-}
-
 static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int create)
 {
   queue_thread_private *thread_private = ((queue_thread_private *)(queue->thread_data));
@@ -69,7 +61,6 @@ static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int
   int i = 0;
 
   if (thread_id == -1) {
-    void * dummy = malloc(sizeof(int));
 
     pthread_mutex_lock(&thread_queue_map_lock);
     for (i = 0; i < thread_array_num; i++) {
@@ -90,8 +81,6 @@ static thread_spec_data * __create_thread_spec_data(cl_command_queue queue, int
 
     thread_magic = thread_magic_num++;
     pthread_mutex_unlock(&thread_queue_map_lock);
-
-    pthread_setspecific(destroy_key, dummy);
   }
 
   pthread_mutex_lock(&thread_private->thread_data_lock);
@@ -129,7 +118,6 @@ void* cl_thread_data_create(void)
     thread_slot_map = calloc(thread_array_num, sizeof(int));
     pthread_mutex_unlock(&thread_queue_map_lock);
 
-    pthread_key_create(&destroy_key, thread_data_destructor);
   }
 
   pthread_mutex_init(&thread_private->thread_data_lock, NULL);
@@ -238,7 +226,6 @@ void cl_thread_data_destroy(cl_command_queue queue)
   thread_spec_data** threads_data;
 
   pthread_mutex_lock(&thread_private->thread_data_lock);
-  assert(thread_private->threads_data_num == thread_array_num);
   threads_data_num = thread_private->threads_data_num;
   threads_data = thread_private->threads_data;
   thread_private->threads_data_num = 0;
diff --git a/src/cl_thread.h b/src/cl_thread.h
index ecc99ad..7b48a26 100644
--- a/src/cl_thread.h
+++ b/src/cl_thread.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/cl_utils.h b/src/cl_utils.h
index 26cf329..28fdef6 100644
--- a/src/cl_utils.h
+++ b/src/cl_utils.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/git_sha1.sh b/src/git_sha1.sh
new file mode 100755
index 0000000..f44f078
--- /dev/null
+++ b/src/git_sha1.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+SOURCE_DIR=$1
+FILE=$2
+
+touch ${SOURCE_DIR}/${FILE}_tmp
+if test -d ${SOURCE_DIR}/../.git; then
+        if which git > /dev/null; then
+            git --git-dir=${SOURCE_DIR}/../.git log -n 1 --oneline | \
+                sed 's/^\([^ ]*\) .*/#define BEIGNET_GIT_SHA1 "git-\1"/' \
+                > ${SOURCE_DIR}/${FILE}_tmp
+        fi
+fi
+
+#updating ${SOURCE_DIR}/${FILE}
+if ! cmp -s ${SOURCE_DIR}/${FILE}_tmp ${SOURCE_DIR}/${FILE}; then
+                mv  ${SOURCE_DIR}/${FILE}_tmp ${SOURCE_DIR}/${FILE}
+else
+                rm  ${SOURCE_DIR}/${FILE}_tmp
+fi
diff --git a/src/intel/intel_batchbuffer.c b/src/intel/intel_batchbuffer.c
index d3da3cc..a65ac86 100644
--- a/src/intel/intel_batchbuffer.c
+++ b/src/intel/intel_batchbuffer.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -118,8 +118,9 @@ intel_batchbuffer_flush(intel_batchbuffer_t *batch)
 
   *(uint32_t*)batch->ptr = MI_BATCH_BUFFER_END;
   batch->ptr += 4;
-  dri_bo_unmap(batch->buffer);
   used = batch->ptr - batch->map;
+  dri_bo_unmap(batch->buffer);
+  batch->ptr = batch->map = NULL;
 
   if (!is_locked)
     intel_driver_lock_hardware(batch->intel);
@@ -135,8 +136,9 @@ intel_batchbuffer_flush(intel_batchbuffer_t *batch)
   if (!is_locked)
     intel_driver_unlock_hardware(batch->intel);
 
-  // Release the buffer
-  intel_batchbuffer_terminate(batch);
+  // Can't release the buffer here: a gpgpu can only be deleted once its batch buffer has completed.
+  // Keep the buffer so that gpgpu deletion can check whether it is still busy.
+  //intel_batchbuffer_terminate(batch);
 }
 
 LOCAL void 
diff --git a/src/intel/intel_batchbuffer.h b/src/intel/intel_batchbuffer.h
index 4c28a7c..121c824 100644
--- a/src/intel/intel_batchbuffer.h
+++ b/src/intel/intel_batchbuffer.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/intel/intel_defines.h b/src/intel/intel_defines.h
index 02ffde4..e983718 100644
--- a/src/intel/intel_defines.h
+++ b/src/intel/intel_defines.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -87,6 +87,7 @@
 
 #define PIPELINE_SELECT_3D              0
 #define PIPELINE_SELECT_MEDIA           1
+#define PIPELINE_SELECT_GPGPU           2
 
 #define UF0_CS_REALLOC                  (1 << 13)
 #define UF0_VFE_REALLOC                 (1 << 12)
@@ -288,6 +289,11 @@
 #define I965_TILEWALK_XMAJOR                 0
 #define I965_TILEWALK_YMAJOR                 1
 
+#define GEN8_TILEMODE_LINEAR                 0
+#define GEN8_TILEMODE_WMAJOR                 1
+#define GEN8_TILEMODE_XMAJOR                 2
+#define GEN8_TILEMODE_YMAJOR                 3
+
 #define I965_SURCHAN_SELECT_ZERO             0
 #define I965_SURCHAN_SELECT_ONE              1
 #define I965_SURCHAN_SELECT_RED              4
@@ -303,6 +309,8 @@
 #define GEN7_L3_CNTL_REG2_ADDRESS_OFFSET         (0xB020)
 #define GEN7_L3_CNTL_REG3_ADDRESS_OFFSET         (0xB024)
 
+#define GEN8_L3_CNTL_REG_ADDRESS_OFFSET          (0x7034)
+
 // To issue pipe controls (reset L3 / SLM or stall)
 #define GEN7_PIPE_CONTROL_MEDIA 0x2
 #define GEN7_PIPE_CONTROL_3D 0x3
diff --git a/src/intel/intel_driver.c b/src/intel/intel_driver.c
index deb83c8..c370c66 100644
--- a/src/intel/intel_driver.c
+++ b/src/intel/intel_driver.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -79,31 +79,6 @@
 #include "cl_device_id.h"
 #include "cl_platform_id.h"
 
-#define SET_BLOCKED_SIGSET(DRIVER)   do {                     \
-  sigset_t bl_mask;                                           \
-  sigfillset(&bl_mask);                                       \
-  sigdelset(&bl_mask, SIGFPE);                                \
-  sigdelset(&bl_mask, SIGILL);                                \
-  sigdelset(&bl_mask, SIGSEGV);                               \
-  sigdelset(&bl_mask, SIGBUS);                                \
-  sigdelset(&bl_mask, SIGKILL);                               \
-  pthread_sigmask(SIG_SETMASK, &bl_mask, &(DRIVER)->sa_mask); \
-} while (0)
-
-#define RESTORE_BLOCKED_SIGSET(DRIVER) do {                   \
-  pthread_sigmask(SIG_SETMASK, &(DRIVER)->sa_mask, NULL);     \
-} while (0)
-
-#define PPTHREAD_MUTEX_LOCK(DRIVER) do {                      \
-  SET_BLOCKED_SIGSET(DRIVER);                                 \
-  pthread_mutex_lock(&(DRIVER)->ctxmutex);                    \
-} while (0)
-
-#define PPTHREAD_MUTEX_UNLOCK(DRIVER) do {                    \
-  pthread_mutex_unlock(&(DRIVER)->ctxmutex);                  \
-  RESTORE_BLOCKED_SIGSET(DRIVER);                             \
-} while (0)
-
 static void
 intel_driver_delete(intel_driver_t *driver)
 {
@@ -183,7 +158,9 @@ intel_driver_init(intel_driver_t *driver, int dev_fd)
   else
     FATAL ("Unsupported Gen for emulation");
 #else
-  if (IS_GEN75(driver->device_id))
+  if (IS_GEN8(driver->device_id))
+    driver->gen_ver = 8;
+  else if (IS_GEN75(driver->device_id))
     driver->gen_ver = 75;
   else if (IS_GEN7(driver->device_id))
     driver->gen_ver = 7;
@@ -421,11 +398,13 @@ intel_get_device_id(void)
   return intel_device_id;
 }
 
+extern void intel_gpgpu_delete_all(intel_driver_t *driver);
 static void
 cl_intel_driver_delete(intel_driver_t *driver)
 {
   if (driver == NULL)
     return;
+  intel_gpgpu_delete_all(driver);
   intel_driver_context_destroy(driver);
   intel_driver_close(driver);
   intel_driver_terminate(driver);
@@ -439,7 +418,6 @@ cl_intel_driver_new(cl_context_prop props)
   intel_driver_t *driver = NULL;
   TRY_ALLOC_NO_ERR (driver, intel_driver_new());
   if(UNLIKELY(intel_driver_open(driver, props) != CL_SUCCESS)) goto error;
-  intel_driver_open(driver, props);
 exit:
   return driver;
 error:
@@ -475,6 +453,44 @@ static int get_cl_tiling(uint32_t drm_tiling)
   return CL_NO_TILE;
 }
 
+static uint32_t intel_buffer_get_tiling_align(cl_context ctx, uint32_t tiling_mode, uint32_t dim)
+{
+  uint32_t gen_ver = ((intel_driver_t *)ctx->drv)->gen_ver;
+  uint32_t ret = 0;
+
+  switch (tiling_mode) {
+  case CL_TILE_X:
+    if (dim == 0) { //tileX width in bytes
+      ret = 512;
+    } else if (dim == 1) { //tileX height in number of rows
+      ret = 8;
+    } else
+      assert(0);
+    break;
+
+  case CL_TILE_Y:
+    if (dim == 0) { //tileY width in bytes
+      ret = 128;
+    } else if (dim == 1) { //tileY height in number of rows
+      ret = 32;
+    } else
+      assert(0);
+    break;
+
+  case CL_NO_TILE:
+    if (dim == 1) { //vertical alignment
+      if (gen_ver == 8)
+        ret = 4;
+      else
+        ret = 2;
+    } else
+      assert(0);
+    break;
+  }
+
+  return ret;
+}
+
 #if defined(HAS_EGL)
 #include "intel_dri_resource_sharing.h"
 #include "cl_image.h"
@@ -674,6 +690,20 @@ cl_buffer intel_share_image_from_libva(cl_context ctx,
   return (cl_buffer)intel_bo;
 }
 
+static cl_buffer intel_buffer_alloc_userptr(cl_buffer_mgr bufmgr, const char* name, void *data,size_t size, unsigned long flags)
+{
+#ifdef HAS_USERPTR
+  drm_intel_bo *bo;
+  bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags);
+  /* Fall back to unsynchronized userptr allocation if the kernel has no MMU notifier enabled. */
+  if (bo == NULL)
+    bo = drm_intel_bo_alloc_userptr((drm_intel_bufmgr *)bufmgr, name, data, I915_TILING_NONE, 0, size, flags | I915_USERPTR_UNSYNCHRONIZED);
+  return (cl_buffer)bo;
+#else
+  return NULL;
+#endif
+}
+
 static int32_t get_intel_tiling(cl_int tiling, uint32_t *intel_tiling)
 {
   switch (tiling) {
@@ -718,6 +748,7 @@ intel_setup_callbacks(void)
   cl_driver_get_bufmgr = (cl_driver_get_bufmgr_cb *) intel_driver_get_bufmgr;
   cl_driver_get_device_id = (cl_driver_get_device_id_cb *) intel_get_device_id;
   cl_buffer_alloc = (cl_buffer_alloc_cb *) drm_intel_bo_alloc;
+  cl_buffer_alloc_userptr = (cl_buffer_alloc_userptr_cb*) intel_buffer_alloc_userptr;
   cl_buffer_set_tiling = (cl_buffer_set_tiling_cb *) intel_buffer_set_tiling;
 #if defined(HAS_EGL)
   cl_buffer_alloc_from_texture = (cl_buffer_alloc_from_texture_cb *) intel_alloc_buffer_from_texture;
@@ -738,7 +769,9 @@ intel_setup_callbacks(void)
   cl_buffer_pin = (cl_buffer_pin_cb *) drm_intel_bo_pin;
   cl_buffer_unpin = (cl_buffer_unpin_cb *) drm_intel_bo_unpin;
   cl_buffer_subdata = (cl_buffer_subdata_cb *) drm_intel_bo_subdata;
+  cl_buffer_get_subdata = (cl_buffer_get_subdata_cb *) drm_intel_bo_get_subdata;
   cl_buffer_wait_rendering = (cl_buffer_wait_rendering_cb *) drm_intel_bo_wait_rendering;
   cl_buffer_get_fd = (cl_buffer_get_fd_cb *) drm_intel_bo_gem_export_to_prime;
+  cl_buffer_get_tiling_align = (cl_buffer_get_tiling_align_cb *)intel_buffer_get_tiling_align;
   intel_set_gpgpu_callbacks(intel_get_device_id());
 }
diff --git a/src/intel/intel_driver.h b/src/intel/intel_driver.h
index 107fdfc..61653db 100644
--- a/src/intel/intel_driver.h
+++ b/src/intel/intel_driver.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -54,6 +54,7 @@
 #include <drm.h>
 #include <i915_drm.h>
 #include <intel_bufmgr.h>
+#include <intel/intel_gpgpu.h>
 
 #define CMD_MI                                  (0x0 << 29)
 #define CMD_2D                                  (0x2 << 29)
@@ -73,6 +74,7 @@
 #define BR13_8888                               (0x3 << 24)
 
 struct dri_state;
+struct intel_gpgpu_node;
 typedef struct _XDisplay Display;
 
 typedef struct intel_driver
@@ -88,8 +90,34 @@ typedef struct intel_driver
   int need_close;
   Display *x11_display;
   struct dri_state *dri_ctx;
+  struct intel_gpgpu_node *gpgpu_list;
 } intel_driver_t;
 
+#define SET_BLOCKED_SIGSET(DRIVER)   do {                     \
+  sigset_t bl_mask;                                           \
+  sigfillset(&bl_mask);                                       \
+  sigdelset(&bl_mask, SIGFPE);                                \
+  sigdelset(&bl_mask, SIGILL);                                \
+  sigdelset(&bl_mask, SIGSEGV);                               \
+  sigdelset(&bl_mask, SIGBUS);                                \
+  sigdelset(&bl_mask, SIGKILL);                               \
+  pthread_sigmask(SIG_SETMASK, &bl_mask, &(DRIVER)->sa_mask); \
+} while (0)
+
+#define RESTORE_BLOCKED_SIGSET(DRIVER) do {                   \
+  pthread_sigmask(SIG_SETMASK, &(DRIVER)->sa_mask, NULL);     \
+} while (0)
+
+#define PPTHREAD_MUTEX_LOCK(DRIVER) do {                      \
+  SET_BLOCKED_SIGSET(DRIVER);                                 \
+  pthread_mutex_lock(&(DRIVER)->ctxmutex);                    \
+} while (0)
+
+#define PPTHREAD_MUTEX_UNLOCK(DRIVER) do {                    \
+  pthread_mutex_unlock(&(DRIVER)->ctxmutex);                  \
+  RESTORE_BLOCKED_SIGSET(DRIVER);                             \
+} while (0)
+
 /* device control */
 extern void intel_driver_lock_hardware(intel_driver_t*);
 extern void intel_driver_unlock_hardware(intel_driver_t*);
diff --git a/src/intel/intel_gpgpu.c b/src/intel/intel_gpgpu.c
index c4b9156..b6e19db 100644
--- a/src/intel/intel_gpgpu.c
+++ b/src/intel/intel_gpgpu.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -33,8 +33,6 @@
 #include "intel/intel_gpgpu.h"
 #include "intel/intel_defines.h"
 #include "intel/intel_structs.h"
-#include "intel/intel_batchbuffer.h"
-#include "intel/intel_driver.h"
 #include "program.h" // for BTI_RESERVED_NUM
 
 #include "cl_alloc.h"
@@ -58,7 +56,7 @@
 /* Stores both binding tables and surface states */
 typedef struct surface_heap {
   uint32_t binding_table[256];
-  char surface[256][sizeof(gen6_surface_state_t)];
+  char surface[256*sizeof(gen_surface_state_t)];
 } surface_heap_t;
 
 typedef struct intel_event {
@@ -69,58 +67,6 @@ typedef struct intel_event {
 
 #define MAX_IF_DESC    32
 
-/* We can bind only a limited number of buffers */
-enum { max_buf_n = 128 };
-
-enum { max_img_n = 128};
-
-enum {max_sampler_n = 16 };
-
-/* Handle GPGPU state */
-struct intel_gpgpu
-{
-  void* ker_opaque;
-  size_t global_wk_sz[3];
-  void* printf_info;
-  intel_driver_t *drv;
-  intel_batchbuffer_t *batch;
-  cl_gpgpu_kernel *ker;
-  drm_intel_bo *binded_buf[max_buf_n];  /* all buffers binded for the call */
-  uint32_t target_buf_offset[max_buf_n];/* internal offset for buffers binded for the call */
-  uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
-  uint32_t binded_n;                    /* number of buffers binded */
-
-  unsigned long img_bitmap;              /* image usage bitmap. */
-  unsigned int img_index_base;          /* base index for image surface.*/
-
-  unsigned long sampler_bitmap;          /* sampler usage bitmap. */
-
-  struct { drm_intel_bo *bo; } stack_b;
-  struct { drm_intel_bo *bo; } perf_b;
-  struct { drm_intel_bo *bo; } scratch_b;
-  struct { drm_intel_bo *bo; } constant_b;
-  struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
-  struct { drm_intel_bo *bo;
-           drm_intel_bo *ibo;} printf_b;      /* the printf buf and index buf*/
-
-  struct { drm_intel_bo *bo; } aux_buf;
-  struct {
-    uint32_t surface_heap_offset;
-    uint32_t curbe_offset;
-    uint32_t idrt_offset;
-    uint32_t sampler_state_offset;
-    uint32_t sampler_border_color_state_offset;
-  } aux_offset;
-
-  uint32_t per_thread_scratch;
-  struct {
-    uint32_t num_cs_entries;
-    uint32_t size_cs_entry;  /* size of one entry in 512bit elements */
-  } curb;
-
-  uint32_t max_threads;      /* max threads requested by the user */
-};
-
 typedef struct intel_gpgpu intel_gpgpu_t;
 
 typedef void (intel_gpgpu_set_L3_t)(intel_gpgpu_t *gpgpu, uint32_t use_slm);
@@ -135,6 +81,29 @@ intel_gpgpu_post_action_t *intel_gpgpu_post_action = NULL;
 typedef uint64_t (intel_gpgpu_read_ts_reg_t)(drm_intel_bufmgr *bufmgr);
 intel_gpgpu_read_ts_reg_t *intel_gpgpu_read_ts_reg = NULL;
 
+
+typedef void (intel_gpgpu_set_base_address_t)(intel_gpgpu_t *gpgpu);
+intel_gpgpu_set_base_address_t *intel_gpgpu_set_base_address = NULL;
+
+typedef void (intel_gpgpu_setup_bti_t)(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+                                       uint32_t size, unsigned char index, uint32_t format);
+intel_gpgpu_setup_bti_t *intel_gpgpu_setup_bti = NULL;
+
+
+typedef void (intel_gpgpu_load_vfe_state_t)(intel_gpgpu_t *gpgpu);
+intel_gpgpu_load_vfe_state_t *intel_gpgpu_load_vfe_state = NULL;
+
+typedef void (intel_gpgpu_build_idrt_t)(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel);
+intel_gpgpu_build_idrt_t *intel_gpgpu_build_idrt = NULL;
+
+
+typedef void (intel_gpgpu_load_curbe_buffer_t)(intel_gpgpu_t *gpgpu);
+intel_gpgpu_load_curbe_buffer_t *intel_gpgpu_load_curbe_buffer = NULL;
+
+
+typedef void (intel_gpgpu_load_idrt_t)(intel_gpgpu_t *gpgpu);
+intel_gpgpu_load_idrt_t *intel_gpgpu_load_idrt = NULL;
+
 static void
 intel_gpgpu_sync(void *buf)
 {
@@ -157,7 +126,7 @@ static void intel_gpgpu_unref_batch_buf(void *buf)
 }
 
 static void
-intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
+intel_gpgpu_delete_finished(intel_gpgpu_t *gpgpu)
 {
   if (gpgpu == NULL)
     return;
@@ -183,6 +152,77 @@ intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
   cl_free(gpgpu);
 }
 
+/* Destroy all intel_gpgpu objects, whether finished or not, when the driver is destroyed */
+void intel_gpgpu_delete_all(intel_driver_t *drv)
+{
+  struct intel_gpgpu_node *p;
+  if(drv->gpgpu_list == NULL)
+    return;
+
+  PPTHREAD_MUTEX_LOCK(drv);
+  while(drv->gpgpu_list) {
+    p = drv->gpgpu_list;
+    drv->gpgpu_list = p->next;
+    intel_gpgpu_delete_finished(p->gpgpu);
+    cl_free(p);
+  }
+  PPTHREAD_MUTEX_UNLOCK(drv);
+}
+
+static void
+intel_gpgpu_delete(intel_gpgpu_t *gpgpu)
+{
+  intel_driver_t *drv = gpgpu->drv;
+  struct intel_gpgpu_node *p, *node;
+
+  PPTHREAD_MUTEX_LOCK(drv);
+  p = drv->gpgpu_list;
+  if(p) {
+    node = p->next;
+    while(node) {
+      if(node->gpgpu->batch && node->gpgpu->batch->buffer &&
+         !drm_intel_bo_busy(node->gpgpu->batch->buffer)) {
+        p->next = node->next;
+        intel_gpgpu_delete_finished(node->gpgpu);
+        cl_free(node);
+        node = p->next;
+      } else {
+        p = node;
+        node = node->next;
+      }
+    }
+    node = drv->gpgpu_list;
+    if(node->gpgpu->batch && node->gpgpu->batch->buffer &&
+       !drm_intel_bo_busy(node->gpgpu->batch->buffer)) {
+      drv->gpgpu_list = drv->gpgpu_list->next;
+      intel_gpgpu_delete_finished(node->gpgpu);
+      cl_free(node);
+      node = p->next;
+    }
+  }
+  if (gpgpu == NULL)
+    return;
+
+  if(gpgpu->batch && gpgpu->batch->buffer &&
+     !drm_intel_bo_busy(gpgpu->batch->buffer)) {
+    TRY_ALLOC_NO_ERR (node, CALLOC(struct intel_gpgpu_node));
+    node->gpgpu = gpgpu;
+    node->next = NULL;
+    p = drv->gpgpu_list;
+    if(p == NULL)
+      drv->gpgpu_list= node;
+    else {
+      while(p->next)
+        p = p->next;
+      p->next = node;
+    }
+  } else
+    intel_gpgpu_delete_finished(gpgpu);
+
+error:
+  PPTHREAD_MUTEX_UNLOCK(drv);
+}
+
 static intel_gpgpu_t*
 intel_gpgpu_new(intel_driver_t *drv)
 {
@@ -205,7 +245,7 @@ static void
 intel_gpgpu_select_pipeline(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 1);
-  OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
+  OUT_BATCH(gpgpu->batch, CMD_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
   ADVANCE_BATCH(gpgpu->batch);
 }
 
@@ -220,9 +260,14 @@ intel_gpgpu_get_cache_ctrl_gen75()
 {
   return llccc_ec | l3cc_ec;
 }
+static uint32_t
+intel_gpgpu_get_cache_ctrl_gen8()
+{
+  return tcc_llc_ec_l3 | mtllc_wb;
+}
 
 static void
-intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
+intel_gpgpu_set_base_address_gen7(intel_gpgpu_t *gpgpu)
 {
   const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
   BEGIN_BATCH(gpgpu->batch, 10);
@@ -239,7 +284,9 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
             I915_GEM_DOMAIN_INSTRUCTION,
             I915_GEM_DOMAIN_INSTRUCTION,
             gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 8) | (def_cc << 4) | (0 << 3)| BASE_ADDRESS_MODIFY));
+
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Dynamic State Base Addr */
+
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
   OUT_BATCH(gpgpu->batch, 0 | (def_cc << 8) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
   /* If we output an AUB file, we limit the total size to 64MB */
@@ -260,21 +307,85 @@ intel_gpgpu_set_base_address(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static void
+intel_gpgpu_set_base_address_gen8(intel_gpgpu_t *gpgpu)
+{
+    const uint32_t def_cc = cl_gpgpu_get_cache_ctrl(); /* default Cache Control value */
+    BEGIN_BATCH(gpgpu->batch, 16);
+    OUT_BATCH(gpgpu->batch, CMD_STATE_BASE_ADDRESS | 14);
+    /* 0, Gen State Mem Obj CC, Stateless Mem Obj CC, Stateless Access Write Back */
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY);    /* General State Base Addr   */
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 16));
+    /* 0, State Mem Obj CC */
+    /* We use a state base address for the surface heap since IVB clamps the
+     * binding table pointer at 11 bits. So, we cannot use pointers directly while
+     * using the surface heap.
+     */
+    assert(gpgpu->aux_offset.surface_heap_offset % 4096 == 0);
+    OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
+              I915_GEM_DOMAIN_SAMPLER,
+              I915_GEM_DOMAIN_SAMPLER,
+              gpgpu->aux_offset.surface_heap_offset + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_RELOC(gpgpu->batch, gpgpu->aux_buf.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY)); /* Dynamic State Base Addr */
+    OUT_BATCH(gpgpu->batch, 0);
+    OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Indirect Obj Base Addr */
+    OUT_BATCH(gpgpu->batch, 0);
+    //OUT_BATCH(gpgpu->batch, 0 | (def_cc << 4) | BASE_ADDRESS_MODIFY); /* Instruction Base Addr  */
+    OUT_RELOC(gpgpu->batch, (drm_intel_bo *)gpgpu->ker->bo,
+              I915_GEM_DOMAIN_INSTRUCTION,
+              I915_GEM_DOMAIN_INSTRUCTION,
+              0 + (0 | (def_cc << 4) | (0 << 1)| BASE_ADDRESS_MODIFY));
+    OUT_BATCH(gpgpu->batch, 0);
+    /* If we output an AUB file, we limit the total size to 64MB */
+#if USE_FULSIM
+    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* General State Access Upper Bound */
+    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Dynamic State Access Upper Bound */
+    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Indirect Obj Access Upper Bound */
+    OUT_BATCH(gpgpu->batch, 0x04000000 | BASE_ADDRESS_MODIFY); /* Instruction Access Upper Bound */
+#else
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    /* According to mesa i965 driver code, we must set the dynamic state access upper bound
+     * to a valid bound value, otherwise, the border color pointer may be rejected and you
+     * may get incorrect border color. This is a known hardware bug. */
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+    OUT_BATCH(gpgpu->batch, 0xfffff000 | BASE_ADDRESS_MODIFY);
+#endif /* USE_FULSIM */
+    ADVANCE_BATCH(gpgpu->batch);
+}
+
 uint32_t intel_gpgpu_get_scratch_index_gen7(uint32_t size) {
   return size / 1024 - 1;
 }
 
 uint32_t intel_gpgpu_get_scratch_index_gen75(uint32_t size) {
+    //size is aligned in the backend; if it is not a power of 2, it must be aligned when allocating the scratch bo.
+    assert((size & (size - 1)) == 0);
     size = size >> 11;
     uint32_t index = 0;
     while((size >>= 1) > 0)
       index++;   //get leading one
 
-    //non pow 2 size
-    if(size & (size - 1)) index++;
     return index;
 }
 
+uint32_t intel_gpgpu_get_scratch_index_gen8(uint32_t size) {
+    //size is aligned in the backend; if it is not a power of 2, it must be aligned when allocating the scratch bo.
+    assert((size & (size - 1)) == 0);
+    size = size >> 10;
+    uint32_t index = 0;
+    while((size >>= 1) > 0)
+      index++;   //get leading one
+
+    return index;
+}
+
+
 static cl_int
 intel_gpgpu_get_max_curbe_size(uint32_t device_id)
 {
@@ -299,7 +410,7 @@ intel_gpgpu_get_curbe_size(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_vfe_state_gen7(intel_gpgpu_t *gpgpu)
 {
   int32_t scratch_index;
   BEGIN_BATCH(gpgpu->batch, 8);
@@ -327,7 +438,37 @@ intel_gpgpu_load_vfe_state(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_vfe_state_gen8(intel_gpgpu_t *gpgpu)
+{
+  int32_t scratch_index;
+  BEGIN_BATCH(gpgpu->batch, 9);
+  OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_POINTERS | (9-2));
+
+  if(gpgpu->per_thread_scratch > 0) {
+    scratch_index = intel_gpgpu_get_scratch_index(gpgpu->per_thread_scratch);
+    OUT_RELOC(gpgpu->batch, gpgpu->scratch_b.bo,
+              I915_GEM_DOMAIN_RENDER,
+              I915_GEM_DOMAIN_RENDER,
+              scratch_index);
+  }
+  else {
+    OUT_BATCH(gpgpu->batch, 0);
+  }
+  OUT_BATCH(gpgpu->batch, 0);
+
+  /* max_thread | urb entries | (reset_gateway|bypass_gate_way | gpgpu_mode) */
+  OUT_BATCH(gpgpu->batch, 0 | ((gpgpu->max_threads - 1) << 16) | (2 << 8) | 0xc0); //urb entries can't be 0
+  OUT_BATCH(gpgpu->batch, 0);
+  /* urb entries size | curbe_size */
+  OUT_BATCH(gpgpu->batch, 2<<16 | intel_gpgpu_get_curbe_size(gpgpu));
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_load_curbe_buffer_gen7(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
@@ -338,7 +479,18 @@ intel_gpgpu_load_curbe_buffer(intel_gpgpu_t *gpgpu)
 }
 
 static void
-intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
+intel_gpgpu_load_curbe_buffer_gen8(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 4);
+  OUT_BATCH(gpgpu->batch, CMD(2,0,1) | (4 - 2));  /* length-2 */
+  OUT_BATCH(gpgpu->batch, 0);                     /* mbz */
+  OUT_BATCH(gpgpu->batch, intel_gpgpu_get_curbe_size(gpgpu) * 32);
+  OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.curbe_offset);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+static void
+intel_gpgpu_load_idrt_gen7(intel_gpgpu_t *gpgpu)
 {
   BEGIN_BATCH(gpgpu->batch, 4);
   OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
@@ -348,6 +500,18 @@ intel_gpgpu_load_idrt(intel_gpgpu_t *gpgpu)
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static void
+intel_gpgpu_load_idrt_gen8(intel_gpgpu_t *gpgpu)
+{
+  BEGIN_BATCH(gpgpu->batch, 4);
+  OUT_BATCH(gpgpu->batch, CMD(2,0,2) | (4 - 2)); /* length-2 */
+  OUT_BATCH(gpgpu->batch, 0);                    /* mbz */
+  OUT_BATCH(gpgpu->batch, 1 << 5);
+  OUT_BATCH(gpgpu->batch, gpgpu->aux_offset.idrt_offset);
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
+
 static const uint32_t gpgpu_l3_config_reg1[] = {
   0x00080040, 0x02040040, 0x00800040, 0x01000038,
   0x02000030, 0x01000038, 0x00000038, 0x00000040,
@@ -479,6 +643,23 @@ intel_gpgpu_set_L3_gen75(intel_gpgpu_t *gpgpu, uint32_t use_slm)
 }
 
 static void
+intel_gpgpu_set_L3_gen8(intel_gpgpu_t *gpgpu, uint32_t use_slm)
+{
+  BEGIN_BATCH(gpgpu->batch, 3);
+  OUT_BATCH(gpgpu->batch, CMD_LOAD_REGISTER_IMM | 1); /* length - 2 */
+  OUT_BATCH(gpgpu->batch, GEN8_L3_CNTL_REG_ADDRESS_OFFSET);
+  // FIXME, this is a workaround for a random hang when switching between SLM enabled and disabled
+  if(use_slm)
+    OUT_BATCH(gpgpu->batch, 0x60000121);  /* {SLM=192, URB=128, Rest=384} */
+  else
+    OUT_BATCH(gpgpu->batch, 0x60000160);  /* {SLM=0, URB=384, Rest=384, Sum=768} */
+
+  //if(use_slm)
+  //  gpgpu->batch->enable_slm = 1;
+  intel_gpgpu_pipe_control(gpgpu);
+}
+
+static void
 intel_gpgpu_batch_start(intel_gpgpu_t *gpgpu)
 {
   intel_batchbuffer_start_atomic(gpgpu->batch, 256);
@@ -559,14 +740,6 @@ intel_gpgpu_batch_reset(intel_gpgpu_t *gpgpu, size_t sz)
 {
   return intel_batchbuffer_reset(gpgpu->batch, sz);
 }
-/* check we do not get a 0 starting address for binded buf */
-static void
-intel_gpgpu_check_binded_buf_address(intel_gpgpu_t *gpgpu)
-{
-  uint32_t i;
-  for (i = 0; i < gpgpu->binded_n; ++i)
-    assert(gpgpu->binded_buf[i]->offset != 0);
-}
 
 static void
 intel_gpgpu_flush_batch_buffer(intel_batchbuffer_t *batch)
@@ -582,7 +755,16 @@ intel_gpgpu_flush(intel_gpgpu_t *gpgpu)
   if (!gpgpu->batch || !gpgpu->batch->buffer)
     return;
   intel_gpgpu_flush_batch_buffer(gpgpu->batch);
-  intel_gpgpu_check_binded_buf_address(gpgpu);
+  /* FIXME:
+     The old assert that rejected a bound buffer at offset 0 was removed.
+     It tried to guard kernels that check a buffer pointer against NULL
+     (e.g. "runtime_null_kernel_arg"), but treating offset 0 as NULL is
+     wrong: it made normal kernels, which have no such NULL check, fail
+     whenever one of their buffers was placed at offset 0 (possible now,
+     and the normal case once full PPGTT is on).
+
+     The NULL pointer check needs to be fixed some other way.
+  */
 }
 
 static int
@@ -633,13 +815,13 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
     dri_bo_unreference(gpgpu->aux_buf.bo);
   gpgpu->aux_buf.bo = NULL;
 
-  //surface heap must be 4096 bytes aligned because state base address use 20bit for the address
-  size_aux = ALIGN(size_aux, 4096);
+  /* begin with surface heap to make sure it's page aligned,
+     because state base address use 20bit for the address */
   gpgpu->aux_offset.surface_heap_offset = size_aux;
   size_aux += sizeof(surface_heap_t);
 
   //curbe must be 32 bytes aligned
-  size_aux = ALIGN(size_aux, 32);
+  size_aux = ALIGN(size_aux, 64);
   gpgpu->aux_offset.curbe_offset = size_aux;
   size_aux += gpgpu->curb.num_cs_entries * gpgpu->curb.size_cs_entry * 32;
 
@@ -658,7 +840,10 @@ intel_gpgpu_state_init(intel_gpgpu_t *gpgpu,
   gpgpu->aux_offset.sampler_border_color_state_offset = size_aux;
   size_aux += GEN_MAX_SAMPLERS * sizeof(gen7_sampler_border_color_t);
 
-  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 0);
+  /* make sure aux buffer is page aligned */
+  size_aux = ALIGN(size_aux, 4096);
+
+  bo = dri_bo_alloc(gpgpu->drv->bufmgr, "AUX_BUFFER", size_aux, 4096);
   if (!bo || dri_bo_map(bo, 1) != 0) {
     fprintf(stderr, "%s:%d: %s.\n", __FILE__, __LINE__, strerror(errno));
     if (bo)
@@ -690,87 +875,68 @@ intel_gpgpu_set_buf_reloc_gen7(intel_gpgpu_t *gpgpu, int32_t index, dri_bo* obj_
 }
 
 static dri_bo*
-intel_gpgpu_alloc_constant_buffer_gen7(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+intel_gpgpu_alloc_constant_buffer(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
 {
-  uint32_t s = size - 1;
-  assert(size != 0);
-
-  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
-  memset(ss2, 0, sizeof(gen7_surface_state_t));
-  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
-  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
-  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
-  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
-  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
-  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
-
   if(gpgpu->constant_b.bo)
     dri_bo_unreference(gpgpu->constant_b.bo);
-  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
+  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", size, 64);
   if (gpgpu->constant_b.bo == NULL)
     return NULL;
-  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
-  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
-                      I915_GEM_DOMAIN_RENDER,
-                      I915_GEM_DOMAIN_RENDER,
-                      0,
-                      gpgpu->aux_offset.surface_heap_offset +
-                      heap->binding_table[bti] +
-                      offsetof(gen7_surface_state_t, ss1),
-                      gpgpu->constant_b.bo);
+
+  intel_gpgpu_setup_bti(gpgpu, gpgpu->constant_b.bo, 0, size, bti, I965_SURFACEFORMAT_R32G32B32A32_UINT);
   return gpgpu->constant_b.bo;
 }
 
-static dri_bo*
-intel_gpgpu_alloc_constant_buffer_gen75(intel_gpgpu_t *gpgpu, uint32_t size, uint8_t bti)
+static void
+intel_gpgpu_setup_bti_gen7(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+                                   uint32_t size, unsigned char index, uint32_t format)
 {
   uint32_t s = size - 1;
-  assert(size != 0);
-
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss2 = (gen7_surface_state_t *) heap->surface[bti];
-  memset(ss2, 0, sizeof(gen7_surface_state_t));
-  ss2->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss2->ss0.surface_format = I965_SURFACEFORMAT_R32G32B32A32_UINT;
-  ss2->ss2.width  = s & 0x7f;            /* bits 6:0 of sz */
-  ss2->ss2.height = (s >> 7) & 0x3fff;   /* bits 20:7 of sz */
-  ss2->ss3.depth  = (s >> 21) & 0x3ff;   /* bits 30:21 of sz */
-  ss2->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
-  ss2->ss7.shader_r = I965_SURCHAN_SELECT_RED;
-  ss2->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
-  ss2->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
-  ss2->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
-  heap->binding_table[bti] = offsetof(surface_heap_t, surface) + bti* sizeof(gen7_surface_state_t);
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
+  memset(ss0, 0, sizeof(gen7_surface_state_t));
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = format;
+  ss0->ss2.width  = s & 0x7f;   /* bits 6:0 of sz */
+  // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte.
+  if(format == I965_SURFACEFORMAT_RAW)
+    assert((ss0->ss2.width & 0x03) == 3);
+  ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+  ss0->ss3.depth  = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+  ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen7_surface_state_t);
 
-  if(gpgpu->constant_b.bo)
-    dri_bo_unreference(gpgpu->constant_b.bo);
-  gpgpu->constant_b.bo = drm_intel_bo_alloc(gpgpu->drv->bufmgr, "CONSTANT_BUFFER", s, 64);
-  if (gpgpu->constant_b.bo == NULL)
-    return NULL;
-  ss2->ss1.base_addr = gpgpu->constant_b.bo->offset;
+  ss0->ss1.base_addr = buf->offset + internal_offset;
   dri_bo_emit_reloc(gpgpu->aux_buf.bo,
                       I915_GEM_DOMAIN_RENDER,
                       I915_GEM_DOMAIN_RENDER,
-                      0,
+                      internal_offset,
                       gpgpu->aux_offset.surface_heap_offset +
-                      heap->binding_table[bti] +
+                      heap->binding_table[index] +
                       offsetof(gen7_surface_state_t, ss1),
-                      gpgpu->constant_b.bo);
-  return gpgpu->constant_b.bo;
+                      buf);
 }
 
 static void
-intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset, uint32_t size, unsigned char index)
+intel_gpgpu_setup_bti_gen75(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+                                   uint32_t size, unsigned char index, uint32_t format)
 {
   uint32_t s = size - 1;
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) heap->surface[index];
+  gen7_surface_state_t *ss0 = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
   memset(ss0, 0, sizeof(gen7_surface_state_t));
   ss0->ss0.surface_type = I965_SURFACE_BUFFER;
-  ss0->ss0.surface_format = I965_SURFACEFORMAT_RAW;
+  ss0->ss0.surface_format = format;
+  if(format != I965_SURFACEFORMAT_RAW) {
+    ss0->ss7.shader_r = I965_SURCHAN_SELECT_RED;
+    ss0->ss7.shader_g = I965_SURCHAN_SELECT_GREEN;
+    ss0->ss7.shader_b = I965_SURCHAN_SELECT_BLUE;
+    ss0->ss7.shader_a = I965_SURCHAN_SELECT_ALPHA;
+  }
   ss0->ss2.width  = s & 0x7f;   /* bits 6:0 of sz */
+  // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte.
+  if(format == I965_SURFACEFORMAT_RAW)
+    assert((ss0->ss2.width & 0x03) == 3);
   ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
   ss0->ss3.depth  = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
   ss0->ss5.cache_control = cl_gpgpu_get_cache_ctrl();
@@ -787,6 +953,41 @@ intel_gpgpu_setup_bti(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal
                       buf);
 }
 
+static void
+intel_gpgpu_setup_bti_gen8(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t internal_offset,
+                                   uint32_t size, unsigned char index, uint32_t format)
+{
+  /* Describe `size` bytes of buf, starting at internal_offset, as a Gen8 buffer
+     surface in heap slot `index`, and emit the reloc for its base address. */
+  uint32_t s = size - 1; /* the surface state stores size - 1, split over width/height/depth */
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen8_surface_state_t *ss0 = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
+  assert(size != 0); /* otherwise size - 1 wraps and describes a maximal surface */
+  memset(ss0, 0, sizeof(gen8_surface_state_t));
+  ss0->ss0.surface_type = I965_SURFACE_BUFFER;
+  ss0->ss0.surface_format = format;
+  if(format != I965_SURFACEFORMAT_RAW) {
+    ss0->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+    ss0->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+    ss0->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+    ss0->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+  }
+  ss0->ss2.width  = s & 0x7f;   /* bits 6:0 of sz */
+  // Per bspec, I965_SURFACE_BUFFER and RAW format, size must be a multiple of 4 byte.
+  if(format == I965_SURFACEFORMAT_RAW)
+    assert((ss0->ss2.width & 0x03) == 3);
+  ss0->ss2.height = (s >> 7) & 0x3fff; /* bits 20:7 of sz */
+  ss0->ss3.depth  = (s >> 21) & 0x3ff; /* bits 30:21 of sz */
+  ss0->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) + index * sizeof(gen8_surface_state_t);
+  ss0->ss8.surface_base_addr_lo = (buf->offset64 + internal_offset) & 0xffffffff;
+  ss0->ss9.surface_base_addr_hi = ((buf->offset64 + internal_offset) >> 32) & 0xffffffff;
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                    internal_offset,
+                    gpgpu->aux_offset.surface_heap_offset + heap->binding_table[index] +
+                    offsetof(gen8_surface_state_t, ss8),
+                    buf);
+}
 
 static int
 intel_is_surface_array(cl_mem_object_type type)
@@ -827,8 +1028,10 @@ intel_get_surface_type(cl_mem_object_type type)
 static uint32_t get_surface_type(intel_gpgpu_t *gpgpu, int index, cl_mem_object_type type)
 {
   uint32_t surface_type;
-  if (((IS_IVYBRIDGE(gpgpu->drv->device_id) || IS_HASWELL(gpgpu->drv->device_id))) &&
-      index >= 128 + BTI_RESERVED_NUM &&
+  if (((IS_IVYBRIDGE(gpgpu->drv->device_id) ||
+        IS_HASWELL(gpgpu->drv->device_id) ||
+        IS_BROADWELL(gpgpu->drv->device_id))) &&
+      index >= BTI_MAX_IMAGE_NUM + BTI_RESERVED_NUM &&
       type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     surface_type = I965_SURFACE_2D;
   else
@@ -850,7 +1053,7 @@ intel_gpgpu_bind_image_gen7(intel_gpgpu_t *gpgpu,
                               int32_t tiling)
 {
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+  gen7_surface_state_t *ss = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
 
   memset(ss, 0, sizeof(*ss));
   ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
@@ -896,7 +1099,7 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
                               int32_t tiling)
 {
   surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
-  gen7_surface_state_t *ss = (gen7_surface_state_t *) heap->surface[index];
+  gen7_surface_state_t *ss = (gen7_surface_state_t *) &heap->surface[index * sizeof(gen7_surface_state_t)];
   memset(ss, 0, sizeof(*ss));
   ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
   ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
@@ -931,6 +1134,71 @@ intel_gpgpu_bind_image_gen75(intel_gpgpu_t *gpgpu,
 }
 
 static void
+intel_gpgpu_bind_image_gen8(intel_gpgpu_t *gpgpu,
+                            uint32_t index,
+                            dri_bo* obj_bo,
+                            uint32_t obj_bo_offset,
+                            uint32_t format,
+                            cl_mem_object_type type,
+                            int32_t w,
+                            int32_t h,
+                            int32_t depth,
+                            int32_t pitch,
+                            int32_t tiling)
+{
+  /* Fill Gen8 surface state slot `index` of the aux buffer heap for an image,
+     point the binding table at it and emit the reloc for its base address. */
+  assert(index < GEN_MAX_SURFACES); /* checked before the slot is written */
+  surface_heap_t *heap = gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.surface_heap_offset;
+  gen8_surface_state_t *ss = (gen8_surface_state_t *) &heap->surface[index * sizeof(gen8_surface_state_t)];
+  memset(ss, 0, sizeof(*ss));
+  ss->ss0.vertical_line_stride = 0; // always choose VALIGN_2
+  ss->ss0.surface_type = get_surface_type(gpgpu, index, type);
+  ss->ss0.surface_format = format;
+  if (intel_is_surface_array(type)) {
+    ss->ss0.surface_array = 1;
+    ss->ss1.surface_qpitch = (h + 3)/4;
+  }
+  ss->ss0.horizontal_alignment = 1;
+  ss->ss0.vertical_alignment = 1;
+
+  if (tiling == GPGPU_TILE_X) {
+    ss->ss0.tile_mode = GEN8_TILEMODE_XMAJOR;
+  } else if (tiling == GPGPU_TILE_Y) {
+    ss->ss0.tile_mode = GEN8_TILEMODE_YMAJOR;
+  } else
+    assert(tiling == GPGPU_NO_TILE);// W mode is not supported now.
+
+  ss->ss2.width = w - 1;
+  ss->ss2.height = h - 1;
+  ss->ss3.depth = depth - 1;
+
+  /* Presumed address = bo address + the same delta that is given to the reloc below. */
+  ss->ss8.surface_base_addr_lo = (obj_bo->offset64 + obj_bo_offset) & 0xffffffff;
+  ss->ss9.surface_base_addr_hi = ((obj_bo->offset64 + obj_bo_offset) >> 32) & 0xffffffff;
+
+  ss->ss4.render_target_view_ext = depth - 1;
+  ss->ss4.min_array_elt = 0;
+  ss->ss3.surface_pitch = pitch - 1;
+  ss->ss1.mem_obj_ctrl_state = cl_gpgpu_get_cache_ctrl();
+  ss->ss7.shader_channel_select_red = I965_SURCHAN_SELECT_RED;
+  ss->ss7.shader_channel_select_green = I965_SURCHAN_SELECT_GREEN;
+  ss->ss7.shader_channel_select_blue = I965_SURCHAN_SELECT_BLUE;
+  ss->ss7.shader_channel_select_alpha = I965_SURCHAN_SELECT_ALPHA;
+  ss->ss0.render_cache_rw_mode = 1; /* XXX do we need to set it? */
+
+  heap->binding_table[index] = offsetof(surface_heap_t, surface) +
+                               index * sizeof(gen8_surface_state_t); /* same stride as ss above */
+  dri_bo_emit_reloc(gpgpu->aux_buf.bo,
+                    I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
+                    obj_bo_offset,
+                    gpgpu->aux_offset.surface_heap_offset +
+                    heap->binding_table[index] +
+                    offsetof(gen8_surface_state_t, ss8),
+                    obj_bo);
+}
+
+static void
 intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
                      uint32_t internal_offset, uint32_t size, uint8_t bti)
 {
@@ -939,7 +1207,7 @@ intel_gpgpu_bind_buf(intel_gpgpu_t *gpgpu, drm_intel_bo *buf, uint32_t offset,
   gpgpu->target_buf_offset[gpgpu->binded_n] = internal_offset;
   gpgpu->binded_offset[gpgpu->binded_n] = offset;
   gpgpu->binded_n++;
-  intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti);
+  intel_gpgpu_setup_bti(gpgpu, buf, internal_offset, size, bti, I965_SURFACEFORMAT_RAW);
 }
 
 static int
@@ -972,11 +1240,11 @@ intel_gpgpu_set_stack(intel_gpgpu_t *gpgpu, uint32_t offset, uint32_t size, uint
   drm_intel_bufmgr *bufmgr = gpgpu->drv->bufmgr;
   gpgpu->stack_b.bo = drm_intel_bo_alloc(bufmgr, "STACK", size, 64);
 
-  intel_gpgpu_bind_buf(gpgpu, gpgpu->stack_b.bo, offset, 0, size, bti);
+  cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)gpgpu->stack_b.bo, offset, 0, size, bti);
 }
 
 static void
-intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+intel_gpgpu_build_idrt_gen7(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
 {
   gen6_interface_descriptor_t *desc;
   drm_intel_bo *ker_bo = NULL;
@@ -1031,6 +1299,47 @@ intel_gpgpu_build_idrt(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
                     gpgpu->aux_buf.bo);
 }
 
+static void
+intel_gpgpu_build_idrt_gen8(intel_gpgpu_t *gpgpu, cl_gpgpu_kernel *kernel)
+{
+  /* Build the Gen8 interface descriptor stored at aux_offset.idrt_offset of
+     the mapped aux buffer, using the kernel's curbe size, thread count and
+     SLM settings. */
+  gen8_interface_descriptor_t *idesc =
+    (gen8_interface_descriptor_t*) (gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.idrt_offset);
+  size_t slm_bytes = kernel->slm_sz;
+
+  memset(idesc, 0, sizeof(*idesc));
+  idesc->desc0.kernel_start_pointer = 0; /* reloc */
+  idesc->desc2.single_program_flow = 0;
+  idesc->desc2.floating_point_mode = 0; /* use IEEE-754 rule */
+  idesc->desc6.rounding_mode = 0; /* round to nearest even */
+
+  /* The sampler state pointer is stored in 32-byte units (offset >> 5). */
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_state_offset) % 32 == 0);
+  idesc->desc3.sampler_state_pointer = gpgpu->aux_offset.sampler_state_offset >> 5;
+  idesc->desc4.binding_table_entry_count = 0; /* no prefetch */
+  idesc->desc4.binding_table_pointer = 0;
+  idesc->desc5.curbe_read_len = kernel->curbe_sz / 32;
+  idesc->desc5.curbe_read_offset = 0;
+
+  /* Barriers / SLM are automatically handled on Gen7+ */
+  /* group_threads_num should not be set to 0 even if the barrier is disabled per bspec */
+  idesc->desc6.group_threads_num = kernel->thread_n;
+  idesc->desc6.barrier_enable = kernel->use_slm;
+
+  /* Round a non-zero SLM request up to 4, 8, 16, 32 or 64 KB (anything above
+     32 KB becomes 64 KB); the descriptor field takes the size in 4 KB units. */
+  if (slm_bytes != 0) {
+    size_t rounded = 4*KB;
+    while (rounded < slm_bytes && rounded < 64*KB)
+      rounded *= 2;
+    slm_bytes = rounded;
+  }
+
+  idesc->desc6.slm_sz = slm_bytes >> 12;
+}
+
 static int
 intel_gpgpu_upload_curbes(intel_gpgpu_t *gpgpu, const void* data, uint32_t size)
 {
@@ -1089,7 +1398,7 @@ int translate_wrap_mode(uint32_t cl_address_mode, int using_nearest)
 }
 
 static void
-intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
+intel_gpgpu_insert_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
 {
   int using_nearest = 0;
   uint32_t wrap_mode;
@@ -1152,13 +1461,77 @@ intel_gpgpu_insert_sampler(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sa
 
 }
 
+/* Gen8: translate the OpenCL sampler bits of clk_sampler into sampler state slot `index`. */
+static void
+intel_gpgpu_insert_sampler_gen8(intel_gpgpu_t *gpgpu, uint32_t index, uint32_t clk_sampler)
+{
+  int using_nearest = 0;
+  uint32_t wrap_mode;
+  gen8_sampler_state_t *sampler;
+
+  sampler = (gen8_sampler_state_t *)(gpgpu->aux_buf.bo->virtual + gpgpu->aux_offset.sampler_state_offset)  + index;
+  memset(sampler, 0, sizeof(*sampler));
+  assert((gpgpu->aux_buf.bo->offset + gpgpu->aux_offset.sampler_border_color_state_offset) % 32 == 0); /* NOTE(review): this offset is checked but not otherwise used here */
+  if ((clk_sampler & __CLK_NORMALIZED_MASK) == CLK_NORMALIZED_COORDS_FALSE)
+    sampler->ss3.non_normalized_coord = 1;
+  else
+    sampler->ss3.non_normalized_coord = 0;
+
+  switch (clk_sampler & __CLK_FILTER_MASK) { /* no default: other values keep the zeroed filter fields */
+  case CLK_FILTER_NEAREST:
+    sampler->ss0.min_filter = GEN_MAPFILTER_NEAREST;
+    sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+    sampler->ss0.mag_filter = GEN_MAPFILTER_NEAREST;
+    using_nearest = 1;
+    break;
+  case CLK_FILTER_LINEAR:
+    sampler->ss0.min_filter = GEN_MAPFILTER_LINEAR;
+    sampler->ss0.mip_filter = GEN_MIPFILTER_NONE;
+    sampler->ss0.mag_filter = GEN_MAPFILTER_LINEAR;
+    break;
+  }
+
+  wrap_mode = translate_wrap_mode(clk_sampler & __CLK_ADDRESS_MASK, using_nearest); /* one mode for all three coordinates */
+  sampler->ss3.s_wrap_mode = wrap_mode;
+  /* XXX mesa i965 driver code point out that if the surface is a 1D surface, we may need
+   * to set t_wrap_mode to GEN_TEXCOORDMODE_WRAP. */
+  sampler->ss3.t_wrap_mode = wrap_mode;
+  sampler->ss3.r_wrap_mode = wrap_mode;
+
+  sampler->ss0.lod_preclamp = 1; /* OpenGL mode */
+  sampler->ss0.default_color_mode = 0; /* OpenGL/DX10 mode */
+
+  sampler->ss0.base_level = 0;
+
+  sampler->ss1.max_lod = 0;
+  sampler->ss1.min_lod = 0;
+
+  if (sampler->ss0.min_filter != GEN_MAPFILTER_NEAREST) /* address rounding only for non-nearest filters */
+     sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MIN |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_V_MIN |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_R_MIN;
+  if (sampler->ss0.mag_filter != GEN_MAPFILTER_NEAREST)
+     sampler->ss3.address_round |= GEN_ADDRESS_ROUNDING_ENABLE_U_MAG |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_V_MAG |
+                                   GEN_ADDRESS_ROUNDING_ENABLE_R_MAG;
+}
+
+static void
+intel_gpgpu_bind_sampler_gen7(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+{
+  int index; /* NOTE(review): signed index compared against the size_t sampler_sz below */
+  assert(sampler_sz <= GEN_MAX_SAMPLERS);
+  for(index = 0; index < sampler_sz; index++) /* program one Gen7 sampler state per entry of samplers[] */
+    intel_gpgpu_insert_sampler_gen7(gpgpu, index, samplers[index]);
+}
+
 static void
-intel_gpgpu_bind_sampler(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
+intel_gpgpu_bind_sampler_gen8(intel_gpgpu_t *gpgpu, uint32_t *samplers, size_t sampler_sz)
 {
   int index;
   assert(sampler_sz <= GEN_MAX_SAMPLERS);
   for(index = 0; index < sampler_sz; index++)
-    intel_gpgpu_insert_sampler(gpgpu, index, samplers[index]);
+    intel_gpgpu_insert_sampler_gen8(gpgpu, index, samplers[index]);
 }
 
 static void
@@ -1179,7 +1552,7 @@ intel_gpgpu_set_perf_counters(intel_gpgpu_t *gpgpu, cl_buffer *perf)
 }
 
 static void
-intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
+intel_gpgpu_walker_gen7(intel_gpgpu_t *gpgpu,
                    uint32_t simd_sz,
                    uint32_t thread_n,
                    const size_t global_wk_off[3],
@@ -1224,6 +1597,56 @@ intel_gpgpu_walker(intel_gpgpu_t *gpgpu,
   ADVANCE_BATCH(gpgpu->batch);
 }
 
+static void
+intel_gpgpu_walker_gen8(intel_gpgpu_t *gpgpu,
+                   uint32_t simd_sz,
+                   uint32_t thread_n,
+                   const size_t global_wk_off[3],
+                   const size_t global_wk_sz[3],
+                   const size_t local_wk_sz[3])
+{
+  /* Emit the 15-dword Gen8 GPGPU_WALKER command for this NDRange, then a
+     MEDIA_STATE_FLUSH. global_wk_off is not read by this function. */
+  const size_t items_per_group = local_wk_sz[0] * local_wk_sz[1] * local_wk_sz[2];
+  const uint32_t group_count[3] = {
+    global_wk_sz[0] / local_wk_sz[0],
+    global_wk_sz[1] / local_wk_sz[1],
+    global_wk_sz[2] / local_wk_sz[2]
+  };
+  uint32_t simd_flag, tail_lanes, right_mask;
+  assert(simd_sz == 8 || simd_sz == 16);
+  /* right_mask: (group size mod simd_sz) low bits set, simd_sz bits if that is 0 */
+  tail_lanes = items_per_group & (simd_sz - 1);
+  if (tail_lanes == 0)
+    tail_lanes = simd_sz;
+  right_mask = (1 << tail_lanes) - 1;
+  simd_flag = (simd_sz == 16) ? (1 << 30) : (0 << 30); /* SIMD16 : SIMD8 */
+
+  BEGIN_BATCH(gpgpu->batch, 15);
+  OUT_BATCH(gpgpu->batch, CMD_GPGPU_WALKER | 13);
+  OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
+  OUT_BATCH(gpgpu->batch, 0);                        /* Indirect Data Length */
+  OUT_BATCH(gpgpu->batch, 0);                        /* Indirect Data Start Address */
+  assert(thread_n <= 64);
+  OUT_BATCH(gpgpu->batch, simd_flag | (thread_n-1)); /* SIMD mode | thread max */
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, group_count[0]);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, group_count[1]);
+  OUT_BATCH(gpgpu->batch, 0);
+  OUT_BATCH(gpgpu->batch, group_count[2]);
+  OUT_BATCH(gpgpu->batch, right_mask);
+  OUT_BATCH(gpgpu->batch, ~0x0);                     /* height is always 1, so the bottom mask is all ones */
+  ADVANCE_BATCH(gpgpu->batch);
+
+  BEGIN_BATCH(gpgpu->batch, 2);
+  OUT_BATCH(gpgpu->batch, CMD_MEDIA_STATE_FLUSH | 0);
+  OUT_BATCH(gpgpu->batch, 0);                        /* kernel index == 0 */
+  ADVANCE_BATCH(gpgpu->batch);
+}
+
 static intel_event_t*
 intel_gpgpu_event_new(intel_gpgpu_t *gpgpu)
 {
@@ -1388,7 +1811,7 @@ intel_gpgpu_set_printf_buf(intel_gpgpu_t *gpgpu, uint32_t i, uint32_t size, uint
   }
   memset(bo->virtual, 0, size);
   drm_intel_bo_unmap(bo);
-  intel_gpgpu_bind_buf(gpgpu, bo, offset, 0, size, bti);
+  cl_gpgpu_bind_buf((cl_gpgpu)gpgpu, (cl_buffer)bo, offset, 0, size, bti);
   return 0;
 }
 
@@ -1463,14 +1886,14 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_state_init = (cl_gpgpu_state_init_cb *) intel_gpgpu_state_init;
   cl_gpgpu_set_perf_counters = (cl_gpgpu_set_perf_counters_cb *) intel_gpgpu_set_perf_counters;
   cl_gpgpu_upload_curbes = (cl_gpgpu_upload_curbes_cb *) intel_gpgpu_upload_curbes;
+  cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer;
   cl_gpgpu_states_setup = (cl_gpgpu_states_setup_cb *) intel_gpgpu_states_setup;
   cl_gpgpu_upload_samplers = (cl_gpgpu_upload_samplers_cb *) intel_gpgpu_upload_samplers;
   cl_gpgpu_batch_reset = (cl_gpgpu_batch_reset_cb *) intel_gpgpu_batch_reset;
   cl_gpgpu_batch_start = (cl_gpgpu_batch_start_cb *) intel_gpgpu_batch_start;
   cl_gpgpu_batch_end = (cl_gpgpu_batch_end_cb *) intel_gpgpu_batch_end;
   cl_gpgpu_flush = (cl_gpgpu_flush_cb *) intel_gpgpu_flush;
-  cl_gpgpu_walker = (cl_gpgpu_walker_cb *) intel_gpgpu_walker;
-  cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler;
+  cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen7;
   cl_gpgpu_set_scratch = (cl_gpgpu_set_scratch_cb *) intel_gpgpu_set_scratch;
   cl_gpgpu_event_new = (cl_gpgpu_event_new_cb *)intel_gpgpu_event_new;
   cl_gpgpu_event_flush = (cl_gpgpu_event_flush_cb *)intel_gpgpu_event_flush;
@@ -1487,18 +1910,42 @@ intel_set_gpgpu_callbacks(int device_id)
   cl_gpgpu_set_printf_info = (cl_gpgpu_set_printf_info_cb *)intel_gpgpu_set_printf_info;
   cl_gpgpu_get_printf_info = (cl_gpgpu_get_printf_info_cb *)intel_gpgpu_get_printf_info;
 
+  if (IS_BROADWELL(device_id)) {
+    cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen8;
+    intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen8;
+    cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen8;
+    intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen8;
+    intel_gpgpu_post_action = intel_gpgpu_post_action_gen7; //BDW need not restore SLM, same as gen7
+    intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7;
+    intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen8;
+    intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen8;
+    intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen8;
+    cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen8;
+    intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen8;
+    intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen8;
+    intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen8;
+    cl_gpgpu_bind_sampler = (cl_gpgpu_bind_sampler_cb *) intel_gpgpu_bind_sampler_gen8;
+    return;
+  }
+
+  intel_gpgpu_set_base_address = intel_gpgpu_set_base_address_gen7;
+  intel_gpgpu_load_vfe_state = intel_gpgpu_load_vfe_state_gen7;
+  cl_gpgpu_walker = (cl_gpgpu_walker_cb *)intel_gpgpu_walker_gen7;
+  intel_gpgpu_build_idrt = intel_gpgpu_build_idrt_gen7;
+  intel_gpgpu_load_curbe_buffer = intel_gpgpu_load_curbe_buffer_gen7;
+  intel_gpgpu_load_idrt = intel_gpgpu_load_idrt_gen7;
+
   if (IS_HASWELL(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen75;
-    cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen75;
     intel_gpgpu_set_L3 = intel_gpgpu_set_L3_gen75;
     cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen75;
     intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen75;
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen75;
     intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_gen7; //HSW same as ivb
+    intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen75;
   }
   else if (IS_IVYBRIDGE(device_id)) {
     cl_gpgpu_bind_image = (cl_gpgpu_bind_image_cb *) intel_gpgpu_bind_image_gen7;
-    cl_gpgpu_alloc_constant_buffer  = (cl_gpgpu_alloc_constant_buffer_cb *) intel_gpgpu_alloc_constant_buffer_gen7;
     if (IS_BAYTRAIL_T(device_id)) {
       intel_gpgpu_set_L3 = intel_gpgpu_set_L3_baytrail;
       intel_gpgpu_read_ts_reg = intel_gpgpu_read_ts_reg_baytrail;
@@ -1509,5 +1956,6 @@ intel_set_gpgpu_callbacks(int device_id)
     cl_gpgpu_get_cache_ctrl = (cl_gpgpu_get_cache_ctrl_cb *)intel_gpgpu_get_cache_ctrl_gen7;
     intel_gpgpu_get_scratch_index = intel_gpgpu_get_scratch_index_gen7;
     intel_gpgpu_post_action = intel_gpgpu_post_action_gen7;
+    intel_gpgpu_setup_bti = intel_gpgpu_setup_bti_gen7;
   }
 }
diff --git a/src/intel/intel_gpgpu.h b/src/intel/intel_gpgpu.h
index d593ac7..ad7290e 100644
--- a/src/intel/intel_gpgpu.h
+++ b/src/intel/intel_gpgpu.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -23,10 +23,74 @@
 
 #include "cl_utils.h"
 #include "cl_driver.h"
+#include "intel/intel_batchbuffer.h"
+#include "intel/intel_driver.h"
 
 #include <stdlib.h>
 #include <stdint.h>
 
+
+/* We can bind only a limited number of buffers */
+enum { max_buf_n = 128 };
+
+enum { max_img_n = 128};
+
+enum {max_sampler_n = 16 };
+
+struct intel_driver;
+struct intel_batchbuffer;
+
+/* GPGPU state handle: driver and batch pointers, bound buffers, and the buffer objects used for one kernel launch */
+struct intel_gpgpu
+{
+  void* ker_opaque;
+  size_t global_wk_sz[3];
+  void* printf_info;
+  struct intel_driver *drv;
+  struct intel_batchbuffer *batch;
+  cl_gpgpu_kernel *ker;
+  drm_intel_bo *binded_buf[max_buf_n];  /* all buffers bound for the call */
+  uint32_t target_buf_offset[max_buf_n];/* internal offset of each buffer bound for the call */
+  uint32_t binded_offset[max_buf_n];    /* their offsets in the curbe buffer */
+  uint32_t binded_n;                    /* number of buffers bound; incremented by intel_gpgpu_bind_buf */
+
+  unsigned long img_bitmap;              /* image usage bitmap. */
+  unsigned int img_index_base;          /* base index for image surface.*/
+
+  unsigned long sampler_bitmap;          /* sampler usage bitmap. */
+
+  struct { drm_intel_bo *bo; } stack_b;       /* allocated by intel_gpgpu_set_stack */
+  struct { drm_intel_bo *bo; } perf_b;
+  struct { drm_intel_bo *bo; } scratch_b;
+  struct { drm_intel_bo *bo; } constant_b;    /* (re)allocated by intel_gpgpu_alloc_constant_buffer */
+  struct { drm_intel_bo *bo; } time_stamp_b;  /* time stamp buffer */
+  struct { drm_intel_bo *bo;
+           drm_intel_bo *ibo;} printf_b;      /* the printf buf and index buf*/
+
+  struct { drm_intel_bo *bo; } aux_buf;       /* one buffer holding the state regions listed in aux_offset */
+  struct {                                    /* byte offsets of each region inside aux_buf.bo */
+    uint32_t surface_heap_offset;
+    uint32_t curbe_offset;
+    uint32_t idrt_offset;
+    uint32_t sampler_state_offset;
+    uint32_t sampler_border_color_state_offset;
+  } aux_offset;
+
+  uint32_t per_thread_scratch;
+  struct {
+    uint32_t num_cs_entries;
+    uint32_t size_cs_entry;  /* size of one entry in 512bit elements; NOTE(review): the code multiplies it by 32 bytes (256 bits) - confirm the unit */
+  } curb;
+
+  uint32_t max_threads;      /* max threads requested by the user */
+};
+
+struct intel_gpgpu_node {
+  struct intel_gpgpu *gpgpu;
+  struct intel_gpgpu_node *next;
+};
+
+
 /* Set the gpgpu related call backs */
 extern void intel_set_gpgpu_callbacks(int device_id);
 
diff --git a/src/intel/intel_structs.h b/src/intel/intel_structs.h
index ef76bb4..258fbb9 100644
--- a/src/intel/intel_structs.h
+++ b/src/intel/intel_structs.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -98,74 +98,60 @@ typedef struct gen6_interface_descriptor
   uint32_t desc7; /* unused */
 } gen6_interface_descriptor_t;
 
-typedef struct gen6_surface_state
+typedef struct gen8_interface_descriptor
 {
   struct {
-    uint32_t cube_pos_z:1;
-    uint32_t cube_neg_z:1;
-    uint32_t cube_pos_y:1;
-    uint32_t cube_neg_y:1;
-    uint32_t cube_pos_x:1;
-    uint32_t cube_neg_x:1;
-    uint32_t pad:2;
-    uint32_t render_cache_read_mode:1;
-    uint32_t cube_map_corner_mode:1;
-    uint32_t mipmap_layout_mode:1;
-    uint32_t vert_line_stride_ofs:1;
-    uint32_t vert_line_stride:1;
-    uint32_t color_blend:1;
-    uint32_t writedisable_blue:1;
-    uint32_t writedisable_green:1;
-    uint32_t writedisable_red:1;
-    uint32_t writedisable_alpha:1;
-    uint32_t surface_format:9;
-    uint32_t data_return_format:1;
-    uint32_t pad0:1;
-    uint32_t surface_type:3;
-  } ss0;
+    uint32_t pad6:6;
+    uint32_t kernel_start_pointer:26;
+  } desc0;
+  struct {
+    uint32_t kernel_start_pointer_high:16;
+    uint32_t pad6:16;
+  } desc1;
 
   struct {
-    uint32_t base_addr;
-  } ss1;
+    uint32_t pad:7;
+    uint32_t software_exception:1;
+    uint32_t pad2:3;
+    uint32_t maskstack_exception:1;
+    uint32_t pad3:1;
+    uint32_t illegal_opcode_exception:1;
+    uint32_t pad4:2;
+    uint32_t floating_point_mode:1;
+    uint32_t thread_priority:1;
+    uint32_t single_program_flow:1;
+    uint32_t denorm_mode:1;
+    uint32_t thread_preemption_disable:1;
+    uint32_t pad5:11;
+  } desc2;
 
   struct {
-    uint32_t render_target_rotation:2;
-    uint32_t mip_count:4;
-    uint32_t width:13;
-    uint32_t height:13;
-  } ss2;
+    uint32_t pad:2;
+    uint32_t sampler_count:3;
+    uint32_t sampler_state_pointer:27;
+  } desc3;
 
   struct {
-    uint32_t tile_walk:1;
-    uint32_t tiled_surface:1;
-    uint32_t pad:1;
-    uint32_t pitch:18;
-    uint32_t depth:11;
-  } ss3;
+    uint32_t binding_table_entry_count:5;  /* prefetch entries only */
+    uint32_t binding_table_pointer:27;     /* 11 bit only on IVB+ */
+  } desc4;
 
   struct {
-    uint32_t multisample_pos_index:3;
-    uint32_t pad:1;
-    uint32_t multisample_count:3;
-    uint32_t pad1:1;
-    uint32_t rt_view_extent:9;
-    uint32_t min_array_elt:11;
-    uint32_t min_lod:4;
-  } ss4;
+    uint32_t curbe_read_offset:16;         /* in GRFs */
+    uint32_t curbe_read_len:16;            /* in GRFs */
+  } desc5;
 
   struct {
-    uint32_t pad:16;
-    uint32_t cache_control:2;  /* different values for GT and IVB */
-    uint32_t gfdt:1;           /* allows selective flushing of LLC (e.g. for scanout) */
-    uint32_t encrypted_data:1;
-    uint32_t y_offset:4;
-    uint32_t vertical_alignment:1;
-    uint32_t x_offset:7;
-  } ss5;
+    uint32_t group_threads_num:8;        /* 0..64, 0 - no barrier use */
+    uint32_t barrier_return_byte:8;
+    uint32_t slm_sz:5;                   /* 0..16 - 0K..64K */
+    uint32_t barrier_enable:1;
+    uint32_t rounding_mode:2;
+    uint32_t barrier_return_grf_offset:8;
+  } desc6;
 
-  uint32_t ss6; /* unused */
-  uint32_t ss7; /* unused */
-} gen6_surface_state_t;
+  uint32_t desc7; /* unused */
+} gen8_interface_descriptor_t;
 
 typedef struct gen7_surface_state
 {
@@ -246,8 +232,161 @@ typedef struct gen7_surface_state
   } ss7;
 } gen7_surface_state_t;
 
-STATIC_ASSERT(sizeof(gen6_surface_state_t) == sizeof(gen7_surface_state_t));
-static const size_t surface_state_sz = sizeof(gen6_surface_state_t);
+typedef struct gen8_surface_state
+{
+  struct {
+    uint32_t cube_pos_z:1;
+    uint32_t cube_neg_z:1;
+    uint32_t cube_pos_y:1;
+    uint32_t cube_neg_y:1;
+    uint32_t cube_pos_x:1;
+    uint32_t cube_neg_x:1;
+    uint32_t media_boundary_pixel_mode:2;
+    uint32_t render_cache_rw_mode:1;
+    uint32_t sampler_L2_bypass_mode:1;
+    uint32_t vertical_line_stride_offset:1;
+    uint32_t vertical_line_stride:1;
+    uint32_t tile_mode:2;
+    uint32_t horizontal_alignment:2;
+    uint32_t vertical_alignment:2;
+    uint32_t surface_format:9;
+    uint32_t pad0:1;
+    uint32_t surface_array:1;
+    uint32_t surface_type:3;
+  } ss0;
+
+  struct {
+    uint32_t surface_qpitch:15;
+    uint32_t pad0:3;
+    uint32_t pad1:1;
+    uint32_t base_mip_level:5;
+    uint32_t mem_obj_ctrl_state:7;
+    uint32_t pad2:1;
+  } ss1;
+
+  struct {
+    uint32_t width:14;
+    uint32_t pad1:2;
+    uint32_t height:14;
+    uint32_t pad0:2;
+  } ss2;
+
+  struct {
+    uint32_t surface_pitch:18;
+    uint32_t pad1:2;
+    uint32_t pad0:1;
+    uint32_t depth:11;
+  } ss3;
+
+  struct {
+    union {
+      struct {
+        uint32_t multisample_pos_palette_idx:3;
+        uint32_t multisample_num:3;
+        uint32_t multisample_format:1;
+        uint32_t render_target_view_ext:11;
+        uint32_t min_array_elt:11;
+        uint32_t render_target_and_sample_rotation:2;
+        uint32_t pad1:1;
+      };
+
+      uint32_t pad0;
+    };
+  } ss4;
+
+  struct {
+    uint32_t mip_count:4;
+    uint32_t surface_min_lod:4;
+    uint32_t pad5:4;
+    uint32_t pad4:2;
+    uint32_t conherency_type:1;
+    uint32_t pad3:3;
+    uint32_t pad2:2;
+    uint32_t cube_ewa:1;
+    uint32_t y_offset:3;
+    uint32_t pad0:1;
+    uint32_t x_offset:7;
+  } ss5;
+
+  struct {
+    union {
+      union {
+        struct {
+          uint32_t aux_surface_mode:3;
+          uint32_t aux_surface_pitch:9;
+          uint32_t pad3:4;
+        };
+        struct {
+          uint32_t uv_plane_y_offset:14;
+          uint32_t pad2:2;
+        };
+      };
+
+      struct {
+        uint32_t uv_plane_x_offset:14;
+        uint32_t pad1:1;
+        uint32_t seperate_uv_plane_enable:1;
+      };
+      struct {
+        uint32_t aux_sruface_qpitch:15;
+        uint32_t pad0:1;
+      };
+    };
+  } ss6;
+
+  struct {
+    uint32_t resource_min_lod:12;
+    uint32_t pad0:4;
+    uint32_t shader_channel_select_alpha:3;
+    uint32_t shader_channel_select_blue:3;
+    uint32_t shader_channel_select_green:3;
+    uint32_t shader_channel_select_red:3;
+    uint32_t alpha_clear_color:1;
+    uint32_t blue_clear_color:1;
+    uint32_t green_clear_color:1;
+    uint32_t red_clear_color:1;
+  } ss7;
+
+  struct {
+    uint32_t surface_base_addr_lo;
+  } ss8;
+
+  struct {
+    uint32_t surface_base_addr_hi;
+  } ss9;
+
+	struct {
+		uint32_t pad0:12;
+		uint32_t aux_base_addr_lo:20;
+	} ss10;
+
+	struct {
+		uint32_t aux_base_addr_hi:32;
+	} ss11;
+
+  struct {
+    uint32_t pad0;
+  } ss12;
+
+  /* 13~15 have meaning only when aux surface mode == AUX_HIZ */
+  struct {
+    uint32_t pad0;
+  } ss13;
+  struct {
+    uint32_t pad0;
+  } ss14;
+  struct {
+    uint32_t pad0;
+  } ss15;
+} gen8_surface_state_t;
+
+typedef union gen_surface_state
+{
+  gen7_surface_state_t gen7_surface_state;
+  gen8_surface_state_t gen8_surface_state;
+} gen_surface_state_t;
+
+static const size_t surface_state_sz = sizeof(gen_surface_state_t);
 
 typedef struct gen6_vfe_state_inline
 {
@@ -454,6 +593,61 @@ typedef struct gen7_sampler_state
 
 STATIC_ASSERT(sizeof(gen6_sampler_state_t) == sizeof(gen7_sampler_state_t));
 
+typedef struct gen8_sampler_state
+{
+  struct {
+    uint32_t aniso_algorithm:1;
+    uint32_t lod_bias:13;
+    uint32_t min_filter:3;
+    uint32_t mag_filter:3;
+    uint32_t mip_filter:2;
+    uint32_t base_level:5;
+    uint32_t lod_preclamp:2;
+    uint32_t default_color_mode:1;
+    uint32_t pad0:1;
+    uint32_t disable:1;
+  } ss0;
+
+  struct {
+    uint32_t cube_control_mode:1;
+    uint32_t shadow_function:3;
+    uint32_t chromakey_mode:1;
+    uint32_t chromakey_index:2;
+    uint32_t chromakey_enable:1;
+    uint32_t max_lod:12;
+    uint32_t min_lod:12;
+  } ss1;
+
+  struct {
+    uint32_t lod_clamp_mag_mode:1;
+    uint32_t flexible_filter_valign:1;
+    uint32_t flexible_filter_halign:1;
+    uint32_t flexible_filter_coeff_size:1;
+    uint32_t flexible_filter_mode:1;
+    uint32_t pad1:1;
+    uint32_t indirect_state_ptr:18;
+    uint32_t pad0:2;
+    uint32_t sep_filter_height:2;
+    uint32_t sep_filter_width:2;
+    uint32_t sep_filter_coeff_table_size:2;
+  } ss2;
+
+  struct {
+    uint32_t r_wrap_mode:3;
+    uint32_t t_wrap_mode:3;
+    uint32_t s_wrap_mode:3;
+    uint32_t pad:1;
+    uint32_t non_normalized_coord:1;
+    uint32_t trilinear_quality:2;
+    uint32_t address_round:6;
+    uint32_t max_aniso:3;
+    uint32_t pad0:2;
+    uint32_t non_sep_filter_footprint_mask:8;
+  } ss3;
+} gen8_sampler_state_t;
+
+STATIC_ASSERT(sizeof(gen6_sampler_state_t) == sizeof(gen8_sampler_state_t));
+
 #undef BITFIELD_BIT
 #undef BITFIELD_RANGE
 
diff --git a/src/kernels/cl_internal_copy_buf_rect_align4.cl b/src/kernels/cl_internal_copy_buf_rect_align4.cl
new file mode 100644
index 0000000..fbfe7b2
--- /dev/null
+++ b/src/kernels/cl_internal_copy_buf_rect_align4.cl
@@ -0,0 +1,15 @@
+kernel void __cl_copy_buffer_rect_align4 ( global int* src, global int* dst,
+                                          unsigned int region0, unsigned int region1, unsigned int region2,
+                                          unsigned int src_offset, unsigned int dst_offset,
+                                          unsigned int src_row_pitch, unsigned int src_slice_pitch,
+                                          unsigned int dst_row_pitch, unsigned int dst_slice_pitch)
+{
+  int i = get_global_id(0);
+  int j = get_global_id(1);
+  int k = get_global_id(2);
+  if((i >= region0) || (j>= region1) || (k>=region2))
+    return;
+  src_offset += k * src_slice_pitch + j * src_row_pitch + i;
+  dst_offset += k * dst_slice_pitch + j * dst_row_pitch + i;
+  dst[dst_offset] = src[src_offset];
+}
diff --git a/src/x11/dricommon.c b/src/x11/dricommon.c
index bd4ac50..03f542c 100644
--- a/src/x11/dricommon.c
+++ b/src/x11/dricommon.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/x11/dricommon.h b/src/x11/dricommon.h
index 5a950b4..aa3a50f 100644
--- a/src/x11/dricommon.h
+++ b/src/x11/dricommon.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/x11/mesa_egl_extension.c b/src/x11/mesa_egl_extension.c
index a7fc8cb..4a3e89c 100644
--- a/src/x11/mesa_egl_extension.c
+++ b/src/x11/mesa_egl_extension.c
@@ -123,7 +123,6 @@ _eglLockDisplay(EGLDisplay dpy)
 static _EGLContext *
 _eglLookupContext(EGLContext ctx, EGLDisplay disp)
 {
-  disp = disp;
   return (_EGLContext *) ctx;
 }
 
diff --git a/src/x11/va_dri2.c b/src/x11/va_dri2.c
index 5225acd..8779fa5 100644
--- a/src/x11/va_dri2.c
+++ b/src/x11/va_dri2.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/x11/va_dri2.h b/src/x11/va_dri2.h
index 1a1f96e..78e859c 100644
--- a/src/x11/va_dri2.h
+++ b/src/x11/va_dri2.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/x11/va_dri2str.h b/src/x11/va_dri2str.h
index db10e16..d97050f 100644
--- a/src/x11/va_dri2str.h
+++ b/src/x11/va_dri2str.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/src/x11/va_dri2tokens.h b/src/x11/va_dri2tokens.h
index d3c31f3..022609d 100644
--- a/src/x11/va_dri2tokens.h
+++ b/src/x11/va_dri2tokens.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/CMakeLists.txt b/utests/CMakeLists.txt
index 9c531de..8cc8b43 100644
--- a/utests/CMakeLists.txt
+++ b/utests/CMakeLists.txt
@@ -22,7 +22,6 @@ set (utests_sources
   utest_error.c
   compiler_basic_arithmetic.cpp
   compiler_displacement_map_element.cpp
-  compiler_shader_toy.cpp
   compiler_mandelbrot.cpp
   compiler_mandelbrot_alternate.cpp
   compiler_box_blur_float.cpp
@@ -41,6 +40,7 @@ set (utests_sources
   compiler_ceil.cpp
   compiler_clz_short.cpp
   compiler_clz_int.cpp
+  compiler_popcount.cpp
   compiler_convert_uchar_sat.cpp
   compiler_copy_buffer.cpp
   compiler_copy_image.cpp
@@ -103,6 +103,8 @@ set (utests_sources
   compiler_write_only.cpp
   compiler_write_only_shorts.cpp
   compiler_switch.cpp
+  compiler_overflow.cpp
+  compiler_bswap.cpp
   compiler_math.cpp
   compiler_atomic_functions.cpp
   compiler_async_copy.cpp
@@ -152,6 +154,7 @@ set (utests_sources
   builtin_convert_sat.cpp
   sub_buffer.cpp
   runtime_createcontext.cpp
+  runtime_set_kernel_arg.cpp
   runtime_null_kernel_arg.cpp
   runtime_event.cpp
   runtime_barrier_list.cpp
@@ -172,6 +175,7 @@ set (utests_sources
   compiler_getelementptr_bitcast.cpp
   compiler_simd_any.cpp
   compiler_simd_all.cpp
+  compiler_time_stamp.cpp
   compiler_double_precision.cpp
   load_program_from_bin_file.cpp
   load_program_from_gen_bin.cpp
@@ -185,7 +189,12 @@ set (utests_sources
   builtin_kernel_max_global_size.cpp
   image_1D_buffer.cpp
   compare_image_2d_and_1d_array.cpp
+  compiler_fill_image_1d_array.cpp
+  compiler_fill_image_2d_array.cpp
   compiler_constant_expr.cpp
+  compiler_assignment_operation_in_if.cpp
+  vload_bench.cpp
+  runtime_use_host_ptr_buffer.cpp
   utest_assert.cpp
   utest.cpp
   utest_file_map.cpp
@@ -205,6 +214,11 @@ else(GEN_PCI_ID)
   DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/../backend/src/gbe_bin_generater ${kernel_bin}.cl)
 endif(GEN_PCI_ID)
 
+if (DRM_INTEL_USERPTR)
+SET(CMAKE_CXX_FLAGS "-DHAS_USERPTR ${CMAKE_CXX_FLAGS}")
+SET(CMAKE_C_FLAGS "-DHAS_USERPTR ${CMAKE_C_FLAGS}")
+endif (DRM_INTEL_USERPTR)
+
 ADD_CUSTOM_TARGET(kernel_bin.bin
     DEPENDS ${kernel_bin}.bin)
 
@@ -228,6 +242,10 @@ else()
 SET(UTESTS_REQUIRED_EGL_LIB "")
 endif()
 
+if (COMPILER STREQUAL "CLANG")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-tautological-compare")
+endif ()
+
 ADD_LIBRARY(utests SHARED ${ADDMATHFUNC} ${utests_sources})
 
 TARGET_LINK_LIBRARIES(utests cl m ${OPENGL_LIBRARIES} ${UTESTS_REQUIRED_EGL_LIB} ${CMAKE_THREAD_LIBS_INIT})
diff --git a/utests/builtin_kernel_max_global_size.cpp b/utests/builtin_kernel_max_global_size.cpp
index c777564..e6910cd 100644
--- a/utests/builtin_kernel_max_global_size.cpp
+++ b/utests/builtin_kernel_max_global_size.cpp
@@ -18,12 +18,13 @@ void builtin_kernel_max_global_size(void)
   OCL_ASSERT(builtin_kernel_1d != NULL);
   size_t param_value_size;
   void* param_value;
-  clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, 0, NULL, &param_value_size);
+  OCL_CALL(clGetKernelWorkGroupInfo, builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, 0, NULL, &param_value_size);
   param_value = malloc(param_value_size);
-  clGetKernelWorkGroupInfo(builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, param_value_size, param_value, 0);
+  OCL_CALL(clGetKernelWorkGroupInfo, builtin_kernel_1d, device, CL_KERNEL_GLOBAL_WORK_SIZE, param_value_size, param_value, 0);
   OCL_ASSERT(*(size_t*)param_value == 256 * 1024 *1024);
   clReleaseKernel(builtin_kernel_1d);
   clReleaseProgram(built_in_prog);
+  free(built_in_kernel_names);
   free(param_value);
 }
 
diff --git a/utests/builtin_pow.cpp b/utests/builtin_pow.cpp
index 8ed17ed..93863a5 100644
--- a/utests/builtin_pow.cpp
+++ b/utests/builtin_pow.cpp
@@ -1,6 +1,7 @@
 #include "utest_helper.hpp"
 #include <cmath>
 #include <algorithm>
+#include <string.h>
 
 #define udebug 0
 #define printf_c(...) \
@@ -15,12 +16,12 @@ const int count_input = count_input_ori * count_input_ori;
 
 float input_data1[count_input];
 float input_data2[count_input];
-const int max_function = 1;
+const int max_function = 2; // builtin_pow.cl has 2 outputs: pow(src1,src2) and src1
 
 static void cpu_compiler_math(const float *src1, const float *src2, float *dst)
 {
   dst[0] = powf(src1[0], src2[0]);
-//  dst[1] = src1[0];
+  dst[1] = src1[0];
 }
 
 static void builtin_pow(void)
@@ -36,6 +37,11 @@ static void builtin_pow(void)
       input_data2[i*count_input_ori+k] = ori_data[k];
     }
 
+  const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+  float ULPSIZE_FACTOR = 1.0;
+  if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+    ULPSIZE_FACTOR = 10000.;
+
   OCL_CREATE_KERNEL("builtin_pow");
 
   OCL_CREATE_BUFFER(buf[0], CL_MEM_READ_WRITE, count_input * max_function * sizeof(float), NULL);
@@ -69,7 +75,7 @@ static void builtin_pow(void)
 #if udebug
       if ( (isinf(cpu_data[index_cur]) && !isinf(gpu_data[index_cur])) ||
            (isnan(cpu_data[index_cur]) && !isnan(gpu_data[index_cur])) ||
-           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > 1e-5f)   )
+           (fabs(gpu_data[index_cur] - cpu_data[index_cur]) > cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR)   ) /* outside the ULP tolerance */
       {
         printf_c("%d/%d: x:%f, y:%f -> gpu:%f  cpu:%f\n", k, i, input_data1[k], input_data2[k], gpu_data[index_cur], cpu_data[index_cur]);
       }
@@ -82,11 +88,11 @@ static void builtin_pow(void)
        OCL_ASSERT(isnan(gpu_data[index_cur]));
      else
      {
-       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < 1e-3f);
+       OCL_ASSERT(fabs(gpu_data[index_cur] - cpu_data[index_cur]) < cl_FLT_ULP(cpu_data[index_cur]) * ULPSIZE_FACTOR);
      }
 #endif
     }
   }
 }
 
-MAKE_UTEST_FROM_FUNCTION_WITH_ISSUE(builtin_pow)
+MAKE_UTEST_FROM_FUNCTION(builtin_pow)
diff --git a/utests/builtin_tgamma.cpp b/utests/builtin_tgamma.cpp
index 4c824d0..16dac97 100644
--- a/utests/builtin_tgamma.cpp
+++ b/utests/builtin_tgamma.cpp
@@ -1,5 +1,6 @@
 #include <cmath>
 #include "utest_helper.hpp"
+#include <string.h>
 
 void builtin_tgamma(void)
 {
@@ -14,6 +15,10 @@ void builtin_tgamma(void)
   OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
   globals[0] = n;
   locals[0] = 16;
+  const char* env_strict = getenv("OCL_STRICT_CONFORMANCE");
+  float ULPSIZE_FACTOR = 1.0;
+  if (env_strict == NULL || strcmp(env_strict, "0") == 0)
+    ULPSIZE_FACTOR = 10000.;
 
   for (int j = 0; j < 1024; j ++) {
     OCL_MAP_BUFFER(0);
@@ -27,10 +32,10 @@ void builtin_tgamma(void)
     OCL_MAP_BUFFER(1);
     float *dst = (float*)buf_data[1];
     for (int i = 0; i < n; ++i) {
-      float cpu = gammaf(src[i]);
+      float cpu = tgammaf(src[i]);
       if (isinf(cpu)) {
         OCL_ASSERT(isinf(dst[i]));
-      } else if (fabsf(cpu - dst[i]) >= 1e-3) {
+      } else if (fabsf(cpu - dst[i]) >= cl_FLT_ULP(cpu) * ULPSIZE_FACTOR) {
         printf("%f %f %f\n", src[i], cpu, dst[i]);
         OCL_ASSERT(0);
       }
diff --git a/utests/compare_image_2d_and_1d_array.cpp b/utests/compare_image_2d_and_1d_array.cpp
index f2c828e..a2de507 100644
--- a/utests/compare_image_2d_and_1d_array.cpp
+++ b/utests/compare_image_2d_and_1d_array.cpp
@@ -8,6 +8,10 @@ static void compare_image_2d_and_1d_array(void)
   cl_image_format format;
   cl_image_desc desc;
   cl_sampler sampler;
+  uint32_t* dst0;
+  uint32_t* dst1;
+  size_t origin[3] = { };
+  size_t region[3];
 
   // Create the 1D array buffer.
   memset(&desc, 0x0, sizeof(cl_image_desc));
@@ -60,19 +64,26 @@ static void compare_image_2d_and_1d_array(void)
   locals[1] = 16;
   OCL_NDRANGE(2);
 
-  OCL_MAP_BUFFER_GTT(0);
-  OCL_MAP_BUFFER_GTT(1);
+  // Check result
+  region[0] = w;
+  region[1] = h;
+  region[2] = 1;
+  dst0 = (uint32_t*)malloc(w*h*sizeof(uint32_t));
+  dst1 = (uint32_t*)malloc(w*h*sizeof(uint32_t));
+  OCL_READ_IMAGE(buf[0], origin, region, dst0);
+  OCL_READ_IMAGE(buf[1], origin, region, dst1);
+
   for (int j = 0; j < h; ++j) {
     for (int i = 0; i < w; i++) {
       // Because the array index will not join the sample caculation, the result should
       // be different between the 2D and 1D_array.
       if (j % 2 == 0)
-        OCL_ASSERT(((uint32_t*)buf_data[0])[j * w + i] == ((uint32_t*)buf_data[1])[j * w + i]);
+        OCL_ASSERT(dst0[j * w + i] == dst1[j * w + i]);
     }
   }
-  OCL_UNMAP_BUFFER_GTT(0);
-  OCL_UNMAP_BUFFER_GTT(1);
 
+  free(dst0);
+  free(dst1);
   OCL_CALL(clReleaseSampler, sampler);
 }
 
diff --git a/utests/compiler_assignment_operation_in_if.cpp b/utests/compiler_assignment_operation_in_if.cpp
new file mode 100644
index 0000000..676c222
--- /dev/null
+++ b/utests/compiler_assignment_operation_in_if.cpp
@@ -0,0 +1,45 @@
+#include "utest_helper.hpp"
+
+typedef struct cpu_int3{
+	int x;
+	int y;
+	int z;
+}cpu_int3;
+
+static void cpu(int gidx, int *dst) {
+  cpu_int3 d1 = {gidx, gidx-1, gidx-3};
+  int k = gidx % 5;
+  if (k == 1){
+    d1.x = d1.y;
+  }
+  int * addr = dst + gidx;
+  *addr = d1.x;
+}
+
+void compiler_assignment_operation_in_if(void){
+  const size_t n = 16;
+  int cpu_dst[16];
+	
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_assignment_operation_in_if");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(int), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run the kernel on GPU
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu(i, cpu_dst);
+
+  // Compare
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    OCL_ASSERT(((int *)buf_data[0])[i] == cpu_dst[i]);
+  OCL_UNMAP_BUFFER(0);
+
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_assignment_operation_in_if)
diff --git a/utests/compiler_box_blur.cpp b/utests/compiler_box_blur.cpp
index e4e053e..84445cf 100644
--- a/utests/compiler_box_blur.cpp
+++ b/utests/compiler_box_blur.cpp
@@ -12,7 +12,7 @@ static void compiler_box_blur()
   OCL_CREATE_KERNEL("compiler_box_blur");
 
   /* Load the picture */
-  src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+  src = cl_read_bmp("sample.bmp", &w, &h);
   sz = w * h * sizeof(int);
 
   /* Run the kernel */
diff --git a/utests/compiler_box_blur_float.cpp b/utests/compiler_box_blur_float.cpp
index a3c97bc..8a75a25 100644
--- a/utests/compiler_box_blur_float.cpp
+++ b/utests/compiler_box_blur_float.cpp
@@ -13,7 +13,7 @@ static void compiler_box_blur_float()
   OCL_CREATE_KERNEL("compiler_box_blur_float");
 
   /* Load the picture */
-  tmp = cl_read_bmp("lenna128x128.bmp", &w, &h);
+  tmp = cl_read_bmp("sample.bmp", &w, &h);
   sz = w * h * sizeof(float[4]);
   src = (float4*)malloc(sz);
 
diff --git a/utests/compiler_box_blur_image.cpp b/utests/compiler_box_blur_image.cpp
index d94a97c..49423d6 100644
--- a/utests/compiler_box_blur_image.cpp
+++ b/utests/compiler_box_blur_image.cpp
@@ -12,10 +12,10 @@ static void compiler_box_blur_image()
   OCL_CREATE_KERNEL("compiler_box_blur_image");
 
   /* Load the picture */
-  src = cl_read_bmp("lenna128x128.bmp", &w, &h);
+  src = cl_read_bmp("sample.bmp", &w, &h);
 
   format.image_channel_order = CL_RGBA;
-  format.image_channel_data_type = CL_UNORM_INT8;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
   desc.image_type = CL_MEM_OBJECT_IMAGE2D;
   desc.image_width = w;
   desc.image_height = h;
diff --git a/utests/compiler_bswap.cpp b/utests/compiler_bswap.cpp
new file mode 100644
index 0000000..b5986b9
--- /dev/null
+++ b/utests/compiler_bswap.cpp
@@ -0,0 +1,109 @@
+#include "utest_helper.hpp"
+#include "string.h"
+
+namespace {
+#define cpu_htons(A)     ((((uint16_t)(A) & 0xff00) >> 8) | \
+    (((uint16_t)(A) & 0x00ff) << 8))
+#define cpu_htonl(A)     ((((uint32_t)(A) & 0xff000000) >> 24) | \
+    (((uint32_t)(A) & 0x00ff0000) >> 8) | \
+    (((uint32_t)(A) & 0x0000ff00) << 8) | \
+    (((uint32_t)(A) & 0x000000ff) << 24))
+
+template <typename T> static void cpu(int global_id, T *src, T *dst)
+{
+    T f = src[global_id];
+    T g = 0;
+    if(sizeof(T) == sizeof(int16_t))
+      g = cpu_htons(f);
+    else if(sizeof(T) == sizeof(int32_t))
+      g = cpu_htonl(f);
+    dst[global_id] = g;
+}
+
+template <typename T> static void gen_rand_val (T & val)
+{
+    val = static_cast<T>(rand() );
+}
+
+template <typename T>
+inline static void print_data (T& val)
+{
+    if(sizeof(T) == sizeof(uint16_t))
+        printf(" %hx", val);
+    else
+        printf(" %x", val);
+}
+
+template <typename T> static void dump_data (T* src, T* dst, int n)
+{
+    printf("\nRaw: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[0])[i]);
+    }
+
+    printf("\nCPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(dst[i]);
+    }
+    printf("\nGPU: \n");
+    for (int32_t i = 0; i < (int32_t) n; ++i) {
+        print_data(((T *)buf_data[1])[i]);
+    }
+}
+
+template<typename T>
+void test(const char *kernel_name)
+{
+  const size_t n = 64;
+  T cpu_dst[n];
+  T cpu_src[n];
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_bswap", kernel_name);
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+
+  OCL_MAP_BUFFER(0);
+  for (int32_t i = 0; i < (int32_t) n; ++i) {
+    gen_rand_val(cpu_src[i]);
+  }
+
+  memcpy(buf_data[0], cpu_src, sizeof(T) * n);
+
+  /* Clear the dst buffer to avoid random data. */
+  OCL_MAP_BUFFER(1);
+  memset(buf_data[1], 0, sizeof(T) * n);
+  OCL_UNMAP_BUFFER(1);
+
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+
+  // Run on CPU
+  for (int32_t i = 0; i < (int32_t) n; ++i)
+    cpu(i, cpu_src, cpu_dst);
+
+  OCL_MAP_BUFFER(1);
+ // dump_data(cpu_src, cpu_dst, n);
+
+  OCL_ASSERT(!memcmp(buf_data[1], cpu_dst, sizeof(T) * n));
+
+  OCL_UNMAP_BUFFER(1);
+  OCL_UNMAP_BUFFER(0);
+}
+
+}
+
+#define compiler_bswap(type, kernel) \
+static void compiler_bswap_ ##type(void)\
+{\
+  test<type>(# kernel);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_bswap_ ## type);
+
+compiler_bswap(int16_t, compiler_bswap_short)
+compiler_bswap(uint16_t, compiler_bswap_ushort)
+compiler_bswap(int32_t, compiler_bswap_int)
+compiler_bswap(uint32_t, compiler_bswap_uint)
diff --git a/utests/compiler_fill_image_1d_array.cpp b/utests/compiler_fill_image_1d_array.cpp
new file mode 100644
index 0000000..cc7cf0a
--- /dev/null
+++ b/utests/compiler_fill_image_1d_array.cpp
@@ -0,0 +1,73 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_1d_array(void)
+{
+  const size_t w = 64;
+  const size_t array = 8;
+  cl_image_format format;
+  cl_image_desc desc;
+  size_t origin[3] = { };
+  size_t region[3];
+  uint32_t* dst;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE1D_ARRAY;
+  desc.image_width = w;
+  desc.image_row_pitch = 0;//w * sizeof(uint32_t);
+  desc.image_array_size = array;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image_1d_array");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  OCL_MAP_BUFFER_GTT(0);
+  memset(buf_data[0], 0, sizeof(uint32_t) * w * array);
+  OCL_UNMAP_BUFFER_GTT(0);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = w/2;
+  locals[0] = 16;
+  globals[1] = 8;
+  locals[1] = 8;
+  OCL_NDRANGE(2);
+
+  // Check result
+  region[0] = w;
+  region[1] = array;
+  region[2] = 1;
+  dst = (uint32_t*)malloc(w*array*sizeof(uint32_t));
+  OCL_READ_IMAGE(buf[0], origin, region, dst);
+
+#if 0
+  printf("------ The image result is: -------\n");
+  for (uint32_t j = 0; j < array; j++) {
+    for (uint32_t i = 0; i < w; i++) {
+      printf(" %2x", dst[j*w + i]);
+    }
+    printf("\n");
+  }
+#endif
+
+  for (uint32_t j = 0; j < array - 1; j++) {
+    for (uint32_t i = 0; i < w/2; i++) {
+      OCL_ASSERT(dst[j*w + i] == 0x03020100);
+    }
+    for (uint32_t i = w/2; i < w; i++) {
+      OCL_ASSERT(dst[j*w + i] == 0);
+    }
+  }
+
+  for (uint32_t i = 0; i < w; i++) {
+    OCL_ASSERT(dst[(array - 1)*w + i] == 0x0);
+  }
+  free(dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_1d_array);
diff --git a/utests/compiler_fill_image_2d_array.cpp b/utests/compiler_fill_image_2d_array.cpp
new file mode 100644
index 0000000..649b416
--- /dev/null
+++ b/utests/compiler_fill_image_2d_array.cpp
@@ -0,0 +1,84 @@
+#include <string.h>
+#include "utest_helper.hpp"
+
+static void compiler_fill_image_2d_array(void)
+{
+  const size_t w = 64;
+  const size_t h = 16;
+  const size_t array = 8;
+  cl_image_format format;
+  cl_image_desc desc;
+  size_t origin[3] = { };
+  size_t region[3];
+  uint32_t* dst;
+
+  memset(&desc, 0x0, sizeof(cl_image_desc));
+  memset(&format, 0x0, sizeof(cl_image_format));
+
+  format.image_channel_order = CL_RGBA;
+  format.image_channel_data_type = CL_UNSIGNED_INT8;
+  desc.image_type = CL_MEM_OBJECT_IMAGE2D_ARRAY;
+  desc.image_width = w;
+  desc.image_height = h;
+  desc.image_row_pitch = 0;//w * sizeof(uint32_t);
+  desc.image_array_size = array;
+
+  // Setup kernel and images
+  OCL_CREATE_KERNEL("test_fill_image_2d_array");
+
+  OCL_CREATE_IMAGE(buf[0], 0, &format, &desc, NULL);
+
+  OCL_MAP_BUFFER_GTT(0);
+  memset(buf_data[0], 0, sizeof(uint32_t) * w * h * array);
+  OCL_UNMAP_BUFFER_GTT(0);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = w/2;
+  locals[0] = 16;
+  globals[1] = h;
+  locals[1] = 8;
+  globals[2] = array;
+  locals[2] = 8;
+  OCL_NDRANGE(3);
+
+  // Check result
+  region[0] = w;
+  region[1] = h;
+  region[2] = array;
+  dst = (uint32_t*)malloc(w*h*array*sizeof(uint32_t));
+  OCL_READ_IMAGE(buf[0], origin, region, dst);
+
+#if 0
+  printf("------ The image result is: -------\n");
+  for (uint32_t k = 0; k < array; k++) {
+    for (uint32_t j = 0; j < h; j++) {
+      for (uint32_t i = 0; i < w; i++) {
+        printf(" %2x", dst[k*h*w + j*w + i]);
+      }
+      printf("\n");
+    }
+    printf("\n");
+  }
+#endif
+
+  for (uint32_t k = 0; k < array - 1; k++) {
+    for (uint32_t j = 0; j < h; j++) {
+      for (uint32_t i = 0; i < w/2; i++) {
+        OCL_ASSERT(dst[k*w*h + j*w + i] == 0x03020100);
+      }
+      for (uint32_t i = w/2; i < w; i++) {
+        OCL_ASSERT(dst[k*w*h + j*w + i] == 0);
+      }
+    }
+  }
+
+  for (uint32_t j = 0; j < h; j++) {
+    for (uint32_t i = 0; i < w; i++) {
+      OCL_ASSERT(dst[(array - 1)*w*h + j*w + i] == 0x0);
+    }
+  }
+  free(dst);
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_fill_image_2d_array);
diff --git a/utests/compiler_local_memory_barrier.cpp b/utests/compiler_local_memory_barrier.cpp
index 6c9c98e..f5cf3d4 100644
--- a/utests/compiler_local_memory_barrier.cpp
+++ b/utests/compiler_local_memory_barrier.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/compiler_local_memory_barrier_wg64.cpp b/utests/compiler_local_memory_barrier_wg64.cpp
index 0cb69f5..756eef0 100644
--- a/utests/compiler_local_memory_barrier_wg64.cpp
+++ b/utests/compiler_local_memory_barrier_wg64.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/compiler_local_memory_two_ptr.cpp b/utests/compiler_local_memory_two_ptr.cpp
index fde5533..398dcce 100644
--- a/utests/compiler_local_memory_two_ptr.cpp
+++ b/utests/compiler_local_memory_two_ptr.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/compiler_mandelbrot.cpp b/utests/compiler_mandelbrot.cpp
index 7758dae..78c6ea3 100644
--- a/utests/compiler_mandelbrot.cpp
+++ b/utests/compiler_mandelbrot.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/compiler_mandelbrot_alternate.cpp b/utests/compiler_mandelbrot_alternate.cpp
index 2e5d59f..f8ebd0b 100644
--- a/utests/compiler_mandelbrot_alternate.cpp
+++ b/utests/compiler_mandelbrot_alternate.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/compiler_overflow.cpp b/utests/compiler_overflow.cpp
new file mode 100644
index 0000000..1404cfe
--- /dev/null
+++ b/utests/compiler_overflow.cpp
@@ -0,0 +1,129 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+typedef struct { // host-side 4-component vector of unsigned long (fields x, y, z, w)
+  unsigned long x;
+  unsigned long y;
+  unsigned long z;
+  unsigned long w;
+}ulong4; // NOTE(review): presumably mirrors the OpenCL ulong4 layout; unsigned long is 64-bit only on LP64 hosts - confirm
+
+typedef struct { // host-side 4-component vector of 32-bit unsigned integers
+  uint32_t x;
+  uint32_t y;
+  uint32_t z;
+  uint32_t w;
+} uint4;
+
+typedef struct { // host-side 4-component vector of 16-bit unsigned integers
+  uint16_t x;
+  uint16_t y;
+  uint16_t z;
+  uint16_t w;
+} ushort4;
+
+typedef struct { // host-side 4-component vector of 8-bit unsigned integers
+  uint8_t x;
+  uint8_t y;
+  uint8_t z;
+  uint8_t w;
+} uchar4;
+
+template <typename U>
+U get_max()
+{
+  // Returns the all-ones value of unsigned type U; U(1) instead of int 1 avoids undefined shifts by >= 31 bits.
+  U u_max = 0;
+  for (size_t i = 0; i < sizeof(U)*8; i++)
+    u_max |= U(1) << i;
+  return u_max;
+}
+
+template<typename T, typename U> // T: 4-component host struct (x, y, z, w), U: type passed to get_max for the reference value
+void test(const char *kernel_name, int func_type) // func_type: 0 = add expectations, 1 = sub expectations, anything else asserts
+{
+  const size_t n = 16;
+
+  // Setup kernel and buffers: buf[0] and buf[1] are filled below, buf[2] is checked after the run
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_overflow", kernel_name);
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[2], 0, n * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(cl_mem), &buf[2]);
+
+  U max = get_max<U>(); // reference value stored in / compared against the x, y, z fields
+
+  OCL_MAP_BUFFER(0);
+  for (uint32_t i = 0; i < n; ++i) {
+    if(func_type == 0) { // add case: x, y, z start at max, w counts up from 0
+      ((T*)buf_data[0])[i].x = max;
+      ((T*)buf_data[0])[i].y = max;
+      ((T*)buf_data[0])[i].z = max;
+      ((T*)buf_data[0])[i].w = i;
+    }else if(func_type == 1) { // sub case: x, y, z start at 0, w counts down from n+2
+      ((T*)buf_data[0])[i].x = 0;
+      ((T*)buf_data[0])[i].y = 0;
+      ((T*)buf_data[0])[i].z = 0;
+      ((T*)buf_data[0])[i].w = n+2-i;
+    }else
+      OCL_ASSERT(0);
+  }
+  OCL_UNMAP_BUFFER(0);
+  OCL_MAP_BUFFER(1);
+  for (uint32_t i = 0; i < n; ++i) { // buf[1]: every field of every element is 1
+      ((T*)buf_data[1])[i].x = 1;
+      ((T*)buf_data[1])[i].y = 1;
+      ((T*)buf_data[1])[i].z = 1;
+      ((T*)buf_data[1])[i].w = 1;
+  }
+  OCL_UNMAP_BUFFER(1);
+
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(2);
+  for (uint32_t i = 0; i < 16; ++i) { // 16 equals n: every element of buf[2] is checked
+   // printf("%u,%u,%u,%u\n", ((T*)buf_data[2])[i].x,((T*)buf_data[2])[i].y, ((T*)buf_data[2])[i].z, ((T*)buf_data[2])[i].w  );
+    if(func_type == 0) { // add case expects x == 0, y == 1, z == 1, w == i+2 (kernel source is not visible here)
+      OCL_ASSERT(((T*)buf_data[2])[i].x == 0);
+      OCL_ASSERT(((T*)buf_data[2])[i].y == 1);
+      OCL_ASSERT(((T*)buf_data[2])[i].z == 1);
+      OCL_ASSERT(((T*)buf_data[2])[i].w == i+2);
+    }else if(func_type == 1) { // sub case expects x == max, y == max-1, z == max-1, w == n-i
+      OCL_ASSERT(((T*)buf_data[2])[i].x == max);
+      OCL_ASSERT(((T*)buf_data[2])[i].y == max-1);
+      OCL_ASSERT(((T*)buf_data[2])[i].z == max-1);
+      OCL_ASSERT(((T*)buf_data[2])[i].w == n-i);
+    }else
+      OCL_ASSERT(0);
+  }
+  OCL_UNMAP_BUFFER(2);
+}
+
+}
+
+#define compiler_overflow_add(type, subtype, kernel, func_type) \
+static void compiler_overflow_add_ ##type(void)\
+{\
+  test<type, subtype>(# kernel, func_type);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_overflow_add_ ## type);
+
+#define compiler_overflow_sub(type, subtype, kernel, func_type) \
+static void compiler_overflow_sub_ ##type(void)\
+{\
+  test<type, subtype>(# kernel, func_type);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_overflow_sub_ ## type);
+
+compiler_overflow_add(ulong4, unsigned long, compiler_overflow_ulong4_add, 0) // defines compiler_overflow_add_ulong4() calling test<ulong4, unsigned long>("compiler_overflow_ulong4_add", 0) and registers it
+compiler_overflow_add(uint4, uint32_t, compiler_overflow_uint4_add, 0) // func_type 0 selects the add expectations in test()
+compiler_overflow_add(ushort4, uint16_t, compiler_overflow_ushort4_add, 0)
+compiler_overflow_add(uchar4, uint8_t, compiler_overflow_uchar4_add, 0)
+
+// As the LLVM intrinsic functions don't support byte/short overflow,
+// we only test uint overflow here.
+compiler_overflow_sub(uint4, uint32_t, compiler_overflow_uint4_sub, 1) // func_type 1 selects the sub expectations in test()
diff --git a/utests/compiler_popcount.cpp b/utests/compiler_popcount.cpp
new file mode 100644
index 0000000..c960ae6
--- /dev/null
+++ b/utests/compiler_popcount.cpp
@@ -0,0 +1,75 @@
+#include "utest_helper.hpp"
+
+namespace {
+
+template<typename T>
+T get_max(); // primary template is only declared; DEF_TEMPLATE below provides the explicit specializations
+
+#define DEF_TEMPLATE(TYPE, NAME)                                    \
+template <>                                                         \
+TYPE get_max<TYPE>()                                                \
+{                                                                   \
+  static TYPE max = CL_##NAME##_MAX;                                \
+  return max;                                                       \
+}                                                                   \
+                                                                    \
+template <>                                                         \
+u##TYPE get_max<u##TYPE>()                                          \
+{                                                                   \
+  static u##TYPE max = CL_U##NAME##_MAX;                            \
+  return max;                                                       \
+}
+
+DEF_TEMPLATE(int8_t, CHAR) // get_max<int8_t>() returns CL_CHAR_MAX, get_max<uint8_t>() returns CL_UCHAR_MAX
+DEF_TEMPLATE(int16_t, SHRT) // get_max<int16_t>() returns CL_SHRT_MAX, get_max<uint16_t>() returns CL_USHRT_MAX
+DEF_TEMPLATE(int32_t, INT) // get_max<int32_t>() returns CL_INT_MAX, get_max<uint32_t>() returns CL_UINT_MAX
+DEF_TEMPLATE(int64_t, LONG) // get_max<int64_t>() returns CL_LONG_MAX, get_max<uint64_t>() returns CL_ULONG_MAX
+
+template<typename T>
+void test(const char *kernel_name, int s_type) // s_type: 1 for the signed instantiations, 0 for the unsigned ones (see the compiler_popcount uses)
+{
+  const int n = sizeof(T) * 8; // number of bits in T; also used as element count and work size
+
+  // Setup kernel and buffers: buf[0] is filled below, buf[1] is checked after the run
+  OCL_CREATE_KERNEL_FROM_FILE("compiler_popcount", kernel_name);
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(T), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(T), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = n;
+  locals[0] = n;
+
+  OCL_MAP_BUFFER(0);
+  ((T*)buf_data[0])[0] = 0; // element 0 is the all-zero input
+  for (int32_t i = 1; i < (int32_t) n; ++i){
+    ((T*)buf_data[0])[i] = get_max<T>() >> i; // maximum value of T shifted right by i
+  }
+  OCL_UNMAP_BUFFER(0);
+
+  OCL_NDRANGE(1);
+
+  OCL_MAP_BUFFER(1);
+  OCL_ASSERT(((T*)buf_data[1])[0] == 0);
+  for (int i = 1; i < n; ++i){
+    OCL_ASSERT(((T*)buf_data[1])[i] == n-i-s_type); // expected bit count of max >> i: n-i, minus one when s_type is 1
+  }
+  OCL_UNMAP_BUFFER(1);
+}
+
+}
+
+#define compiler_popcount(type, kernel, s_type) \
+static void compiler_popcount_ ##type(void)\
+{\
+  test<type>(# kernel, s_type);\
+}\
+MAKE_UTEST_FROM_FUNCTION(compiler_popcount_ ## type);
+
+compiler_popcount(int8_t, test_char, 1) // defines compiler_popcount_int8_t() calling test<int8_t>("test_char", 1) and registers it
+compiler_popcount(uint8_t, test_uchar, 0) // s_type is 1 for the signed types and 0 for the unsigned types
+compiler_popcount(int16_t, test_short, 1)
+compiler_popcount(uint16_t, test_ushort, 0)
+compiler_popcount(int32_t, test_int, 1)
+compiler_popcount(uint32_t, test_uint, 0)
+compiler_popcount(int64_t, test_long, 1)
+compiler_popcount(uint64_t, test_ulong, 0)
diff --git a/utests/compiler_shader_toy.cpp b/utests/compiler_shader_toy.cpp
deleted file mode 100644
index 58bcc6f..0000000
--- a/utests/compiler_shader_toy.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/* 
- * Copyright © 2012 Intel Corporation
- *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library. If not, see <http://www.gnu.org/licenses/>.
- *
- * Author: Benjamin Segovia <benjamin.segovia at intel.com>
- */
-
-/* This is a super simple wrapper for the OpenCL kernels I ported from GLSL code
- * taken in Inigo's web site:
- * http://www.iquilezles.org/apps/shadertoy/index.html
- *
- * They are pretty cool and rather complex kernels. Just the right thing to have
- * something a bit more complicated and interesting than unit tests.
- *
- * The code here is just to wrap the common code used by all the kernels (to run
- * the code and assert its correctness)
- */
-#include "utest_helper.hpp"
-
-static const int dim = 256;
-
-// tricky here 'name' stands for Kernel and Reference
-// 'file' stands for .cl file name and dst image name
-static void run_kernel(int w, int h, const char *file, const char *name)
-{
-  const size_t global[2] = {size_t(w), size_t(h)};
-  const size_t local[2] = {16, 1};
-  const size_t sz = w * h * sizeof(char[4]);
-  const float fx = float(w);
-  const float fy = float(h);
-  char kernel_file[256];
-  char dst_img[256];
-  char ref_img[256];
-
-  snprintf(kernel_file, sizeof(kernel_file), "%s.cl", file);
-  snprintf(dst_img, sizeof(dst_img), "%s.bmp", file);
-  snprintf(ref_img, sizeof(ref_img), "%s_ref.bmp", name);
-  OCL_CALL (cl_kernel_init, kernel_file, name, SOURCE, NULL);
-
-  OCL_CREATE_BUFFER(buf[0], 0, sz, NULL);
-  OCL_CALL (clSetKernelArg, kernel, 0, sizeof(cl_mem), &buf[0]);
-  OCL_CALL (clSetKernelArg, kernel, 1, sizeof(float), &fx);
-  OCL_CALL (clSetKernelArg, kernel, 2, sizeof(float), &fy);
-  OCL_CALL (clSetKernelArg, kernel, 3, sizeof(int), &w);
-  OCL_CALL (clEnqueueNDRangeKernel, queue, kernel, 2, NULL, global, local, 0, NULL, NULL);
-  OCL_MAP_BUFFER(0);
-  int *dst = (int*) buf_data[0];
-
-  /* Save the image (for debug purpose) */
-  cl_write_bmp(dst, w, h, dst_img);
-
-  /* Compare with the golden image */
-  OCL_CHECK_IMAGE(dst, w, h, ref_img);
-}
-
-#define DECL_SHADER_TOY_TEST(W,H,FILE_NAME, KERNEL_NAME) \
-  static void FILE_NAME(void) { run_kernel(W,H,#FILE_NAME, #KERNEL_NAME); } \
-  MAKE_UTEST_FROM_FUNCTION(FILE_NAME);
-
-DECL_SHADER_TOY_TEST(dim,dim,compiler_clod,compiler_clod);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_ribbon,compiler_ribbon);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_nautilus,compiler_nautilus);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge_no_shadow,compiler_menger_sponge_no_shadow);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_julia,compiler_julia);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_no_break,compiler_julia_no_break);
-// test for function calls
-DECL_SHADER_TOY_TEST(dim,dim,compiler_clod_function_call,compiler_clod);
-DECL_SHADER_TOY_TEST(dim,dim,compiler_julia_function_call,compiler_julia);
-
-// Still issues here for LLVM 3.2
-// DECL_SHADER_TOY_TEST(dim,dim,compiler_chocolux,compiler_chocolux);
-// DECL_SHADER_TOY_TEST(dim,dim,compiler_menger_sponge,compiler_menger_sponge);
-
-#undef DECL_SHADER_TOY_TEST
-
diff --git a/utests/compiler_time_stamp.cpp b/utests/compiler_time_stamp.cpp
new file mode 100644
index 0000000..4da5752
--- /dev/null
+++ b/utests/compiler_time_stamp.cpp
@@ -0,0 +1,52 @@
+#include "utest_helper.hpp"
+
+static void cpu(int global_id, int *src, int *dst) {
+  // CPU reference: pass p keeps slot p of a scratch array (1+src[k] for k < src[0], else global_id).
+  int kept[16];
+  for (int pass = 0; pass < 16; ++pass) {
+    int scratch[16];
+    for (int k = 0; k < 16; ++k)
+      scratch[k] = global_id;
+    for (int k = 0; k < src[0]; ++k)
+      scratch[k] = 1+src[k];
+    kept[pass] = scratch[pass];
+  }
+  dst[global_id] = kept[global_id];
+}
+
+void compiler_time_stamp(void) // runs the "compiler_time_stamp" kernel and compares buf[1] with the cpu() reference
+{
+  const size_t n = 16;
+  int cpu_dst[16], cpu_src[16]; // CPU-side copy of the input and the reference output
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("compiler_time_stamp");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  globals[0] = 16;
+  locals[0] = 16;
+
+  // Run random tests (the loop currently executes a single pass)
+  for (uint32_t pass = 0; pass < 1; ++pass) {
+    OCL_MAP_BUFFER(0);
+    for (int32_t i = 0; i < (int32_t) n; ++i)
+      cpu_src[i] = ((int32_t*)buf_data[0])[i] = rand() % 16; // same random value (0..15) goes to buf[0] and to the CPU copy
+    OCL_UNMAP_BUFFER(0);
+
+    // Run the kernel on GPU
+    OCL_NDRANGE(1);
+
+    // Run on CPU
+    for (int32_t i = 0; i <(int32_t) n; ++i) cpu(i, cpu_src, cpu_dst);
+
+    // Compare
+    OCL_MAP_BUFFER(1);
+    for (int32_t i = 0; i < 11; ++i) // NOTE(review): only the first 11 of the 16 results are compared - confirm this is intended
+      OCL_ASSERT(((int32_t*)buf_data[1])[i] == cpu_dst[i]);
+    OCL_UNMAP_BUFFER(1);
+  }
+}
+
+MAKE_UTEST_FROM_FUNCTION(compiler_time_stamp);
diff --git a/utests/compiler_write_only.cpp b/utests/compiler_write_only.cpp
index 3935535..4ae01ad 100644
--- a/utests/compiler_write_only.cpp
+++ b/utests/compiler_write_only.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/runtime_flat_address_space.cpp b/utests/runtime_flat_address_space.cpp
index 08167c4..9b8bece 100644
--- a/utests/runtime_flat_address_space.cpp
+++ b/utests/runtime_flat_address_space.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/runtime_set_kernel_arg.cpp b/utests/runtime_set_kernel_arg.cpp
new file mode 100644
index 0000000..d58c77e
--- /dev/null
+++ b/utests/runtime_set_kernel_arg.cpp
@@ -0,0 +1,30 @@
+#include "utest_helper.hpp"
+
+void runtime_set_kernel_arg(void) // passes a cl_float3 by value as kernel argument 1 and checks the contents of buf[0] after the run
+{
+  const size_t n = 16;
+
+  cl_float3 src;
+  src.s[0] = 1; src.s[1] =2; src.s[2] = 3; // only components 0..2 are set and later compared
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("set_kernel_arg");
+  OCL_CREATE_BUFFER(buf[0], 0, n * sizeof(uint32_t), NULL);
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_float3), &src);
+
+    // Run the kernel
+  globals[0] = n;
+  locals[0] = 16;
+  OCL_NDRANGE(1);
+  OCL_MAP_BUFFER(0);
+
+  // Check results
+  for (uint32_t i = 0; i < n; ++i) {
+//    printf("%d %d\n",i, ((uint32_t*)buf_data[0])[i]);
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == src.s[i%3]); // element i is expected to equal component i%3 of src
+  }
+  OCL_UNMAP_BUFFER(0);
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_set_kernel_arg);
diff --git a/utests/runtime_use_host_ptr_buffer.cpp b/utests/runtime_use_host_ptr_buffer.cpp
new file mode 100644
index 0000000..4603f90
--- /dev/null
+++ b/utests/runtime_use_host_ptr_buffer.cpp
@@ -0,0 +1,39 @@
+#include "utest_helper.hpp"
+
+static void runtime_use_host_ptr_buffer(void) // exercises a CL_MEM_USE_HOST_PTR buffer backed by posix_memalign'ed host memory
+{
+  const size_t n = 4096*100;
+
+  // Setup kernel and buffers
+  OCL_CREATE_KERNEL("runtime_use_host_ptr_buffer");
+
+  int ret = posix_memalign(&buf_data[0], 4096, sizeof(uint32_t) * n); // 4096-byte aligned host allocation for n uint32_t values
+  OCL_ASSERT(ret == 0);
+
+  for (uint32_t i = 0; i < n; ++i) ((uint32_t*)buf_data[0])[i] = i; // initial contents: element i holds i
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_USE_HOST_PTR, n * sizeof(uint32_t), buf_data[0]);
+
+  // Run the kernel
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  globals[0] = n;
+  locals[0] = 256;
+  OCL_NDRANGE(1);
+
+  // Check result
+
+#ifdef HAS_USERPTR
+  OCL_FINISH(); // presumably waits for the queue to drain before the host memory is read directly below - confirm
+#else
+  void* mapptr = (int*)clEnqueueMapBuffer(queue, buf[0], CL_TRUE, CL_MAP_READ, 0, n*sizeof(uint32_t), 0, NULL, NULL, NULL); // blocking map for reading
+  OCL_ASSERT(mapptr == buf_data[0]); // the mapping must hand back the original host pointer
+  clEnqueueUnmapMemObject(queue, buf[0], mapptr, 0, NULL, NULL);
+#endif
+
+  for (uint32_t i = 0; i < n; ++i)
+    OCL_ASSERT(((uint32_t*)buf_data[0])[i] == i / 2); // host memory is expected to hold i / 2 in element i after the run
+
+  free(buf_data[0]); // release the posix_memalign allocation
+  buf_data[0] = NULL;
+}
+
+MAKE_UTEST_FROM_FUNCTION(runtime_use_host_ptr_buffer);
diff --git a/utests/setenv.sh.in b/utests/setenv.sh.in
index b0f575f..ac06b10 100644
--- a/utests/setenv.sh.in
+++ b/utests/setenv.sh.in
@@ -1,7 +1,8 @@
 #!/bin/sh
 #
-export OCL_PCM_PATH=@LOCAL_PCM_OBJECT_DIR@
-export OCL_PCH_PATH=@LOCAL_PCH_OBJECT_DIR@
+export OCL_BITCODE_LIB_PATH=@LOCAL_OCL_BITCODE_BIN@
+export OCL_HEADER_FILE_DIR=@LOCAL_OCL_HEADER_DIR@
+export OCL_PCH_PATH=@LOCAL_OCL_PCH_OBJECT@
 export OCL_KERNEL_PATH=@CMAKE_CURRENT_SOURCE_DIR@/../kernels
 export OCL_GBE_PATH=@LOCAL_GBE_OBJECT_DIR@
 export OCL_INTERP_PATH=@LOCAL_INTERP_OBJECT_DIR@
diff --git a/utests/utest.cpp b/utests/utest.cpp
index b491cae..b914891 100644
--- a/utests/utest.cpp
+++ b/utests/utest.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -106,8 +106,8 @@ void catch_signal(void){
   }
 }
 
-UTest::UTest(Function fn, const char *name, bool haveIssue, bool needDestroyProgram)
-       : fn(fn), name(name), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
+UTest::UTest(Function fn, const char *name, bool isBenchMark, bool haveIssue, bool needDestroyProgram)
+       : fn(fn), name(name), isBenchMark(isBenchMark), haveIssue(haveIssue), needDestroyProgram(needDestroyProgram) {
 
   if (utestList == NULL) {
     utestList = new vector<UTest>;
@@ -165,7 +165,19 @@ void UTest::runAllNoIssue(void) {
 
   for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
     const UTest &utest = (*utestList)[retStatistics.finishrun];
-    if (utest.fn == NULL || utest.haveIssue) continue;
+    if (utest.fn == NULL || utest.haveIssue || utest.isBenchMark) continue;
+    do_run(utest);
+    cl_kernel_destroy(utest.needDestroyProgram);
+    cl_buffer_destroy();
+  }
+}
+
+void UTest::runAllBenchMark(void) {
+  if (utestList == NULL) return;
+
+  for (; retStatistics.finishrun < utestList->size(); ++retStatistics.finishrun) {
+    const UTest &utest = (*utestList)[retStatistics.finishrun];
+    if (utest.fn == NULL || utest.haveIssue || !utest.isBenchMark) continue;
     do_run(utest);
     cl_kernel_destroy(utest.needDestroyProgram);
     cl_buffer_destroy();
diff --git a/utests/utest.hpp b/utests/utest.hpp
index 375ef70..0dc611d 100644
--- a/utests/utest.hpp
+++ b/utests/utest.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -47,11 +47,13 @@ struct UTest
   /*! Empty test */
   UTest(void);
   /*! Build a new unit test and append it to the unit test list */
-  UTest(Function fn, const char *name, bool haveIssue = false, bool needDestroyProgram = true);
+  UTest(Function fn, const char *name, bool isBenchMark = false, bool haveIssue = false, bool needDestroyProgram = true);
   /*! Function to execute */
   Function fn;
   /*! Name of the test */
   const char *name;
+  /*! Whether this test is a benchmark. */
+  bool isBenchMark;
   /*! Indicate whether current test cases has issue to be fixes */
   bool haveIssue;
   /*! Indicate whether destroy kernels/program. */
@@ -62,6 +64,8 @@ struct UTest
   static void run(const char *name);
   /*! Run all the tests without known issue*/
   static void runAllNoIssue(void);
+  /*! Run all the benchmarks. */
+  static void runAllBenchMark(void);
   /*! Run all the tests */
   static void runAll(void);
   /*! List all test cases */
@@ -77,7 +81,7 @@ struct UTest
 
 #define MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM) \
   static void __ANON__##FN##__(void) { UTEST_EXPECT_SUCCESS(FN()); } \
-  static const UTest __##FN##__(__ANON__##FN##__, #FN, false, !(KEEP_PROGRAM));
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, false, false, !(KEEP_PROGRAM));
 
 
 /*! Turn a function into a unit test */
@@ -91,9 +95,14 @@ struct UTest
   static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
 
 /*! Turn a function into a unit performance test */
+#define MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(FN, KEEP_PROGRAM) \
+  static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, true, false, !(KEEP_PROGRAM));
+
 #define MAKE_BENCHMARK_FROM_FUNCTION(FN) \
   static void __ANON__##FN##__(void) { BENCHMARK(FN()); } \
-  static const UTest __##FN##__(__ANON__##FN##__, #FN);
+  static const UTest __##FN##__(__ANON__##FN##__, #FN, true);
+
 
 /*! No assert is expected */
 #define UTEST_EXPECT_SUCCESS(EXPR) \
@@ -125,14 +134,16 @@ struct UTest
 
 #define BENCHMARK(EXPR) \
  do { \
-    int ret = 0; \
+    int ret = 0;\
     try { \
       ret = EXPR; \
-      printf("  %s  [SUCCESS] [Result: %d]\n", #EXPR, ret);\
+      std::cout << "    [Result: " << ret << "]    [SUCCESS]" << std::endl; \
+      UTest::retStatistics.passCount += 1; \
     } \
     catch (Exception e) { \
       std::cout << "  " << #EXPR << "    [FAILED]" << std::endl; \
       std::cout << "    " << e.what() << std::endl; \
+      UTest::retStatistics.failCount++; \
     } \
   } while (0)
 #endif /* __UTEST_UTEST_HPP__ */
diff --git a/utests/utest_assert.cpp b/utests/utest_assert.cpp
index f3b9a00..6ccd347 100644
--- a/utests/utest_assert.cpp
+++ b/utests/utest_assert.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_assert.hpp b/utests/utest_assert.hpp
index f93f9ac..1d44588 100644
--- a/utests/utest_assert.hpp
+++ b/utests/utest_assert.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_error.c b/utests/utest_error.c
index 4582a33..fecb34b 100644
--- a/utests/utest_error.c
+++ b/utests/utest_error.c
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_error.h b/utests/utest_error.h
index 2da29b0..1651052 100644
--- a/utests/utest_error.h
+++ b/utests/utest_error.h
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_exception.hpp b/utests/utest_exception.hpp
index e19141f..5568785 100644
--- a/utests/utest_exception.hpp
+++ b/utests/utest_exception.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_file_map.cpp b/utests/utest_file_map.cpp
index 55b7771..e2808ca 100644
--- a/utests/utest_file_map.cpp
+++ b/utests/utest_file_map.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_file_map.hpp b/utests/utest_file_map.hpp
index 83d79ea..e5a7aa1 100644
--- a/utests/utest_file_map.hpp
+++ b/utests/utest_file_map.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
diff --git a/utests/utest_generator.py b/utests/utest_generator.py
index 7522001..5da2752 100644
--- a/utests/utest_generator.py
+++ b/utests/utest_generator.py
@@ -135,7 +135,7 @@ which can print more values and information to assist debuging the issue.
 
 #include "utest_helper.hpp"
 #include <stdio.h>
-#include <math.h>
+#include <cmath>
 #include <algorithm>
 #include <string.h>
 
diff --git a/utests/utest_helper.cpp b/utests/utest_helper.cpp
index cb4dd66..df0e508 100644
--- a/utests/utest_helper.cpp
+++ b/utests/utest_helper.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -262,9 +262,10 @@ cl_kernel_init(const char *file_name, const char *kernel_name, int format, const
       goto error;
     }
     prevFileName = file_name;
+
+    /* OCL requires to build the program even if it is created from a binary */
+    OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
   }
-  /* OCL requires to build the program even if it is created from a binary */
-  OCL_CALL (clBuildProgram, program, 1, &device, build_opt, NULL, NULL);
 
   /* Create a kernel from the program */
   if (kernel)
@@ -537,12 +538,19 @@ int *cl_read_bmp(const char *filename, int *width, int *height)
   char magic[2];
   int ret;
   ret = fread(&magic[0], 1, 2, fp);
-  ret = ret;
-  assert(2 == ret);
+  if(2 != ret){
+    fclose(fp);
+    free(bmppath);
+    return NULL;
+  }
   assert(magic[0] == 'B' && magic[1] == 'M');
 
   ret = fread(&hdr, sizeof(hdr), 1, fp);
-  assert(1 == ret);
+  if(1 != ret){
+    fclose(fp);
+    free(bmppath);
+    return NULL;
+  }
 
   assert(hdr.width > 0 && hdr.height > 0 && hdr.nplanes == 1 && hdr.compression == 0);
 
@@ -655,7 +663,7 @@ int cl_check_image(const int *img, int w, int h, const char *bmp)
   return (float(discrepancy) / float(n) > max_error_ratio) ? 0 : 1;
 }
 
-const float cl_FLT_ULP(float float_number)
+float cl_FLT_ULP(float float_number)
 {
   SF floatBin, ulpBin, ulpBinBase;
   floatBin.f = float_number;
@@ -668,7 +676,7 @@ const float cl_FLT_ULP(float float_number)
   return ulpBin.f - ulpBinBase.f;
 }
 
-const int cl_INT_ULP(int int_number)
+int cl_INT_ULP(int int_number)
 {
   return 0;
 }
diff --git a/utests/utest_helper.hpp b/utests/utest_helper.hpp
index de4d277..026eb1c 100644
--- a/utests/utest_helper.hpp
+++ b/utests/utest_helper.hpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -225,10 +225,10 @@ extern void cl_write_bmp(const int *data, int width, int height, const char *fil
 extern int cl_check_image(const int *img, int w, int h, const char *bmp);
 
 /* Calculator ULP of each FLOAT value */
-extern const float cl_FLT_ULP(float float_number);
+extern float cl_FLT_ULP(float float_number);
 
 /* Calculator ULP of each INT value */
-extern const int cl_INT_ULP(int int_number);
+extern int cl_INT_ULP(int int_number);
 
 #endif /* __UTEST_HELPER_HPP__ */
 
diff --git a/utests/utest_math_gen.py b/utests/utest_math_gen.py
index 30a9b24..75926b6 100755
--- a/utests/utest_math_gen.py
+++ b/utests/utest_math_gen.py
@@ -216,17 +216,17 @@ static float atanpi(float x){
   cospi_cpu_func=reduce1+cospi
   cospiUtests = func('cospi','cospi',[cospi_input_type],cospi_output_type,[cospi_input_values],'2 * FLT_ULP',cospi_cpu_func)
   
-#  ##### gentype erf(gentype)
-#  erf_input_values = base_input_values
-#  erf_input_type = ['float','float2','float4','float8','float16']
-#  erf_output_type = ['float','float2','float4','float8','float16']
-#  erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
+  ##### gentype erf(gentype)
+  erf_input_values = base_input_values
+  erf_input_type = ['float','float2','float4','float8','float16']
+  erf_output_type = ['float','float2','float4','float8','float16']
+  erfUtests = func('erf','erf',[erf_input_type],erf_output_type,[erf_input_values],'16 * FLT_ULP')
 
-#  ##### gentype erfc(gentype)
-#  erfc_input_values = base_input_values
-#  erfc_input_type = ['float','float2','float4','float8','float16']
-#  erfc_output_type = ['float','float2','float4','float8','float16']
-#  erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
+  ##### gentype erfc(gentype)
+  erfc_input_values = base_input_values
+  erfc_input_type = ['float','float2','float4','float8','float16']
+  erfc_output_type = ['float','float2','float4','float8','float16']
+  erfcUtests = func('erfc','erfc',[erfc_input_type],erfc_output_type,[erfc_input_values],'16 * FLT_ULP')
   
   ##### gentype exp(gentype x)
   exp_input_values = base_input_values
diff --git a/utests/utest_run.cpp b/utests/utest_run.cpp
index cd4356a..d797c09 100644
--- a/utests/utest_run.cpp
+++ b/utests/utest_run.cpp
@@ -4,7 +4,7 @@
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
@@ -106,6 +106,17 @@ int main(int argc, char *argv[])
 
         break;
 
+      case 'b':
+        try {
+          UTest::runAllBenchMark();
+        }
+        catch (Exception e){
+          std::cout << "  " << e.what() << "    [SUCCESS]" << std::endl;
+        }
+
+        break;
+
+
       case 'h':
       default:
         usage();
diff --git a/utests/vload_bench.cpp b/utests/vload_bench.cpp
new file mode 100644
index 0000000..3765996
--- /dev/null
+++ b/utests/vload_bench.cpp
@@ -0,0 +1,98 @@
+#include "utest_helper.hpp"
+#include <sys/time.h>
+
+#define N_ITERATIONS 10000 /* iteration count used in the bandwidth formula; presumably matches the "vload_bench_10000" kernels -- TODO confirm */
+
+#define T uint8_t /* NOTE(review): object-like macro; every later bare T token (including the template parameter name below) expands to uint8_t */
+template <typename T> // Runs kernel <kernelFunc><N> from vload_bench.cl once; returns bytes per microsecond in bench mode, 0 in check mode.
+static double vload_bench(const char *kernelFunc, uint32_t N, uint32_t offset, bool benchMode)
+{
+  const size_t n = benchMode ? (512 * 1024) : (8 * 1024); // input element count: 512Ki when benchmarking, 8Ki when checking
+  struct timeval start, end;
+
+  // Setup kernel and buffers; the kernel name is kernelFunc with the vector width N appended
+  std::string kernelName = kernelFunc + std::to_string(N);
+  OCL_CALL (cl_kernel_init, "vload_bench.cl", kernelName.c_str(), SOURCE, NULL);
+  //OCL_CREATE_KERNEL("compiler_array");
+  buf_data[0] = (T*) malloc(sizeof(T) * n); // temporary host staging array
+  for (uint32_t i = 0; i < n; ++i) ((T*)buf_data[0])[i] = i; // each element holds its index converted to T; was: rand() & ((1LL << N) - 1);
+  OCL_CREATE_BUFFER(buf[0], CL_MEM_COPY_HOST_PTR, n * sizeof(T), buf_data[0]); // input buffer initialised from the staging array
+  OCL_CREATE_BUFFER(buf[1], 0, n * sizeof(uint32_t), NULL); // output buffer, one uint32_t per input element
+  free(buf_data[0]); // presumably safe because CL_MEM_COPY_HOST_PTR already copied the data -- verify against OCL_CREATE_BUFFER
+  buf_data[0] = NULL;
+
+  // Run the kernel with (input, output, offset) as arguments
+  OCL_SET_ARG(0, sizeof(cl_mem), &buf[0]);
+  OCL_SET_ARG(1, sizeof(cl_mem), &buf[1]);
+  OCL_SET_ARG(2, sizeof(uint32_t), &offset);
+  globals[0] = n / ((N + 1) & ~0x1); // n divided by N rounded up to an even number (3 -> 4)
+  locals[0] = 256;
+  if (benchMode)
+    gettimeofday(&start, NULL); // timing starts just before the enqueue
+  OCL_NDRANGE(1);
+  if (benchMode) {
+    OCL_FINISH();
+    gettimeofday(&end, NULL);
+    double elapsed = (end.tv_sec - start.tv_sec) * 1e6 + (end.tv_usec - start.tv_usec); // microseconds
+    double bandwidth = (globals[0] * (N_ITERATIONS) * sizeof(T) * N) / elapsed; // bytes per microsecond
+    printf("\t%2.1fGB/S\n", bandwidth/1000.);
+    return bandwidth;
+  } else {
+    // Check result: output element i must equal input element i + offset
+    OCL_MAP_BUFFER(0);
+    OCL_MAP_BUFFER(1);
+    for (uint32_t i = 0; i < globals[0]; ++i) {
+      OCL_ASSERT(((T*)buf_data[0])[i + offset] == ((uint32_t*)buf_data[1])[i]);
+    }
+    return 0;
+  }
+}
+
+#define VLOAD_TEST(T, kT) /* Defines and registers vload_test_<kT>: runs vload_bench<T> in check mode for each vector width and every offset below it. */ \
+static void vload_test_ ##kT(void) \
+{ \
+  uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
+  for(uint32_t i = 0; i < sizeof(vectorSize) / sizeof(vectorSize[0]); i++) { /* element count, independent of the element type */ \
+    for(uint32_t offset = 0; offset < vectorSize[i]; offset++) {\
+      (void)vload_bench<T>("vload_bench_1" #kT, vectorSize[i], offset, false); \
+    }\
+  } \
+}\
+MAKE_UTEST_FROM_FUNCTION_KEEP_PROGRAM(vload_test_ ##kT, true)
+
+#ifndef BUILD_BENCHMARK /* correctness tests are compiled only when BUILD_BENCHMARK is not defined */
+VLOAD_TEST(uint8_t, uchar)
+VLOAD_TEST(int8_t, char)
+VLOAD_TEST(uint16_t, ushort)
+VLOAD_TEST(int16_t, short)
+VLOAD_TEST(uint32_t, uint)
+VLOAD_TEST(int32_t, int)
+VLOAD_TEST(float, float)
+#endif /* !BUILD_BENCHMARK */
+
+#define VLOAD_BENCH(T, kT) /* Defines and registers vload_bench_<kT>: times vload_bench<T> for each vector width and offset, returns the averaged bandwidth. */ \
+static int vload_bench_ ##kT(void) \
+{ \
+  uint8_t vectorSize[] = {2, 3, 4, 8, 16}; \
+  double totBandwidth = 0; \
+  unsigned int j = 0;\
+  printf("\n");\
+  for(uint32_t i = 0; i < sizeof(vectorSize) / sizeof(vectorSize[0]); i++, j++) { /* element count, independent of the element type */ \
+    printf("  Vector size %d:\n", vectorSize[i]); \
+    uint32_t k = 0;\
+    double bandwidthForOneSize = 0;\
+    for(uint32_t offset = 0; offset < vectorSize[i]; offset++, k++) {\
+      printf("\tOffset %u :", offset); /* offset is unsigned, so %u rather than %d */ \
+      bandwidthForOneSize += vload_bench<T>("vload_bench_10000"  #kT, vectorSize[i], offset, true); \
+    }\
+    totBandwidth += bandwidthForOneSize / k; /* mean over the offsets of this vector width */ \
+  } \
+  return totBandwidth/j; /* mean over the vector widths, truncated to int by the return type */ \
+}\
+MAKE_BENCHMARK_FROM_FUNCTION_KEEP_PROGRAM(vload_bench_ ##kT, true)
+
+#ifdef BUILD_BENCHMARK /* benchmarks are compiled only when BUILD_BENCHMARK is defined; unsigned element types only */
+VLOAD_BENCH(uint8_t, uchar)
+VLOAD_BENCH(uint16_t, ushort)
+VLOAD_BENCH(uint32_t, uint)
+#endif /* BUILD_BENCHMARK */

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/pkg-opencl/beignet.git



More information about the Pkg-opencl-commits mailing list