[pocl] 01/01: New upstream version 1.0

Wed Jan 17 05:52:50 UTC 2018

This is an automated email from the git hooks/post-receive script.

anbe pushed a commit to branch upstream
in repository pocl.

commit 7ee86405f9cf2059dbf523f3b92f95f44c8c1a82
Author: Andreas Beckmann <anbe at debian.org>
Date:   Tue Dec 19 12:31:30 2017 +0100

    New upstream version 1.0
---
 .bzrignore                                         |    52 -
 .gitattributes                                     |    75 +-
 CHANGES                                            |    41 +-
 CMakeLists.txt                                     |   403 +-
 CREDITS                                            |    10 +-
 README.ARM                                         |    35 +-
 README.packaging                                   |    32 +-
 TODO                                               |    92 +-
 TODO.piglit                                        |    29 -
 android/CLONE_POCL_PREBUILTS_HERE                  |     1 -
 android/androideabi.cmake                          |    10 -
 android/build-arm.sh                               |   161 -
 cmake/LLVM.cmake                                   |    91 +-
 tests/CMakeLists.txt => cmake/add_test_pocl.cmake  |    32 +-
 cmake/bitcode_rules.cmake                          |   128 +-
 config.h.in.cmake                                  |    75 +-
 config/xclang                                      |    73 -
 config2.h.in.cmake                                 |     5 +
 depcomp                                            |   632 -
 doc/build-envs.txt                                 |    88 -
 doc/luxmark.txt                                    |    13 -
 doc/sphinx/source/benchmarks.rst                   |    33 +
 doc/sphinx/source/conf.py                          |     4 +-
 doc/sphinx/source/conformance.rst                  |   225 +
 doc/sphinx/source/cuda.rst                         |   137 +
 doc/sphinx/source/development.rst                  |    23 +-
 doc/sphinx/source/docker.rst                       |    32 +
 doc/sphinx/source/env_variables.rst                |    31 +-
 doc/sphinx/source/faq.rst                          |     3 -
 doc/sphinx/source/features.rst                     |    15 +-
 doc/sphinx/source/index.rst                        |     5 +
 doc/sphinx/source/install.rst                      |   158 +-
 doc/sphinx/source/kernel_compiler.rst              |    35 +-
 doc/sphinx/source/maintainer-policy.rst            |     5 +-
 doc/sphinx/source/memory_management.rst            |    15 +
 doc/sphinx/source/pocl_binary.rst                  |     8 +-
 doc/sphinx/source/releasing.rst                    |    50 +-
 doc/sphinx/source/using.rst                        |    33 +-
 examples/AMD/CMakeLists.txt                        |     9 +-
 examples/AMDSDK2.9/CMakeLists.txt                  |    20 +-
 examples/AMDSDK3.0/CMakeLists.txt                  |    31 +-
 examples/ASL/CMakeLists.txt                        |   184 +-
 examples/ASL/asl.patch                             |    29 +
 examples/CMakeLists.txt                            |    35 +-
 examples/EinsteinToolkit/CMakeLists.txt            |    10 +-
 examples/EinsteinToolkit/EinsteinToolkit.c         |   735 +-
 examples/Halide/CMakeLists.txt                     |     9 +-
 examples/IntelSVM/CMakeLists.txt                   |     5 +-
 examples/OpenCV/opencv.patch                       |    11 +
 examples/PyOpenCL/CMakeLists.txt                   |    70 +-
 examples/PyOpenCL/build.sh                         |     7 +
 examples/PyOpenCL/configure.sh                     |     9 +
 examples/PyOpenCL/install.sh                       |     7 +
 examples/PyOpenCL/runtest.sh                       |     7 +
 examples/PyOpenCL/test_build_dir                   |    14 +-
 examples/Rodinia/CMakeLists.txt                    |     6 +-
 examples/VexCL/CMakeLists.txt                      |    19 +-
 examples/ViennaCL/CMakeLists.txt                   |    47 +-
 examples/arrayfire/CMakeLists.txt                  |   294 +-
 examples/clBLAS/CMakeLists.txt                     |    16 +-
 examples/clBLAS/clBLAS_float_error.patch           |    11 +
 examples/clBLAS/clBLAS_link.patch                  |    12 +
 examples/clFFT/CMakeLists.txt                      |    14 +-
 examples/conformance/CMakeLists.txt                |  1978 +++
 examples/example1-spir32/CMakeLists.txt            |    21 +-
 examples/example1-spir32/example1-spir.stdout      |     5 +
 examples/example1-spir32/example1.spir             |   Bin 1580 -> 4952 bytes
 examples/example1-spir32/example1_exec.c           |    14 +-
 examples/example1-spir64/CMakeLists.txt            |    23 +-
 examples/example1-spir64/example1-spir.stdout      |     5 +
 examples/example1-spir64/example1.spir             |   Bin 1604 -> 5828 bytes
 examples/example1-spir64/example1_exec.c           |    23 +-
 examples/example1/CMakeLists.txt                   |     2 +-
 examples/example1/example1.c                       |     6 +
 examples/example1/example1_exec.c                  |    28 +-
 examples/example2/CMakeLists.txt                   |     2 +-
 examples/example2/example2.c                       |    48 +-
 examples/example2a/CMakeLists.txt                  |     2 +-
 examples/example2a/example2a.c                     |    20 +-
 examples/opencl-book-samples/CMakeLists.txt        |    13 +-
 examples/piglit/CMakeLists.txt                     |    27 +-
 examples/scalarwave/CMakeLists.txt                 |     4 +-
 examples/scalarwave/scalarwave.c                   |    33 +-
 examples/trig/CMakeLists.txt                       |     3 +-
 examples/trig/trig.c                               |     5 +
 include/CL/cl2.hpp                                 |    26 +-
 include/CMakeLists.txt                             |     2 +-
 include/_builtin_renames.h                         |   202 +
 include/_enable_all_exts.h                         |     9 +
 include/_kernel.h                                  |   988 +-
 include/_kernel_c.h                                |    32 +-
 include/_kernel_constants.h                        |    79 +-
 include/pocl.h                                     |    11 +
 include/pocl_cache.h                               |     6 +-
 include/pocl_compiler_features.h                   |   222 +
 include/pocl_file_util.h                           |     2 +
 include/pocl_types.h                               |    29 +-
 lib/CL/CMakeLists.txt                              |    25 +-
 lib/CL/clBuildProgram.c                            |   568 +-
 ...ueueMarkerWithWaitList.c => clCompileProgram.c} |    57 +-
 lib/CL/clCreateBuffer.c                            |    16 +-
 lib/CL/clCreateCommandQueue.c                      |    17 +-
 lib/CL/clCreateCommandQueueWithProperties.c        |     5 +-
 lib/CL/clCreateContext.c                           |    18 +-
 lib/CL/clCreateContextFromType.c                   |     5 +-
 lib/CL/clCreateFromGLTexture2D.c                   |     2 +-
 lib/CL/clCreateFromGLTexture3D.c                   |     2 +-
 lib/CL/clCreateImage.c                             |   191 +-
 lib/CL/clCreateKernel.c                            |    68 +-
 lib/CL/clCreateProgramWithBinary.c                 |    66 +-
 ...tList.c => clCreateProgramWithBuiltInKernels.c} |    57 +-
 lib/CL/clCreateProgramWithSource.c                 |    12 +-
 lib/CL/clCreateSampler.c                           |    79 +-
 lib/CL/clCreateSubBuffer.c                         |    31 +-
 lib/CL/clCreateSubDevices.c                        |   141 +-
 lib/CL/clCreateUserEvent.c                         |     5 +
 lib/CL/clEnqueueBarrierWithWaitList.c              |    32 +-
 lib/CL/clEnqueueCopyBuffer.c                       |     9 +-
 lib/CL/clEnqueueFillBuffer.c                       |     9 +-
 lib/CL/clEnqueueFillImage.c                        |    77 +-
 lib/CL/clEnqueueMapBuffer.c                        |    48 +-
 lib/CL/clEnqueueMapImage.c                         |    67 +-
 lib/CL/clEnqueueMarkerWithWaitList.c               |     5 +
 lib/CL/clEnqueueMigrateMemObjects.c                |    33 +-
 lib/CL/clEnqueueNDRangeKernel.c                    |   470 +-
 lib/CL/clEnqueueNativeKernel.c                     |    16 +-
 lib/CL/clEnqueueReadBuffer.c                       |    22 +-
 lib/CL/clEnqueueReadBufferRect.c                   |    10 +-
 lib/CL/clEnqueueReadImage.c                        |    47 +-
 lib/CL/clEnqueueSVMFree.c                          |    12 +-
 lib/CL/clEnqueueSVMMap.c                           |    13 +-
 lib/CL/clEnqueueSVMMemFill.c                       |    13 +-
 lib/CL/clEnqueueSVMMemcpy.c                        |    13 +-
 lib/CL/clEnqueueSVMUnmap.c                         |    13 +-
 lib/CL/clEnqueueTask.c                             |    26 +
 lib/CL/clEnqueueUnmapMemObject.c                   |    33 +-
 lib/CL/clEnqueueWaitForEvents.c                    |    33 +-
 lib/CL/clEnqueueWriteBuffer.c                      |    28 +-
 lib/CL/clEnqueueWriteBufferRect.c                  |    22 +-
 lib/CL/clEnqueueWriteImage.c                       |    43 +-
 lib/CL/clGetDeviceIDs.c                            |    10 +-
 lib/CL/clGetDeviceInfo.c                           |   129 +-
 ... => clGetExtensionFunctionAddressForPlatform.c} |    60 +-
 lib/CL/clGetKernelArgInfo.c                        |     9 +
 lib/CL/clGetKernelInfo.c                           |     5 +
 lib/CL/clGetKernelWorkGroupInfo.c                  |    22 +-
 lib/CL/clGetMemObjectInfo.c                        |     9 +-
 lib/CL/clGetPlatformIDs.c                          |    35 +-
 lib/CL/clGetProgramBuildInfo.c                     |    33 +-
 lib/CL/clGetSamplerInfo.c                          |    44 +-
 lib/CL/clLinkProgram.c                             |   121 +
 lib/CL/clReleaseCommandQueue.c                     |    11 +-
 lib/CL/clReleaseContext.c                          |     3 +
 lib/CL/clReleaseDevice.c                           |    10 +-
 lib/CL/clReleaseEvent.c                            |    19 +-
 lib/CL/clReleaseKernel.c                           |    56 +-
 lib/CL/clReleaseMemObject.c                        |    13 +-
 lib/CL/clReleaseProgram.c                          |    24 +-
 lib/CL/clReleaseSampler.c                          |    42 +-
 lib/CL/clRetainCommandQueue.c                      |     1 +
 lib/CL/clRetainContext.c                           |     1 +
 lib/CL/clRetainDevice.c                            |     1 +
 lib/CL/clRetainEvent.c                             |    24 +
 lib/CL/clRetainKernel.c                            |     1 +
 lib/CL/clRetainMemObject.c                         |     1 +
 lib/CL/clRetainProgram.c                           |     1 +
 lib/CL/clRetainSampler.c                           |    36 +-
 lib/CL/clSetKernelArg.c                            |    57 +-
 lib/CL/clSetUserEventStatus.c                      |    32 +-
 lib/CL/clUnloadCompiler.c                          |    33 +-
 lib/CL/clUnloadPlatformCompiler.c                  |    18 +
 lib/CL/clWaitForEvents.c                           |    19 +-
 lib/CL/devices/CMakeLists.txt                      |     7 +-
 lib/CL/devices/basic/basic.c                       |   302 +-
 lib/CL/devices/common.c                            |   469 +-
 lib/CL/devices/common.h                            |    58 +-
 lib/CL/devices/cpuinfo.c                           |   114 +-
 {include => lib/CL/devices/cuda}/CMakeLists.txt    |    16 +-
 lib/CL/devices/cuda/pocl-cuda.c                    |  1623 +++
 .../rsqrt.cl => CL/devices/cuda/pocl-cuda.h}       |    24 +-
 lib/CL/devices/cuda/pocl-ptx-gen.cc                |   925 ++
 lib/CL/devices/cuda/pocl-ptx-gen.h                 |    62 +
 lib/CL/devices/devices.c                           |   253 +-
 lib/CL/devices/devices.h                           |     2 +-
 lib/CL/devices/hsa/pocl-hsa.c                      |   188 +-
 lib/CL/devices/prototypes.inc                      |     8 +-
 lib/CL/devices/pthread/pocl-pthread_scheduler.h    |     5 +-
 lib/CL/devices/pthread/pocl-pthread_utils.h        |    80 +-
 lib/CL/devices/pthread/pthread.c                   |   141 +-
 lib/CL/devices/pthread/pthread_scheduler.c         |   397 +-
 lib/CL/devices/pthread/pthread_utils.c             |   154 +-
 lib/CL/devices/tce/tce_common.cc                   |    95 +-
 lib/CL/devices/tce/tce_common.h                    |     1 +
 lib/CL/devices/tce/ttasim/ttasim.cc                |    12 +-
 lib/CL/devices/topology/pocl_topology.c            |   120 +-
 lib/CL/devices/topology/pocl_topology.h            |     2 +-
 lib/CL/pocl_binary.c                               |   130 +-
 lib/CL/pocl_build.c                                |   819 ++
 lib/CL/pocl_cache.c                                |   162 +-
 lib/CL/pocl_cl.h                                   |   289 +-
 lib/CL/pocl_debug.c                                |   146 +-
 lib/CL/pocl_debug.h                                |   183 +-
 lib/CL/pocl_image_util.c                           |   663 +-
 lib/CL/pocl_image_util.h                           |    42 +-
 lib/CL/pocl_img_buf_cpy.c                          |    22 +-
 lib/CL/pocl_intfn.h                                |    16 +-
 lib/CL/pocl_llvm.h                                 |    69 +-
 lib/CL/pocl_llvm_api.cc                            |  1957 ---
 lib/CL/pocl_llvm_api.h                             |    84 +
 lib/CL/pocl_llvm_build.cc                          |   880 ++
 lib/CL/pocl_llvm_metadata.cc                       |   818 ++
 lib/CL/pocl_llvm_utils.cc                          |   370 +
 lib/CL/pocl_llvm_wg.cc                             |   658 +
 lib/CL/pocl_mem_management.c                       |    16 +-
 lib/CL/pocl_mem_management.h                       |    23 +
 lib/CL/pocl_opengl.c                               |   121 +
 lib/CL/pocl_queue_util.c                           |   140 -
 lib/CL/pocl_runtime_config.h                       |     1 +
 lib/CL/pocl_shared.h                               |    29 +-
 lib/CL/pocl_tracing.c                              |     2 +-
 lib/CL/pocl_util.c                                 |   442 +-
 lib/CL/pocl_util.h                                 |    84 +-
 lib/CMakeLists.txt                                 |    47 +-
 lib/kernel/CMakeLists.txt                          |   258 +-
 lib/kernel/as_type.cl                              |     8 +-
 lib/kernel/async_work_group_copy.cl                |     1 +
 lib/kernel/async_work_group_strided_copy.cl        |    77 +
 lib/kernel/atomics.cl                              |    59 +-
 lib/kernel/barrier.ll                              |     4 +-
 lib/kernel/convert_type.cl                         | 13346 +++++++++----------
 lib/kernel/{hsail64 => cuda}/CMakeLists.txt        |    67 +-
 lib/kernel/cuda/atomic_add.ll                      |    47 +
 lib/kernel/cuda/atomic_and.ll                      |    47 +
 lib/kernel/cuda/atomic_cmpxchg.ll                  |    55 +
 lib/kernel/cuda/atomic_dec.ll                      |    47 +
 lib/kernel/cuda/atomic_inc.ll                      |    47 +
 lib/kernel/cuda/atomic_max.ll                      |    47 +
 lib/kernel/cuda/atomic_min.ll                      |    47 +
 lib/kernel/cuda/atomic_or.ll                       |    47 +
 lib/kernel/cuda/atomic_sub.ll                      |    47 +
 lib/kernel/cuda/atomic_xchg.ll                     |    65 +
 lib/kernel/cuda/atomic_xor.ll                      |    47 +
 lib/kernel/cuda/barrier.ll                         |     7 +
 .../cuda/get_global_id.c}                          |    68 +-
 .../cuda/get_global_offset.c}                      |    29 +-
 .../cuda/get_global_size.c}                        |    40 +-
 .../cuda/get_group_id.c}                           |    31 +-
 .../cuda/get_local_id.c}                           |    31 +-
 .../cuda/get_local_size.c}                         |    29 +-
 .../cuda/get_num_groups.c}                         |    29 +-
 lib/kernel/cuda/nvvm_functions.ll                  |    87 +
 lib/kernel/cuda/printf.c                           |    91 +
 lib/kernel/fract.cl                                |    22 +-
 lib/kernel/get_global_id.c                         |    13 +-
 ...{get_image_width.cl => get_image_array_size.cl} |    48 +-
 lib/kernel/get_image_channel_data_type.cl          |    53 +
 lib/kernel/get_image_channel_order.cl              |    53 +
 lib/kernel/get_image_depth.cl                      |    31 +-
 lib/kernel/get_image_dim.cl                        |    58 +-
 lib/kernel/get_image_height.cl                     |    36 +-
 lib/kernel/get_image_width.cl                      |    36 +-
 lib/kernel/{rsqrt.cl => half_cos.cl}               |    15 +-
 lib/kernel/{rsqrt.cl => half_divide.cl}            |    15 +-
 lib/kernel/{rsqrt.cl => half_exp.cl}               |    15 +-
 lib/kernel/{rsqrt.cl => half_exp10.cl}             |    15 +-
 lib/kernel/{rsqrt.cl => half_exp2.cl}              |    15 +-
 lib/kernel/{rsqrt.cl => half_log.cl}               |    15 +-
 lib/kernel/{rsqrt.cl => half_log10.cl}             |    15 +-
 lib/kernel/{rsqrt.cl => half_log2.cl}              |    15 +-
 lib/kernel/{rsqrt.cl => half_powr.cl}              |    15 +-
 lib/kernel/{rsqrt.cl => half_recip.cl}             |    15 +-
 lib/kernel/{rsqrt.cl => half_rsqrt.cl}             |    13 +-
 lib/kernel/{rsqrt.cl => half_sin.cl}               |    15 +-
 lib/kernel/{rsqrt.cl => half_sqrt.cl}              |    15 +-
 lib/kernel/{rsqrt.cl => half_tan.cl}               |    15 +-
 lib/kernel/host/CMakeLists.txt                     |   139 +-
 lib/kernel/hsail64/CMakeLists.txt                  |     9 +-
 lib/kernel/libclc-pocl/acosh.cl                    |   588 +
 lib/kernel/libclc-pocl/acospi.cl                   |   588 +
 lib/kernel/libclc-pocl/asinh.cl                    |   588 +
 lib/kernel/libclc-pocl/asinpi.cl                   |   588 +
 lib/kernel/libclc-pocl/atan2pi.cl                  |   588 +
 lib/kernel/libclc-pocl/atanh.cl                    |   588 +
 lib/kernel/libclc-pocl/atanpi.cl                   |   588 +
 lib/kernel/libclc-pocl/cos.cl                      |   588 +
 lib/kernel/libclc-pocl/cosh.cl                     |   588 +
 lib/kernel/libclc-pocl/cospi.cl                    |   588 +
 lib/kernel/libclc-pocl/degrees.cl                  |   588 +
 lib/kernel/libclc-pocl/ep_log.cl                   |   588 +
 lib/kernel/libclc-pocl/expfrexp.cl                 |   588 +
 lib/kernel/libclc-pocl/fmod.cl                     |   588 +
 lib/kernel/libclc-pocl/frexp.cl                    |   708 +
 lib/kernel/libclc-pocl/frfrexp.cl                  |   588 +
 lib/kernel/libclc-pocl/isfinite.cl                 |   588 +
 lib/kernel/libclc-pocl/isinf.cl                    |   588 +
 lib/kernel/libclc-pocl/isnan.cl                    |   588 +
 lib/kernel/libclc-pocl/isnormal.cl                 |   588 +
 lib/kernel/libclc-pocl/log1p.cl                    |   588 +
 lib/kernel/libclc-pocl/log2.cl                     |   588 +
 lib/kernel/libclc-pocl/logb.cl                     |   588 +
 lib/kernel/libclc-pocl/ocml_helpers.cl             |   588 +
 lib/kernel/libclc-pocl/pocl_fma.cl                 |   588 +
 lib/kernel/libclc-pocl/pow.cl                      |   588 +
 lib/kernel/libclc-pocl/pow_helpers.cl              |   588 +
 lib/kernel/libclc-pocl/pown.cl                     |   588 +
 lib/kernel/libclc-pocl/powr.cl                     |   588 +
 lib/kernel/libclc-pocl/radians.cl                  |   588 +
 lib/kernel/libclc-pocl/remainder.cl                |   588 +
 lib/kernel/libclc-pocl/remquo.cl                   |   708 +
 lib/kernel/libclc-pocl/rootn.cl                    |   588 +
 lib/kernel/libclc-pocl/sin.cl                      |   588 +
 lib/kernel/libclc-pocl/sincos.cl                   |   708 +
 lib/kernel/libclc-pocl/sincos_helpers.cl           |   588 +
 lib/kernel/libclc-pocl/sinh.cl                     |   588 +
 lib/kernel/libclc-pocl/sinpi.cl                    |   588 +
 lib/kernel/libclc-pocl/tan.cl                      |   588 +
 lib/kernel/libclc-pocl/tanh.cl                     |   588 +
 lib/kernel/libclc-pocl/tanpi.cl                    |   588 +
 lib/kernel/libclc/ROCM_LICENSE.txt                 |    44 +
 lib/kernel/libclc/acosh_fp32.cl                    |    47 +
 lib/kernel/libclc/acosh_fp64.cl                    |    97 +
 lib/kernel/libclc/acospi_fp32.cl                   |    90 +
 lib/kernel/libclc/acospi_fp64.cl                   |   102 +
 lib/kernel/libclc/asinh_fp32.cl                    |    74 +
 lib/kernel/libclc/asinh_fp64.cl                    |   237 +
 lib/kernel/libclc/asinpi_fp32.cl                   |    91 +
 lib/kernel/libclc/asinpi_fp64.cl                   |    97 +
 lib/kernel/libclc/atan2pi_fp32.cl                  |    76 +
 lib/kernel/libclc/atan2pi_fp64.cl                  |   164 +
 lib/kernel/libclc/atanh_fp32.cl                    |    64 +
 lib/kernel/libclc/atanh_fp64.cl                    |    74 +
 lib/kernel/libclc/atanpi_fp32.cl                   |   101 +
 lib/kernel/libclc/atanpi_fp64.cl                   |   103 +
 lib/kernel/libclc/cos_fp32.cl                      |    47 +
 lib/kernel/libclc/cos_fp64.cl                      |    48 +
 lib/kernel/libclc/cosh_fp32.cl                     |   101 +
 lib/kernel/libclc/cosh_fp64.cl                     |   124 +
 lib/kernel/libclc/cospi_fp32.cl                    |    72 +
 lib/kernel/libclc/cospi_fp64.cl                    |    73 +
 lib/kernel/libclc/degrees_fp32.cl                  |    29 +
 lib/kernel/libclc/degrees_fp64.cl                  |    29 +
 lib/kernel/libclc/ep_log.h                         |    26 +
 lib/kernel/libclc/ep_log_fp32.cl                   |     0
 lib/kernel/libclc/ep_log_fp64.cl                   |    85 +
 .../libclc/expfrexp_fp32.cl}                       |    34 +-
 .../libclc/expfrexp_fp64.cl}                       |    34 +-
 lib/kernel/libclc/fmod_fp32.cl                     |    12 +
 lib/kernel/libclc/fmod_fp64.cl                     |    12 +
 lib/kernel/{rsqrt.cl => libclc/frexp_fp32.cl}      |    22 +-
 lib/kernel/{rsqrt.cl => libclc/frexp_fp64.cl}      |    22 +-
 .../libclc/frfrexp_fp32.cl}                        |    33 +-
 .../libclc/frfrexp_fp64.cl}                        |    32 +-
 lib/kernel/libclc/isfinite_fp32.cl                 |     4 +
 lib/kernel/libclc/isfinite_fp64.cl                 |    13 +
 lib/kernel/libclc/isinf_fp32.cl                    |     4 +
 lib/kernel/libclc/isinf_fp64.cl                    |    13 +
 lib/kernel/libclc/isnan_fp32.cl                    |     4 +
 lib/kernel/libclc/isnan_fp64.cl                    |    13 +
 lib/kernel/libclc/isnormal_fp32.cl                 |     4 +
 lib/kernel/libclc/isnormal_fp64.cl                 |    13 +
 lib/kernel/libclc/length.cl                        |    90 +
 lib/kernel/libclc/log1p_fp32.cl                    |    83 +
 lib/kernel/libclc/log1p_fp64.cl                    |   100 +
 lib/kernel/libclc/log2_fp32.cl                     |    12 +
 lib/kernel/libclc/log2_fp64.cl                     |    12 +
 lib/kernel/libclc/log_base_fp32.cl                 |   201 +
 lib/kernel/libclc/log_base_fp64.cl                 |   155 +
 lib/kernel/libclc/logb_fp32.cl                     |    33 +
 lib/kernel/libclc/logb_fp64.cl                     |    33 +
 lib/kernel/libclc/misc.h                           |   209 +
 lib/kernel/libclc/normalize.cl                     |   159 +
 lib/kernel/libclc/ocml_helpers.h                   |    94 +
 lib/kernel/libclc/ocml_helpers_fp32.cl             |    40 +
 lib/kernel/libclc/ocml_helpers_fp64.cl             |    17 +
 lib/kernel/libclc/ocml_helpers_impl.cl             |   435 +
 lib/kernel/{rsqrt.cl => libclc/pocl_fma_fp32.cl}   |    21 +-
 lib/kernel/{rsqrt.cl => libclc/pocl_fma_fp64.cl}   |    21 +-
 lib/kernel/libclc/pow_base_fp32.cl                 |   204 +
 lib/kernel/libclc/pow_base_fp64.cl                 |   198 +
 lib/kernel/libclc/pow_fp32.cl                      |    12 +
 lib/kernel/libclc/pow_fp64.cl                      |    12 +
 lib/kernel/libclc/pow_helpers_fp32.cl              |    58 +
 lib/kernel/libclc/pow_helpers_fp64.cl              |    65 +
 lib/kernel/libclc/pown_fp32.cl                     |    12 +
 lib/kernel/libclc/pown_fp64.cl                     |    12 +
 lib/kernel/libclc/powr_fp32.cl                     |    12 +
 lib/kernel/libclc/powr_fp64.cl                     |    12 +
 lib/kernel/libclc/radians_fp32.cl                  |    30 +
 lib/kernel/libclc/radians_fp64.cl                  |    28 +
 lib/kernel/libclc/remainder_base_fp32.cl           |   188 +
 lib/kernel/libclc/remainder_base_fp64.cl           |   188 +
 lib/kernel/libclc/remainder_fp32.cl                |    12 +
 lib/kernel/libclc/remainder_fp64.cl                |    12 +
 lib/kernel/libclc/remquo_fp32.cl                   |    12 +
 lib/kernel/libclc/remquo_fp64.cl                   |    12 +
 lib/kernel/libclc/rootn_fp32.cl                    |    12 +
 lib/kernel/libclc/rootn_fp64.cl                    |    12 +
 lib/kernel/libclc/sin_fp32.cl                      |    49 +
 lib/kernel/libclc/sin_fp64.cl                      |    51 +
 lib/kernel/libclc/sincos_fp32.cl                   |    63 +
 lib/kernel/libclc/sincos_fp64.cl                   |    64 +
 lib/kernel/libclc/sincos_helpers_fp32.cl           |   380 +
 lib/kernel/libclc/sincos_helpers_fp32.h            |    29 +
 lib/kernel/libclc/sincos_helpers_fp64.cl           |   302 +
 lib/kernel/libclc/sincos_helpers_fp64.h            |    32 +
 .../libclc/singlevec.h}                            |    48 +-
 lib/kernel/libclc/sinh_fp32.cl                     |   106 +
 lib/kernel/libclc/sinh_fp64.cl                     |   121 +
 lib/kernel/libclc/sinpi_fp32.cl                    |    70 +
 lib/kernel/libclc/sinpi_fp64.cl                    |    69 +
 lib/kernel/libclc/tan_fp32.cl                      |    30 +
 lib/kernel/libclc/tan_fp64.cl                      |    69 +
 lib/kernel/libclc/tanh_fp32.cl                     |    92 +
 lib/kernel/libclc/tanh_fp64.cl                     |    95 +
 lib/kernel/{rsqrt.cl => libclc/tanpi_fp32.cl}      |    21 +-
 lib/kernel/{rsqrt.cl => libclc/tanpi_fp64.cl}      |    21 +-
 lib/kernel/libclc/vtables.h                        |    85 +
 lib/kernel/libclc/vtables_fp32.cl                  |   759 ++
 lib/kernel/libclc/vtables_fp64.cl                  |  1039 ++
 lib/kernel/libclc/vtables_macros.h                 |   211 +
 lib/{CL/clRetainDevice.c => kernel/mem_fence.c}    |    32 +-
 lib/kernel/pocl_image_rw_utils.h                   |    26 +-
 lib/kernel/printf.c                                |    28 +-
 lib/kernel/printf_constant.c                       |     7 +
 lib/kernel/read_image.cl                           |  1881 ++-
 lib/kernel/rsqrt.cl                                |     2 -
 lib/kernel/select.cl                               |    13 +-
 lib/kernel/sleef-pocl/README                       |    10 +
 lib/kernel/sleef-pocl/acos.cl                      |   229 +
 lib/kernel/sleef-pocl/acosh.cl                     |   183 +
 lib/kernel/sleef-pocl/asin.cl                      |   229 +
 lib/kernel/sleef-pocl/asinh.cl                     |   183 +
 lib/kernel/sleef-pocl/atan.cl                      |   229 +
 lib/kernel/sleef-pocl/atan2.cl                     |   231 +
 lib/kernel/sleef-pocl/atanh.cl                     |   183 +
 lib/kernel/sleef-pocl/cbrt.cl                      |   229 +
 lib/kernel/sleef-pocl/ceil.cl                      |   183 +
 lib/kernel/sleef-pocl/copysign.cl                  |   185 +
 lib/kernel/sleef-pocl/cos.cl                       |   229 +
 lib/kernel/sleef-pocl/cosh.cl                      |   183 +
 lib/kernel/sleef-pocl/cospi.cl                     |   183 +
 lib/kernel/sleef-pocl/erf.cl                       |   183 +
 lib/kernel/sleef-pocl/erfc.cl                      |   183 +
 lib/kernel/sleef-pocl/exp.cl                       |   183 +
 lib/kernel/sleef-pocl/exp10.cl                     |   183 +
 lib/kernel/sleef-pocl/exp2.cl                      |   183 +
 lib/kernel/sleef-pocl/expfrexp.cl                  |   193 +
 lib/kernel/sleef-pocl/expm1.cl                     |   183 +
 lib/kernel/sleef-pocl/fabs.cl                      |   183 +
 lib/kernel/sleef-pocl/fdim.cl                      |   185 +
 lib/kernel/sleef-pocl/floor.cl                     |   183 +
 lib/kernel/sleef-pocl/fma.cl                       |   187 +
 lib/kernel/sleef-pocl/fmax.cl                      |   185 +
 lib/kernel/sleef-pocl/fmin.cl                      |   185 +
 lib/kernel/sleef-pocl/fmod.cl                      |   185 +
 lib/kernel/sleef-pocl/frexp.cl                     |    77 +
 lib/kernel/sleef-pocl/frfrexp.cl                   |   183 +
 lib/kernel/sleef-pocl/hypot.cl                     |   231 +
 lib/kernel/sleef-pocl/ilogb.cl                     |   183 +
 lib/kernel/sleef-pocl/ldexp.cl                     |   185 +
 lib/kernel/sleef-pocl/lgamma.cl                    |   183 +
 lib/kernel/sleef-pocl/lgamma_r.cl                  |   608 +
 lib/kernel/sleef-pocl/log.cl                       |   229 +
 lib/kernel/sleef-pocl/log10.cl                     |   183 +
 lib/kernel/sleef-pocl/log1p.cl                     |   183 +
 lib/kernel/sleef-pocl/modf.cl                      |   595 +
 lib/kernel/sleef-pocl/native_cos.cl                |   183 +
 lib/kernel/sleef-pocl/native_sin.cl                |   183 +
 lib/kernel/sleef-pocl/native_tan.cl                |   183 +
 lib/kernel/sleef-pocl/nextafter.cl                 |   185 +
 lib/kernel/sleef-pocl/pow.cl                       |   185 +
 lib/kernel/sleef-pocl/pown.cl                      |   185 +
 lib/kernel/sleef-pocl/powr.cl                      |   185 +
 lib/kernel/sleef-pocl/rint.cl                      |   183 +
 lib/kernel/sleef-pocl/round.cl                     |   183 +
 lib/kernel/{rsqrt.cl => sleef-pocl/scalars.cl}     |    20 +-
 lib/kernel/sleef-pocl/sin.cl                       |   229 +
 lib/kernel/sleef-pocl/sincos.cl                    |   739 +
 lib/kernel/sleef-pocl/sinh.cl                      |   183 +
 lib/kernel/sleef-pocl/sinpi.cl                     |   183 +
 lib/kernel/sleef-pocl/sqrt.cl                      |   183 +
 lib/kernel/sleef-pocl/tan.cl                       |   229 +
 lib/kernel/sleef-pocl/tanh.cl                      |   183 +
 lib/kernel/sleef-pocl/tgamma.cl                    |   183 +
 lib/kernel/sleef-pocl/trunc.cl                     |   183 +
 lib/kernel/sleef/arch/helperadvsimd.h              |   701 +
 lib/kernel/sleef/arch/helperavx.h                  |   539 +
 lib/kernel/sleef/arch/helperavx2.h                 |   397 +
 lib/kernel/sleef/arch/helperavx2_128.h             |   370 +
 lib/kernel/sleef/arch/helperavx512f.h              |   499 +
 lib/kernel/sleef/arch/helperneon32.h               |   244 +
 lib/kernel/sleef/arch/helperpurec.h                |   540 +
 lib/kernel/sleef/arch/helpers.h                    |   127 +
 lib/kernel/sleef/arch/helpersse2.h                 |   440 +
 lib/kernel/sleef/arch/helpervecext.h               |   877 ++
 lib/kernel/sleef/arch/misc.h                       |   258 +
 lib/kernel/sleef/fma_test.c                        |    49 +
 lib/kernel/sleef/include/sleef.h                   |   890 ++
 lib/kernel/sleef/include/sleef_cl.h                |   691 +
 lib/kernel/sleef/libm/dd.h                         |   395 +
 lib/kernel/sleef/libm/df.h                         |   466 +
 lib/kernel/sleef/libm/rename.h                     |   143 +
 lib/kernel/sleef/libm/rename_vec128.h              |   137 +
 lib/kernel/sleef/libm/rename_vec256.h              |   137 +
 lib/kernel/sleef/libm/rename_vec512.h              |   137 +
 lib/kernel/sleef/libm/sleef_builtin.c              |   938 ++
 lib/kernel/sleef/libm/sleef_glue.cl                |    78 +
 lib/kernel/sleef/libm/sleef_glue_auto.c            |  4353 ++++++
 lib/kernel/sleef/libm/sleefdp.c                    |  2323 ++++
 lib/kernel/sleef/libm/sleefsimddp.c                |  2551 ++++
 lib/kernel/sleef/libm/sleefsimdsp.c                |  2307 ++++
 lib/kernel/sleef/libm/sleefsp.c                    |  2090 +++
 lib/kernel/sleef/test.c                            |    58 +
 lib/kernel/tce/CMakeLists.txt                      |     6 +-
 lib/kernel/templates.h                             |   216 +-
 lib/kernel/vecmathlib-pocl/generate-files.py       |     3 -
 lib/kernel/vecmathlib-pocl/half_cos.cl             |   368 +
 lib/kernel/vecmathlib-pocl/half_divide.cl          |   368 +
 lib/kernel/vecmathlib-pocl/half_exp.cl             |   368 +
 lib/kernel/vecmathlib-pocl/half_exp10.cl           |   368 +
 lib/kernel/vecmathlib-pocl/half_exp2.cl            |   368 +
 lib/kernel/vecmathlib-pocl/half_log.cl             |   368 +
 lib/kernel/vecmathlib-pocl/half_log10.cl           |   368 +
 lib/kernel/vecmathlib-pocl/half_log2.cl            |   368 +
 lib/kernel/vecmathlib-pocl/half_powr.cl            |   368 +
 lib/kernel/vecmathlib-pocl/half_recip.cl           |   368 +
 lib/kernel/vecmathlib-pocl/half_rsqrt.cl           |   368 +
 lib/kernel/vecmathlib-pocl/half_sin.cl             |   368 +
 lib/kernel/vecmathlib-pocl/half_sqrt.cl            |   368 +
 lib/kernel/vecmathlib-pocl/half_tan.cl             |   368 +
 lib/kernel/vecmathlib-pocl/kernel-vecmathlib.h     |   448 +
 lib/kernel/vecmathlib-pocl/native_cos.cl           |   368 +
 lib/kernel/vecmathlib-pocl/native_divide.cl        |   368 +
 lib/kernel/vecmathlib-pocl/native_exp.cl           |   368 +
 lib/kernel/vecmathlib-pocl/native_exp10.cl         |   368 +
 lib/kernel/vecmathlib-pocl/native_exp2.cl          |   368 +
 lib/kernel/vecmathlib-pocl/native_log.cl           |   368 +
 lib/kernel/vecmathlib-pocl/native_log10.cl         |   368 +
 lib/kernel/vecmathlib-pocl/native_log2.cl          |   368 +
 lib/kernel/vecmathlib-pocl/native_powr.cl          |   368 +
 lib/kernel/vecmathlib-pocl/native_recip.cl         |   368 +
 lib/kernel/vecmathlib-pocl/native_rsqrt.cl         |   368 +
 lib/kernel/vecmathlib-pocl/native_sin.cl           |   368 +
 lib/kernel/vecmathlib-pocl/native_sqrt.cl          |   368 +
 lib/kernel/vecmathlib-pocl/native_tan.cl           |   368 +
 lib/kernel/vecmathlib/mathfuncs_asin.h             |     2 +
 lib/kernel/vecmathlib/mathfuncs_fabs.h             |    25 +-
 lib/kernel/vecmathlib/vec_avx_double4.h            |    33 +-
 lib/kernel/vecmathlib/vec_avx_float8.h             |    30 +-
 lib/kernel/vecmathlib/vec_avx_fp16_16.h            |     2 +-
 lib/kernel/vecmathlib/vec_avx_fp8_32.h             |     2 +-
 lib/kernel/vecmathlib/vec_sse_double1.h            |    37 +-
 lib/kernel/vecmathlib/vec_sse_double2.h            |    44 +-
 lib/kernel/vecmathlib/vec_sse_float1.h             |    36 +-
 lib/kernel/vecmathlib/vec_sse_float4.h             |    48 +-
 lib/kernel/vload_half.cl                           |   286 +-
 lib/kernel/vload_store_half_f16c.c                 |   218 +
 lib/kernel/vstore_half.cl                          |   716 +-
 lib/kernel/wait_group_events.cl                    |     6 +-
 lib/kernel/write_image.cl                          |   416 +-
 lib/llvmopencl/AllocasToEntry.cc                   |     3 +-
 lib/llvmopencl/AutomaticLocals.cc                  |    30 +-
 lib/llvmopencl/Barrier.h                           |    12 +-
 lib/llvmopencl/BreakConstantGEPs.h                 |     4 +-
 lib/llvmopencl/CMakeLists.txt                      |     8 +-
 lib/llvmopencl/CanonicalizeBarriers.cc             |     5 +-
 lib/llvmopencl/DebugHelpers.cc                     |    11 +-
 lib/llvmopencl/Flatten.cc                          |   156 +-
 lib/llvmopencl/{Flatten.cc => FlattenGlobals.cc}   |   151 +-
 lib/llvmopencl/HandleSamplerInitialization.cc      |    10 +-
 lib/llvmopencl/ImplicitConditionalBarriers.cc      |    24 +-
 lib/llvmopencl/Kernel.cc                           |    29 +-
 lib/llvmopencl/Kernel.h                            |     4 +-
 lib/llvmopencl/LLVMFileUtils.cc                    |    47 +-
 lib/llvmopencl/LLVMUtils.cc                        |    15 +
 lib/llvmopencl/LLVMUtils.h                         |     9 +-
 lib/llvmopencl/OptimizeWorkItemFuncCalls.cc        |   159 +
 ...{BarrierBlock.h => OptimizeWorkItemFuncCalls.h} |    34 +-
 lib/llvmopencl/ParallelRegion.cc                   |    17 +-
 lib/llvmopencl/ParallelRegion.h                    |     2 -
 ...llocasToEntry.cc => RemoveOptnoneFromWIFunc.cc} |    63 +-
 .../{BarrierBlock.cc => RemoveOptnoneFromWIFunc.h} |    50 +-
 lib/llvmopencl/TargetAddressSpaces.cc              |    12 +-
 lib/llvmopencl/VariableUniformityAnalysis.cc       |    81 +-
 lib/llvmopencl/WorkItemAliasAnalysis.cc            |    13 -
 lib/llvmopencl/Workgroup.cc                        |    52 +-
 lib/llvmopencl/WorkitemLoops.cc                    |    98 +-
 lib/llvmopencl/WorkitemLoops.h                     |     4 +-
 lib/llvmopencl/WorkitemReplication.cc              |    10 +
 lib/llvmopencl/linker.cpp                          |    65 +-
 lib/llvmopencl/linker.h                            |     5 +-
 lib/poclu/misc.c                                   |     8 +-
 ocl-vendors/pocl-tests.icd.in                      |     1 -
 pocl.pc.in                                         |    11 -
 pocl.pc.in.cmake                                   |     2 +-
 tests/CMakeLists.txt                               |    61 +-
 tests/atlocal.in                                   |    11 -
 tests/kernel/CMakeLists.txt                        |    57 +-
 tests/kernel/image_query_funcs.c                   |     9 +-
 tests/kernel/test_convert_type_1.cl                |   320 +-
 tests/kernel/test_convert_type_16.cl               |   320 +-
 tests/kernel/test_convert_type_2.cl                |   320 +-
 tests/kernel/test_convert_type_4.cl                |   320 +-
 tests/kernel/test_convert_type_8.cl                |   320 +-
 tests/kernel/test_shuffle.cc                       |    21 +-
 tests/regression/CMakeLists.txt                    |    64 +-
 tests/regression/test_autolocals_in_constexprs.cpp |   114 +
 .../test_fors_with_var_iteration_counts.cpp        |     2 +-
 tests/regression/test_issue_231.cpp                |    10 +-
 tests/regression/test_issue_445.cpp                |     3 +-
 tests/regression/test_issue_553.cpp                |    75 +
 tests/regression/test_issue_577.cpp                |    41 +
 tests/regression/test_locals.cpp                   |     5 +-
 .../test_program_from_binary_with_local_1_1_1.c    |   149 +
 tests/runtime/CMakeLists.txt                       |    34 +-
 tests/runtime/test_buffer-image-copy.c             |    12 +-
 tests/runtime/test_clBuildProgram.c                |   102 +-
 tests/runtime/test_clCreateKernel.c                |     7 +
 tests/runtime/test_clCreateKernelsInProgram.c      |    10 +
 tests/runtime/test_clCreateProgramWithBinary.c     |     9 +-
 tests/runtime/test_clCreateSubDevices.c            |    29 +-
 tests/runtime/test_clEnqueueNativeKernel.c         |    14 +-
 tests/runtime/test_clFinish.c                      |    26 +-
 tests/runtime/test_clGetDeviceInfo.c               |     2 +
 tests/runtime/test_clGetEventInfo.c                |     4 +
 tests/runtime/test_clGetKernelArgInfo.c            |    61 +-
 tests/runtime/test_clGetSupportedImageFormats.c    |     4 +
 tests/runtime/test_clSetEventCallback.c            |    12 +
 .../test_clSetMemObjectDestructorCallback.c        |     5 +-
 tests/runtime/test_enqueue_kernel_from_binary.c    |    50 +-
 tests/runtime/test_event_cycle.c                   |    16 +-
 tests/runtime/test_event_free.c                    |     9 +-
 tests/runtime/test_kernel_cache_includes.c         |    10 +
 tests/runtime/test_kernel_src_in_pwd.h             |     9 +-
 tests/runtime/test_link_error.c                    |     6 +
 tests/runtime/test_read-copy-write-buffer.c        |    12 +-
 tests/runtime/test_user_event.c                    |    12 +-
 tests/runtime/test_version.c                       |     8 +-
 tests/tce/fp16/host.cpp                            |     1 -
 tests/tce/multi_AS_copy/host.cpp                   |     3 +-
 tests/tce/tcemc/host.cpp                           |     3 +-
 tests/tce/ttasim/host.cpp                          |     3 +-
 tests/testsuite-amd.at                             |   311 -
 tests/testsuite-amdsdk2_9.at                       |   586 -
 tests/testsuite-amdsdk3_0.at                       |   727 -
 tests/testsuite-cloverleaf.at                      |    17 -
 tests/testsuite-halide.at                          |    34 -
 tests/testsuite-opencv.at                          |   327 -
 tests/testsuite-parboil.at                         |   113 -
 tests/testsuite-piglit.at                          |     9 -
 tests/testsuite-regression.at                      |   270 -
 tests/testsuite-rodinia.at                         |   108 -
 tests/testsuite-runtime.at                         |   106 -
 tests/testsuite-samples.at                         |   114 -
 tests/testsuite-tce.at                             |    67 -
 tests/testsuite-vexcl.at                           |    80 -
 tests/testsuite-viennacl.at                        |   201 -
 tests/testsuite-workgroup.at                       |   160 -
 tests/testsuite.at                                 |   417 -
 tests/workgroup/CMakeLists.txt                     |     9 +
 tests/workgroup/issue_548.cl                       |    49 +
 tests/workgroup/issue_548_1_2_1_1.stdout           |     8 +
 tools/docker/ArchLinux/default                     |     9 +
 tools/docker/ArchLinux/distro                      |    14 +
 tools/docker/ArchLinux/test_install                |    10 +
 tools/docker/Debian/stretch                        |    11 +
 tools/docker/Debian/testing                        |    11 +
 tools/docker/Fedora/default                        |    11 +
 tools/docker/Ubuntu/16_04.32bit                    |    12 +
 tools/docker/Ubuntu/16_04.64bit                    |    12 +
 tools/docker/Ubuntu/default.32bit                  |    12 +
 tools/docker/Ubuntu/default.64bit                  |    12 +
 tools/docker/Ubuntu/default.conformance            |    13 +
 tools/docker/Ubuntu/distro                         |    17 +
 tools/docker/Ubuntu/test_install                   |    13 +
 tools/scripts/devel-envs.sh                        |     7 -
 tools/scripts/run_cuda_tests                       |    36 +
 windows/setup_and_build_win64.sh                   |    44 -
 678 files changed, 110710 insertions(+), 20760 deletions(-)

diff --git a/.bzrignore b/.bzrignore
deleted file mode 100644
index b3dddc2..0000000
--- a/.bzrignore
+++ /dev/null
@@ -1,52 +0,0 @@
-# srcdir
-
-Makefile.in
-Makefile.*.in
-./aclocal.m4
-./autom4te.cache
-./configure
-config/config.guess
-config/config.sub
-config/depcomp
-config/install-sh
-config/ltmain.sh
-config/missing
-config/ylwrap
-./config.h.in
-./m4/*
-
-tests/package.m4
-tests/testsuite
-
-# builddir
-
-Makefile
-pocl.pc
-.deps
-.libs
-*.lo
-*.o
-*.la
-
-./libtool
-./clconfig.h
-./config.h
-./config.log
-./config.status
-./stamp-h1
-
-examples/barriers/barriers
-examples/example1/example1
-examples/example2/example2
-examples/example2a/example2a
-examples/forloops/forloops
-examples/loopbarriers/loopbarriers
-examples/standalone/standalone.bc
-examples/standalone/standalone.h
-examples/trig/trig
-examples/scalarwave/scalarwave
-examples/kernel/kernel
-scripts/pocl-build
-scripts/pocl-kernel
-scripts/pocl-standalone
-scripts/pocl-workgroup
diff --git a/.gitattributes b/.gitattributes
index 141801b..28f23f4 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,41 +1,46 @@
-examples/Rodinia/pathfinder.stdout	export-ignore
-doc/benchmark_results/			export-ignore
-doc/binary_format.txt			export-ignore
-doc/buildbot/				export-ignore
-doc/handling_loops.txt			export-ignore
-doc/LAUNDRY				export-ignore
-doc/notes*.txt				export-ignore
-doc/spir-todo.txt			export-ignore
-doc/ttasim_kernel_capturer.txt		export-ignore
-doc/www/				export-ignore
-
-examples/piglit/sorted_ref		export-ignore
-examples/piglit/sorted_ref_llvm_3.5	export-ignore
+doc/benchmark_results/                          export-ignore
+doc/buildbot/                                   export-ignore
+doc/luxmark.txt                                 export-ignore
+doc/handling_loops.txt                          export-ignore
+doc/LAUNDRY                                     export-ignore
+doc/notes*.txt                                  export-ignore
+doc/spir-todo.txt                               export-ignore
+doc/ttasim_kernel_capturer.txt                  export-ignore
+doc/www/                                        export-ignore
 
+examples/piglit/sorted_ref*                     export-ignore
 # this one is ~20M
-examples/Rodinia/pathfinder.stdout	export-ignore
+examples/Rodinia/pathfinder.stdout              export-ignore
+
+lib/kernel/amdgcn                               export-ignore
+lib/kernel/convert_type.py                      export-ignore
+lib/kernel/vecmathlib/bench.cc                  export-ignore
+lib/kernel/vecmathlib/coeffs.out                export-ignore
+lib/kernel/vecmathlib/example.cc                export-ignore
+lib/kernel/vecmathlib/example_float.cc          export-ignore
+lib/kernel/vecmathlib/find-coeffs.m             export-ignore
+lib/kernel/vecmathlib/IDEAS                     export-ignore
+lib/kernel/vecmathlib/instantiations.cc         export-ignore
+lib/kernel/vecmathlib/interp.cc                 export-ignore
+
+lib/kernel/libclc-pocl/gen_vectorize.rb         export-ignore
+lib/kernel/sleef/gen*                           export-ignore
 
-lib/kernel/amdgcn			export-ignore
-lib/kernel/convert_type.py		export-ignore
-lib/kernel/vecmathlib/bench.cc		export-ignore
-lib/kernel/vecmathlib/coeffs.out	export-ignore
-lib/kernel/vecmathlib/example.cc	export-ignore
-lib/kernel/vecmathlib/example_float.cc	export-ignore
-lib/kernel/vecmathlib/find-coeffs.m	export-ignore
-lib/kernel/vecmathlib/IDEAS		export-ignore
-lib/kernel/vecmathlib/instantiations.cc	export-ignore
-lib/kernel/vecmathlib/interp.cc		export-ignore
+scripts/pocl-build.in                           export-ignore
+scripts/pocl-kernel.in                          export-ignore
+scripts/pocl-workgroup.in                       export-ignore
 
-scripts/pocl-build.in			export-ignore
-scripts/pocl-kernel.in			export-ignore
-scripts/pocl-workgroup.in		export-ignore
+tests/kernel/test_convert_type.py               export-ignore
+tests/kernel/test_convert_type.sh               export-ignore
+tests/testsuite*                                export-ignore
+tests/amdsdk.at                                 export-ignore
+tests/atlocal.in                                export-ignore
 
-tests/amdsdk.at				export-ignore
-tests/kernel/test_convert_type.py	export-ignore
-tests/kernel/test_convert_type.sh	export-ignore
+tools/gdb-breakpoints                           export-ignore
+tools/scripts/benchmark_barchart.py             export-ignore
+tools/scripts/benchmark.py                      export-ignore
+tools/scripts/devel-configure                   export-ignore
 
-tools/gdb-breakpoints			export-ignore
-tools/patches/clang-3.4-no-forced-64bit-doubles.patch		export-ignore
-tools/scripts/benchmark_barchart.py	export-ignore
-tools/scripts/benchmark.py		export-ignore
-tools/scripts/devel-configure		export-ignore
+# should we include these ?
+android/      export-ignore
+windows/      export-ignore
diff --git a/CHANGES b/CHANGES
index a5b8fc1..670bd0a 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,4 +1,43 @@
-0.14 March 2017
+1.0 December 2017
+================
+
+Highlights
+----------
+- Improved automatic local work-group sizing on kernel enqueue, taking
+  into account standard constraints, SIMD width for vectorization as
+  well as the number of compute units available on the device.
+- Support for NVIDIA GPUs via a new CUDA backend (currently experimental).
+- Removed support for BBVectorizer.
+- LLVM 5.0 is now supported.
+- A few build options have been added for distribution builds,
+  see README.packaging.
+- Somewhat improved scalability in the CPU driver. CPUs with many cores
+  and programs using a lot of WIs with small kernels can run somewhat faster.
+- The OpenCL 1.2 conformance tests now pass with selected CPUs. There are some
+  caveats though - see the documentation.
+- When conformance is enabled, some kernel library functions might be
+  slower than in previous releases.
+- Pocl now reports OpenCL 1.2 instead of 2.0, except HSA enabled builds.
+- Updated format of pocl binaries, which is NOT backwards compatible.
+  You'll need to clean any kernel caches.
+- Fixed several memory leaks.
+- Unresolved symbols (missing/misspelled functions etc) in a kernel will
+  result in error in clBuildProgram() instead of pocl silently ignoring
+  them and then aborting at dlopen().
+- New env variable POCL_MEMORY_LIMIT=<num> limits the Global memory size
+  reported by pocl to <num> gigabytes.
+- New env variable POCL_AFFINITY (defaults to 0): if enabled, sets
+  the affinity of each CPU driver pthread to a single core.
+- Improved AVX512 support (with LLVM 5.0). Note that even with LLVM 5.0
+  there are still a few bugs (see pocl issue #555); AVX512 + LLVM 4.0 are
+  a lot more broken, and probably not worth trying.
+- POCL_DEBUG env var has been revamped. You can now limit debuginfo to
+  these categories (or their combination): all,error,warning,general
+  memory,llvm,events,cache,locking,refcounts,timing,hsa,tce,cuda
+  The old setting POCL_DEBUG=1 now equals error+warning+general.
+
+
+0.14 April 2017
 ===============
 
 Highlights
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 18d36e7..d997f37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -26,8 +26,8 @@
 cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
 
 project(pocl)
-set(MAJOR_VERSION 0)
-set(MINOR_VERSION 14)
+set(MAJOR_VERSION 1)
+set(MINOR_VERSION 0)
 set(VERSION_SUFFIX "")
 set(VERSION_STRING ${MAJOR_VERSION}.${MINOR_VERSION}${VERSION_SUFFIX})
 set(POCL_VERSION ${VERSION_STRING})
@@ -49,13 +49,35 @@ option(POCL_DEBUG_MESSAGES
 
 option(ENABLE_HSA "Enable the HSA device driver for AMD GCN devices" OFF)
 
+option(ENABLE_CUDA "Enable the CUDA device driver for NVIDIA devices" OFF)
+
 option(KERNEL_CACHE_DEFAULT "Default value for the kernel compile cache. If disabled, pocl will still use the kernel cache, but will delete cachefiles on exit. You can still enable keeping the files it at runtime with an env var." ON)
 
+option(POCL_ICD_ABSOLUTE_PATH "Use absolute path in pocl.icd" ON)
+
+option(ENABLE_POCL_BUILDING "When OFF, env var POCL_BUILDING has no effect. Defaults to ON" ON)
+
+#### these are mostly useful for pocl developers
+
+option(DEVELOPER_MODE "This will SIGNIFICANTLY slow down pocl (but speed up its compilation). Only turn on if you know what you're doing." OFF)
+
+option(USE_POCL_MEMMANAGER "Enables custom memory manager. Except for special circumstances, this should be disabled." OFF)
+
+option(EXAMPLES_USE_GIT_MASTER "If enabled, some of the external testsuites in examples/ will try to use sources from Git master, instead of releases. This may result in failure to build or run the examples" OFF)
+
+####
+
 # currently only works with gcc as host compiler
 if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
   option(ENABLE_ASAN "Enable AddressSanitizer" OFF)
+  option(ENABLE_TSAN "Enable ThreadSanitizer" OFF)
+  option(ENABLE_LSAN "Enable LeakSanitizer" OFF)
+  option(ENABLE_UBSAN "Enable UBSanitizer" OFF)
 else()
-  set(ENABLE_ASAN 0)
+  set(ENABLE_ASAN OFF)
+  set(ENABLE_TSAN OFF)
+  set(ENABLE_LSAN OFF)
+  set(ENABLE_UBSAN OFF)
 endif()
 
 ##################################################################################
@@ -66,11 +88,17 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips")
   set(MIPS 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv7")
   set(ARMV7 1)
+  set(ARM32 1)
+  set(ARM 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armv6")
   set(ARMV6 1)
+  set(ARM32 1)
+  set(ARM 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
   set(ARM64 1)
+  set(ARM 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(i.86|AMD64|x86_64)")
+  set(X86 1)
   if(POCL_DEVICE_ADDRESS_BITS MATCHES "32")
     set(I386 1)
   else()
@@ -78,6 +106,17 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(i.86|AMD64|x86_64)")
   endif()
 endif()
 
+if(CMAKE_MAJOR_VERSION GREATER 2)
+  include(ProcessorCount)
+  ProcessorCount(CORECOUNT)
+  if(CORECOUNT LESS 1)
+    set(CORECOUNT 1)
+  endif()
+else()
+  set(CORECOUNT 1)
+endif()
+message(STATUS "Host CPU cores: ${CORECOUNT}")
+
 ######################################################################################
 
 macro(set_expr VAR)
@@ -131,7 +170,7 @@ set(POCL_INSTALL_PUBLIC_LIBDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}"
 set(POCL_INSTALL_PRIVATE_LIBDIR "${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/pocl" CACHE PATH "POCL private libdir")
 
 # for pocl.icd
-if(UNIX AND NOT CMAKE_CROSSCOMPILING AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
+if(UNIX AND (NOT CMAKE_CROSSCOMPILING) AND (CMAKE_INSTALL_PREFIX STREQUAL "/usr"))
   set(POCL_INSTALL_ICD_VENDORDIR "/etc/OpenCL/vendors" CACHE PATH "POCL ICD file destination")
 else()
   set(POCL_INSTALL_ICD_VENDORDIR "${CMAKE_INSTALL_PREFIX}/etc/OpenCL/vendors" CACHE PATH "POCL ICD file destination")
@@ -179,7 +218,7 @@ endif()
 
 ######################################################################################
 
-set(CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake)
+set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
 find_package( Hwloc )
 
 if(NOT Hwloc_FOUND)
@@ -195,6 +234,35 @@ message(STATUS "Hwloc_LDFLAGS ${Hwloc_LDFLAGS}")
 message(STATUS "Hwloc_CFLAGS ${Hwloc_CFLAGS}")
 
 ######################################################################################
+
+if(NOT HOST_CPU_CACHELINE_SIZE)
+
+  set(CL_SIZE 0)
+  if(UNIX)
+    find_program(GETCONF "getconf")
+    if(GETCONF)
+      execute_process(COMMAND "getconf" "LEVEL1_DCACHE_LINESIZE"
+                      RESULT_VARIABLE RES OUTPUT_VARIABLE CL_SIZE)
+      if(RES)
+        message(WARNING "getconf exited with nonzero status!")
+        set(CL_SIZE 0)
+      else()
+        # getconf sometimes just returns zero
+        if(NOT (CL_SIZE EQUAL 0))
+          message(STATUS "L1D Cacheline size detected: ${CL_SIZE}")
+          set(HOST_CPU_CACHELINE_SIZE "${CL_SIZE}" CACHE STRING "L1D Cacheline size")
+        endif()
+      endif()
+    endif()
+  endif()
+
+  if(CL_SIZE EQUAL 0)
+    message(WARNING "Unable to detect cacheline size - assuming 64byte cacheline, override with -DHOST_CPU_CACHELINE_SIZE=<number> (Note: this is merely used for optimization, at worst pocl will be slightly slower)")
+    set(HOST_CPU_CACHELINE_SIZE "64" CACHE STRING "L1D Cacheline size")
+  endif()
+endif()
+
+######################################################################################
 #
 # Find executables to few tools required during build 
 #
@@ -218,6 +286,10 @@ if(NOT PATCH_EXEC)
   message(FATAL_ERROR "Could not find patch command.")
 endif()
 
+if(NOT XARGS_EXEC)
+  message(FATAL_ERROR "Could not find xargs command.")
+endif()
+
 ######################################################################################
 
 if (OCS_AVAILABLE)
@@ -239,7 +311,12 @@ if (OCS_AVAILABLE)
     set(HOST_DEVICE_BUILD_HASH "${LLC_TRIPLE}-${LLC_HOST_CPU}")
   endif()
 
+  if(ARM AND LLVM_3_9)
+    message(FATAL_ERROR "pocl does not build on ARM with LLVM 3.9 unless is was patched. Try LLVM 3.8 or (preferably) 4.0+")
+  endif()
+
 else()
+
   if(NOT DEFINED HOST_DEVICE_BUILD_HASH)
     message(FATAL_ERROR "For compiler-less builds, you must define HOST_DEVICE_BUILD_HASH")
   endif()
@@ -320,20 +397,68 @@ check_function_exists(vfork HAVE_VFORK)
 
 ######################################################################################
 
-if(NOT DEFINED DEFAULT_USE_VECMATHLIB)
-  if(CLANGXX_WORKS AND EXISTS "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/vecmathlib.h")
-    set(DEFAULT_USE_VECMATHLIB 1 CACHE INTERNAL "vecmathlib availability")
+if((DEFINED ENABLE_VECMATHLIB) AND (DEFINED ENABLE_SLEEF))
+  if(ENABLE_SLEEF AND ENABLE_VECMATHLIB)
+    message(FATAL_ERROR "requested to use both Vecmathlib and SLEEF - pick one!")
+  endif()
+
+  if((NOT ENABLE_SLEEF) AND (NOT ENABLE_VECMATHLIB))
+    message(FATAL_ERROR "requested to use neither Vecmathlib nor SLEEF - pick one!")
+  endif()
+endif()
+
+if(NOT DEFINED ENABLE_CONFORMANCE)
+  if(((DEFINED ENABLE_VECMATHLIB) AND ENABLE_VECMATHLIB) OR
+     ((DEFINED ENABLE_SLEEF) AND (NOT ENABLE_SLEEF)))
+    set(DEFAULT_CONF OFF)
+    set(DEFAULT_SLEEF OFF)
+    set(DEFAULT_VML ON)
+  else()
+    set(DEFAULT_CONF ON)
+    set(DEFAULT_SLEEF ON)
+    set(DEFAULT_VML OFF)
+  endif()
+else()
+  set(DEFAULT_CONF ${ENABLE_CONFORMANCE})
+  if (ENABLE_CONFORMANCE)
+    # requested conformant
+    if((DEFINED ENABLE_VECMATHLIB) AND ENABLE_VECMATHLIB)
+      message(FATAL_ERROR "requested to use Vecmathlib with enabled conformance")
+    endif()
+    if((DEFINED ENABLE_SLEEF) AND (NOT ENABLE_SLEEF))
+      message(FATAL_ERROR "conformance needs enabled SLEEF")
+    endif()
+    set(DEFAULT_SLEEF ON)
+    set(DEFAULT_VML OFF)
   else()
-    set(DEFAULT_USE_VECMATHLIB 0 CACHE INTERNAL "vecmathlib availability")
+    # requested non-conformant
+    set(DEFAULT_SLEEF ${ENABLE_SLEEF})
+    set(DEFAULT_VML ${ENABLE_VECMATHLIB})
+    # at least one
+    if((NOT DEFAULT_SLEEF) AND (NOT DEFAULT_VML))
+      set(DEFAULT_VML ON)
+      set(DEFAULT_SLEEF OFF)
+    endif()
   endif()
+
 endif()
 
-setup_cached_var(USE_VECMATHLIB "Vecmathlib use"
-  "Requested enabling vecmathlib use, but either clang++ doesnt work or vecmathlib sources are missing.. -> disabling vecmathlib use"
-  "Vecmathlib is usable, but requested disabling it")
+option(ENABLE_CONFORMANCE "Enable conformance to OpenCL standard. Disabling this may enable slightly faster kernel library functions (at a price of range/precision). Note that enabling this does not guarantee conformance (depends on hardware). Incompatible with Vecmathlib. Defaults to ON" ${DEFAULT_CONF})
+
+option(ENABLE_SLEEF "Use SLEEF for kernel library, mutually exclusive with ENABLE_VECMATHLIB" ${DEFAULT_SLEEF})
+
+option(ENABLE_VECMATHLIB "Use vecmathlib for kernel library, mutually exclusive with ENABLE_SLEEF" ${DEFAULT_VML})
+
+if((NOT CLANGXX_WORKS) AND ENABLE_VECMATHLIB)
+  message(WARNING "Disabling vecmathlib because clang++ doesn't seem to work!")
+  set(ENABLE_VECMATHLIB OFF)
+  set(ENABLE_VECMATHLIB OFF CACHE BOOL)
+  set(ENABLE_SLEEF ON)
+  set(ENABLE_SLEEF ON CACHE BOOL)
+endif()
 
 # vecmathlib does not compile with fp16 currently
-if(USE_VECMATHLIB AND (NOT CL_DISABLE_HALF))
+if(ENABLE_VECMATHLIB AND (NOT CL_DISABLE_HALF))
   message(STATUS "Half available, but disabling half support since vecmathlib is enabled.")
   set(CL_DISABLE_HALF 1)
   set(CL_DISABLE_HALF 1 CACHE BOOL "Disable cl_khr_fp16 because fp16 is not supported")
@@ -341,12 +466,44 @@ endif()
 
 ######################################################################################
 
+option(USE_VECMATHLIB_BUILTINS_ONLY "Use only __builtin_* functions in the kernel library." OFF)
+
+# for kernel code, disable PIC & stack protector
+#
+# it seems PIC and stack-protector defaults somehow depend on
+# clang build type or environment. PIC causes problems with
+# constant addrspace variables, and stack protector likely slows
+# down the kernels, so it needs to be determined whether it's worth
+# the trouble.
+set(DEFAULT_KERNEL_CL_FLAGS  "-x cl -fno-stack-protector -fno-PIC")
+set(DEFAULT_KERNEL_C_FLAGS "-xc -D__CBUILD__ -fno-stack-protector -fno-PIC")
+set(DEFAULT_KERNEL_CXX_FLAGS "-xc++ -std=c++11 -fno-stack-protector -fno-PIC")
+
+if(ENABLE_VECMATHLIB)
+  set(DEFAULT_KERNEL_CXX_FLAGS "${DEFAULT_KERNEL_CXX_FLAGS} -DVML_NO_IOSTREAM ${CLANGXX_STDLIB} ")
+  if(USE_VECMATHLIB_BUILTINS_ONLY)
+    set(DEFAULT_KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS} -DPOCL_VECMATHLIB_BUILTIN ")
+    set(DEFAULT_KERNEL_CXX_FLAGS "${DEFAULT_KERNEL_CXX_FLAGS} -DPOCL_VECMATHLIB_BUILTIN ")
+  endif()
+endif()
+
+set(EXTRA_KERNEL_FLAGS "" CACHE STRING "Extra arguments to all kernel compilation commands (defaults to empty)")
+set(EXTRA_KERNEL_CL_FLAGS "" CACHE STRING "Extra arguments to kernel CL compiler (defaults to empty)")
+set(EXTRA_KERNEL_CXX_FLAGS "" CACHE STRING "Extra arguments to kernel CXX compiler (defaults to empty)")
+set(EXTRA_KERNEL_C_FLAGS "" CACHE STRING "Extra arguments to kernel C compiler (defaults to empty)")
+
+set(KERNEL_CXX_FLAGS "${DEFAULT_KERNEL_CXX_FLAGS}${EXTRA_KERNEL_FLAGS}${EXTRA_KERNEL_CXX_FLAGS}")
+set(KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS}${EXTRA_KERNEL_FLAGS}${EXTRA_KERNEL_CL_FLAGS}")
+set(KERNEL_C_FLAGS "${DEFAULT_KERNEL_C_FLAGS}${EXTRA_KERNEL_FLAGS}${EXTRA_KERNEL_C_FLAGS}")
+
+######################################################################################
+
 if(UNIX)
   if(APPLE)
     # MacOS ld outputs useless warnings like
     # ld: warning: -macosx_version_min not specificed, assuming 10.7
     # suppress them with -w.
-    set(DEFAULT_HOST_LD_FLAGS "-Wl,-dylib -lm")
+    set(DEFAULT_HOST_LD_FLAGS "-dylib -w -lm")
   elseif(ANDROID_COMPILER)
     set(DEFAULT_HOST_LD_FLAGS "-L/system/lib/ -shared -ldl -lc -lm /system/lib/crtbegin_so.o /system/lib/crtend_so.o")
   else()
@@ -359,29 +516,6 @@ endif()
 
 ######################################################################################
 
-option(USE_VECMATHLIB_BUILTINS_ONLY "Use only __builtin_* functions in the kernel library." OFF)
-
-set(DEFAULT_KERNEL_CL_FLAGS "-Xclang -cl-std=CL2.0 -D__OPENCL_C_VERSION__=200")
-if(USE_VECMATHLIB)
-  set(DEFAULT_KERNEL_CLANGXX_FLAGS "-DVML_NO_IOSTREAM ${CLANGXX_STDLIB}")
-  if(USE_VECMATHLIB_BUILTINS_ONLY)
-    set(DEFAULT_KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS} -DPOCL_VECMATHLIB_BUILTIN")
-    set(DEFAULT_KERNEL_CLANGXX_FLAGS "${DEFAULT_KERNEL_CLANGXX_FLAGS} -DPOCL_VECMATHLIB_BUILTIN")
-  endif()
-endif()
-
-set(EXTRA_KERNEL_CL_FLAGS "" CACHE STRING "Extra arguments to kernel CL compiler (defaults to empty)")
-set(EXTRA_KERNEL_CXX_FLAGS "" CACHE STRING "Extra arguments to kernel CXX compiler (defaults to empty)")
-
-set(KERNEL_CLANGXX_FLAGS "${DEFAULT_KERNEL_CLANGXX_FLAGS} ${EXTRA_KERNEL_CXX_FLAGS}")
-set(KERNEL_CL_FLAGS "${DEFAULT_KERNEL_CL_FLAGS} ${EXTRA_KERNEL_CL_FLAGS}")
-
-message(STATUS "Clang++ flags for compiling kernel library: ${KERNEL_CLANGXX_FLAGS}")
-message(STATUS "OpenCL flags for compiling kernel library: ${KERNEL_CL_FLAGS}")
-
-
-######################################################################################
-
 if (OCS_AVAILABLE)
 
   option(SINGLE_LLVM_LIB "When on, tries to link pocl to the single big libLLVM before falling back to LLVM_LIBFILES)." ON)
@@ -403,6 +537,7 @@ if (OCS_AVAILABLE)
     else()
       message(STATUS "single big libLLVM library not found (Probably because LLVM is built with cmake). Falling back to linking libpocl to LLVM_LIBFILES")
       set(POCL_LLVM_LIBS ${LLVM_LIBFILES})
+      set(SINGLE_LLVM_LIB OFF CACHE BOOL "single big libLLVM")
     endif()
   endif()
 
@@ -437,20 +572,14 @@ endif()
 
 ######################################################################################
 
-if(MSVC)
-  find_package( PthreadsWin32 )
-  if(NOT Pthreads_FOUND)
-    message(FATAL_ERROR "Could not find pthreads-win32 libs!")
-  endif()
-  include_directories("${Pthreads_INCLUDE_DIRS}")
+set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
+set(THREADS_PREFER_PTHREAD_FLAG TRUE)
+find_package(Threads REQUIRED)
+
+if(CMAKE_VERSION VERSION_GREATER "3.0.99")
+  set(PTHREAD_LIBRARY Threads::Threads)
 else()
-  include(FindThreads)
-  if(Threads_FOUND)
-    set(PTHREAD_LDFLAGS ${CMAKE_THREAD_LIBS_INIT})
-    set(PTHREAD_CFLAGS "")
-  else()
-    message(FATAL_ERROR "Could not find threading library for this system")
-  endif()
+  set(PTHREAD_LIBRARY ${CMAKE_THREAD_LIBS_INIT})
 endif()
 
 ######################################################################################
@@ -516,6 +645,7 @@ else()
       if(OPENCL_LIBRARIES)
         set(CMAKE_REQUIRED_LIBRARIES "${OPENCL_LIBRARIES}")
         include(CheckFunctionExists)
+        unset (OPENCL_FOUND CACHE)
         CHECK_FUNCTION_EXISTS("clEnqueueFillImage" OPENCL_FOUND)
       endif()
     endif()
@@ -610,7 +740,12 @@ endif()
 set(DEFAULT_HOST_CLANG_FLAGS "${CLANG_TARGET_OPTION}${LLC_TRIPLE}")
 set(DEFAULT_HOST_LLC_FLAGS "-relocation-model=pic -mtriple=${LLC_TRIPLE}")
 
-if(LLC_TRIPLE MATCHES "^arm")
+if(ARM AND (NOT LLVM_OLDER_THAN_4_0))
+  #ARMs need to enable FP64 manually with 4.0
+  option(ENABLE_FP64 "Enable FP64" ON)
+endif()
+
+if(ARM32 OR (LLC_TRIPLE MATCHES "^arm"))
   if(LLC_TRIPLE MATCHES "gnueabihf")
     # hardfloat
     set(DEFAULT_HOST_LLC_FLAGS "${DEFAULT_HOST_LLC_FLAGS} -float-abi=hard")
@@ -623,11 +758,6 @@ if(LLC_TRIPLE MATCHES "^arm")
     set(DEFAULT_HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} -mfloat-abi=soft")
     set(DEFAULT_HOST_AS_FLAGS "${DEFAULT_HOST_AS_FLAGS} -mfloat-abi=soft")
   endif()
-  #This is very much a Q&D solution. We assume every modern ARM out there
-  #has the vfp4 floating point unit.
-  #This used to be implicitly assumed in clang < 3.9. With 3.9, we have to pass
-  #this flag, or intrinsics in vecmathlib are not defined.
-  set(DEFAULT_HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} -mfpu=vfp4")
 endif()
 
 if(CL_DISABLE_LONG)
@@ -637,23 +767,57 @@ if(CL_DISABLE_HALF)
   set(DEFAULT_HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} -D_CL_DISABLE_HALF")
 endif()
 
+set(HOST_DEVICE_CL_VERSION "120")
+set(HOST_DEVICE_CL_STD "1.2")
+
 # define it here, b/c we'll need these both at runtime and buildtime
-set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_spir")
+if(X86 OR ARM)
+  set(HOST_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_3d_image_writes")
+else()
+  # set some conservative defaults
+  set(HOST_DEVICE_EXTENSIONS "cl_khr_global_int32_base_atomics cl_khr_local_int32_base_atomics cl_khr_3d_image_writes")
+endif()
+
+if((HOST_DEVICE_CL_VERSION GREATER 199) AND (CLANG_SPIR))
+  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_spir")
+endif()
+
 if(NOT CL_DISABLE_HALF)
   set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_fp16")
 endif()
+
 if(NOT CL_DISABLE_LONG)
-  set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
+  # must not be defined in HOST_DEVICE_EXTENSIONS list, because
+  # this extension doesn't exist in official extension list
+  set(HOST_DEVICE_EXTENSION_DEFINES "-Dcl_khr_int64")
+
+  # fp64 requires int64
+  if(X86)
+    set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
+  endif()
+  if(ENABLE_FP64 AND (NOT LLVM_OLDER_THAN_4_0))
+    # 32bit arm doesnt always uspport doubles
+    set(HOST_DEVICE_EXTENSIONS "${HOST_DEVICE_EXTENSIONS} cl_khr_fp64")
+  endif()
 endif()
 
 set(TEMP_EXT "${HOST_DEVICE_EXTENSIONS}")
-set(HOST_DEVICE_EXTENSION_DEFINES "")
 separate_arguments(TEMP_EXT)
+set(TEMP_CLEXT "-Xclang -cl-ext=-all,")
 foreach(EXT ${TEMP_EXT})
   set(HOST_DEVICE_EXTENSION_DEFINES "${HOST_DEVICE_EXTENSION_DEFINES} -D${EXT}")
+  set(TEMP_CLEXT "${TEMP_CLEXT}+${EXT},")
 endforeach()
 
-set(HOST_DEVICE_CL_VERSION "200")
+if (NOT LLVM_OLDER_THAN_4_0)
+  set(HOST_DEVICE_EXTENSION_DEFINES "${HOST_DEVICE_EXTENSION_DEFINES} ${TEMP_CLEXT}")
+endif()
+
+if (HOST_DEVICE_EXTENSION_DEFINES MATCHES "cl_khr_fp64")
+  set(_CL_DISABLE_DOUBLE 0)
+else()
+  set(_CL_DISABLE_DOUBLE 1)
+endif()
 
 if(NOT DEFINED KERNELLIB_HOST_CPU_VARIANTS)
   set(KERNELLIB_HOST_CPU_VARIANTS "native")
@@ -663,7 +827,10 @@ endif()
 set(KERNELLIB_HOST_DISTRO_VARIANTS 0)
 if(KERNELLIB_HOST_CPU_VARIANTS STREQUAL "distro")
   if(X86_64 OR I386)
-    set(KERNELLIB_HOST_CPU_VARIANTS sse2 ssse3 sse41 avx avx_fma4 avx2 avx512)
+    set(KERNELLIB_HOST_CPU_VARIANTS sse2 ssse3 sse41 avx avx_f16c avx_fma4 avx2)
+    if(NOT LLVM_OLDER_THAN_3_9)
+      list(APPEND KERNELLIB_HOST_CPU_VARIANTS avx512)
+    endif()
   else()
     message(FATAL_ERROR "Don't know what CPU variants to use for kernel library on this platform.")
   endif()
@@ -681,6 +848,10 @@ set(EXTRA_HOST_LLC_FLAGS "" CACHE STRING "Extra parameters to llc for code gener
 
 set(HOST_AS_FLAGS "${DEFAULT_HOST_AS_FLAGS} ${EXTRA_HOST_AS_FLAGS}")
 set(HOST_LD_FLAGS "${DEFAULT_HOST_LD_FLAGS} ${EXTRA_HOST_LD_FLAGS}" )
+string(STRIP "${HOST_LD_FLAGS}" HOST_LD_FLAGS_STRIPPED)
+string(REGEX REPLACE "[\r\n\t ]+" "\", \"" HOST_LD_FLAGS_ARRAY "${HOST_LD_FLAGS_STRIPPED}")
+# string(REPLACE "###, ###" " oo \", \" oo " HOST_LD_FLAGS_ARRAY "${HOST_LD_FLAGS_ARRAY_1}")
+
 set(HOST_CLANG_FLAGS "${DEFAULT_HOST_CLANG_FLAGS} ${EXTRA_HOST_CLANG_FLAGS}")
 set(HOST_LLC_FLAGS "${DEFAULT_HOST_LLC_FLAGS} ${EXTRA_HOST_LLC_FLAGS}")
 
@@ -788,6 +959,7 @@ if(ENABLE_TCE)
   endforeach()
 
   set(TCE_DEVICE_CL_VERSION "120")
+  set(TCE_DEVICE_CL_STD "1.2")
 
   if("${LLVM_CXXFLAGS}" MATCHES "-fno-rtti")
     message(WARNING "TCE is enabled but your LLVM was not built with RTTI. You should rebuild LLVM with 'make REQUIRES_RTTI=1'. See the INSTALL file for more information.")
@@ -812,13 +984,34 @@ if(ENABLE_HSA)
   set(OCL_TARGETS "${OCL_TARGETS} hsail64")
   # this is for config.h
 
-  set(HSA_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_int64 cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
+  set(HSA_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
   set(HSA_DEVICE_CL_VERSION "200")
+  set(HSA_DEVICE_CL_STD "2.0")
   find_path(HAVE_HSA_EXT_AMD_H "hsa_ext_amd.h" HINTS "${HSA_INCLUDEDIR}" ENV PATH)
 endif()
 
 ##########################################################
 
+if(ENABLE_CUDA)
+
+  # Require LLVM 4.0 or newer
+  if ("${LLVM_MAJOR}" STRLESS "4")
+    message(FATAL_ERROR "The CUDA backend requires LLVM 4.0 or newer")
+  endif()
+
+  set(OCL_DRIVERS "${OCL_DRIVERS} cuda")
+  set(OCL_TARGETS "${OCL_TARGETS} cuda")
+  # this is for config.h
+  # TODO unify with autotools
+  set(BUILD_CUDA 1)
+
+  set(CUDA_DEVICE_EXTENSIONS "cl_khr_byte_addressable_store cl_khr_global_int32_base_atomics cl_khr_global_int32_extended_atomics cl_khr_local_int32_base_atomics cl_khr_local_int32_extended_atomics cl_khr_fp64 cl_khr_int64_base_atomics cl_khr_int64_extended_atomics")
+  set(CUDA_DEVICE_CL_VERSION "120")
+  set(CUDA_DEVICE_CL_STD "1.2")
+endif()
+
+##########################################################
+
 message(STATUS "Building the following device drivers: ${OCL_DRIVERS}")
 
 set(BUILDDIR "${CMAKE_BINARY_DIR}")
@@ -916,10 +1109,12 @@ endif()
 # 6:0:5 == 0.12 (currently backwards compatible with 0.7, thus age = 5).
 # 7:0:6 == 0.13 (currently backwards compatible with 0.7, thus age = 6).
 # 8:0:7 == 0.14 (currently backwards compatible with 0.7, thus age = 7).
+# pocl 1.0 bumped the API version:
+# 2:0:0 == 1.0 (the libpocl.so will be named libpocl.so.2.0.X )
 
-set(LIB_CURRENT_VERSION 8)
+set(LIB_CURRENT_VERSION 2)
 set(LIB_REVISION_VERSION 0)
-set(LIB_AGE_VERSION 7)
+set(LIB_AGE_VERSION 0)
 
 math(EXPR LIB_FIRST_VERSION "${LIB_CURRENT_VERSION} - ${LIB_AGE_VERSION}")
 
@@ -934,7 +1129,8 @@ set(LIB_API_VERSION "${LIB_FIRST_VERSION}")
 # drastically. Let's try to follow the similar 'current' numbering as
 # the pocl host API library and perhaps tune the 'revision' and 'age' later.
 
-set(KERNEL_COMPILER_LIB_VERSION "${LIB_CURRENT_VERSION}.0.0")
+math(EXPR KER_LIB_CURRENT_VERSION "${LIB_CURRENT_VERSION} + 7")
+set(KERNEL_COMPILER_LIB_VERSION "${KER_LIB_CURRENT_VERSION}.0.0")
 
 ##########################################################
 
@@ -950,16 +1146,39 @@ set(PACKAGE_VERSION "${POCL_VERSION}")
 
 configure_file("config.h.in.cmake" "config.h.new" ESCAPE_QUOTES)
 rename_if_different("${CMAKE_BINARY_DIR}/config.h.new" "${CMAKE_BINARY_DIR}/config.h")
+
+configure_file("config2.h.in.cmake" "config2.h.new")
+rename_if_different("${CMAKE_BINARY_DIR}/config2.h.new" "${CMAKE_BINARY_DIR}/config2.h")
+
 include_directories("${CMAKE_BINARY_DIR}")
 
-# autotools compat
-file(WRITE "${CMAKE_BINARY_DIR}/install-paths.h.new" "#define PKGDATADIR \"${POCL_INSTALL_PRIVATE_DATADIR}\"")
-rename_if_different("${CMAKE_BINARY_DIR}/install-paths.h.new" "${CMAKE_BINARY_DIR}/install-paths.h")
+# This is used to generate the compiler feature detection header.
+# Currently it's not enabled because it requires CMake > 3.x and
+# also the autogenerated header needs some editing by hand
+# (it errors on all compilers except gcc > 4 and clang > 3)
+#
+#
+#include(WriteCompilerDetectionHeader)
+#write_compiler_detection_header(
+#  FILE "${CMAKE_BINARY_DIR}/compiler_features.h"
+#  PREFIX POCL
+#  COMPILERS GNU Clang
+#  FEATURES
+#    c_function_prototypes
+#    c_restrict
+#    c_static_assert
+#    c_variadic_macros
+#)
 
 ##########################################################
 
 if(ENABLE_ICD)
-  file(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/pocl.icd" CONTENT "${POCL_INSTALL_PUBLIC_LIBDIR}/$<TARGET_FILE_NAME:pocl>" CONDITION 1)
+  if(POCL_ICD_ABSOLUTE_PATH)
+    set(CONTENT "${POCL_INSTALL_PUBLIC_LIBDIR}/$<TARGET_FILE_NAME:pocl>")
+  else()
+    set(CONTENT "$<TARGET_FILE_NAME:pocl>")
+  endif()
+  file(GENERATE OUTPUT "${CMAKE_BINARY_DIR}/pocl.icd" CONTENT "${CONTENT}" CONDITION 1)
   install(FILES "${CMAKE_BINARY_DIR}/pocl.icd"
          DESTINATION "${POCL_INSTALL_ICD_VENDORDIR}")
 
@@ -994,18 +1213,23 @@ message(STATUS "OPENCL_LIBS: ${OPENCL_LIBS}")
 message(STATUS "OPENCL_CFLAGS: ${OPENCL_CFLAGS}")
 
 # for tests / examples
-set(POCLU_LINK_OPTIONS ${LIBMATH} ${OPENCL_LIBS} "poclu")
+set(POCLU_LINK_OPTIONS ${OPENCL_LIBS} ${LIBMATH} "poclu")
 message(STATUS "POCLU LINK OPTS: ${POCLU_LINK_OPTIONS}")
 
 # poclcc bin
 add_subdirectory("bin")
 
+include(add_test_pocl)
+
 if (OCS_AVAILABLE)
   add_subdirectory("tests")
   add_subdirectory("examples")
 endif()
 
-add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} ${COMMAND_USES_TERMINAL})
+# make check & make check_tier1
+
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} -j ${CORECOUNT} ${COMMAND_USES_TERMINAL})
+add_custom_target(check_tier1 COMMAND ${CMAKE_CTEST_COMMAND} -L "'internal|amdsdk_30|piglit|PyOpenCL|conformance_suite_micro'" -j ${CORECOUNT} ${COMMAND_USES_TERMINAL})
 
 ##########################################################
 
@@ -1063,18 +1287,27 @@ if (ENABLE_HSA)
   MESSAGE(STATUS "HSAIL_ASM: ${HSAIL_ASM}")
 endif()
 MESSAGE(STATUS "")
-MESSAGE(STATUS "ICD_LD_FLAGS: ${ICD_LD_FLAGS}")
-MESSAGE(STATUS "KERNEL_CLANGXX_FLAGS: ${KERNEL_CLANGXX_FLAGS}")
-MESSAGE(STATUS "KERNEL_CL_FLAGS: ${KERNEL_CL_FLAGS}")
-MESSAGE(STATUS "PTHREAD_LDFLAGS: ${PTHREAD_LDFLAGS}")
-MESSAGE(STATUS "PTHREAD_CFLAGS: ${PTHREAD_CFLAGS}")
 MESSAGE(STATUS "LIB_API_VERSION: ${LIB_API_VERSION}")
 MESSAGE(STATUS "LIB_BUILD_VERSION: ${LIB_BUILD_VERSION}")
+MESSAGE(STATUS "ICD_LD_FLAGS: ${ICD_LD_FLAGS}")
+
+MESSAGE(STATUS "EXTRA_KERNEL_FLAGS: ${EXTRA_KERNEL_FLAGS}")
+MESSAGE(STATUS "EXTRA_KERNEL_CXX_FLAGS: ${EXTRA_KERNEL_CXX_FLAGS}")
+MESSAGE(STATUS "EXTRA_KERNEL_CL_FLAGS: ${EXTRA_KERNEL_CL_FLAGS}")
+MESSAGE(STATUS "EXTRA_KERNEL_C_FLAGS: ${EXTRA_KERNEL_C_FLAGS}")
+
+MESSAGE(STATUS "final KERNEL_CXX_FLAGS: ${KERNEL_CXX_FLAGS}")
+MESSAGE(STATUS "final KERNEL_CL_FLAGS: ${KERNEL_CL_FLAGS}")
+MESSAGE(STATUS "final KERNEL_C_FLAGS: ${KERNEL_C_FLAGS}")
+
 if (OCS_AVAILABLE)
   MESSAGE(STATUS "")
   MESSAGE(STATUS "LLVM_VERSION: ${LLVM_VERSION}")
-  MESSAGE(STATUS "LLVM_ASSERTS_BUILD ${LLVM_ASSERTS_BUILD}")
-  MESSAGE(STATUS "LLVM_BUILD_MODE ${LLVM_BUILD_MODE}")
+  MESSAGE(STATUS "LLVM_LIB_IS_SHARED: ${LLVM_LIB_IS_SHARED}")
+  MESSAGE(STATUS "LLVM_HAS_RTTI: ${LLVM_HAS_RTTI}")
+  MESSAGE(STATUS "LLVM_LIB_MODE: ${LLVM_LIB_MODE}")
+  MESSAGE(STATUS "LLVM_ASSERTS_BUILD: ${LLVM_ASSERTS_BUILD}")
+  MESSAGE(STATUS "LLVM_BUILD_MODE: ${LLVM_BUILD_MODE}")
   MESSAGE(STATUS "LLVM_CFLAGS: ${LLVM_CFLAGS}")
   MESSAGE(STATUS "LLVM_CXXFLAGS: ${LLVM_CXXFLAGS}")
   MESSAGE(STATUS "LLVM_CPPFLAGS: ${LLVM_CPPFLAGS}")
@@ -1122,22 +1355,34 @@ MESSAGE(STATUS "******* Enabled features:")
 MESSAGE(STATUS " ")
 
 MESSAGE(STATUS "CLANG_SPIR: ${CLANG_SPIR}")
+MESSAGE(STATUS "DEVELOPER_MODE: ${DEVELOPER_MODE}")
+MESSAGE(STATUS "ENABLE_CONFORMANCE: ${ENABLE_CONFORMANCE}")
 MESSAGE(STATUS "ENABLE_ICD: ${ENABLE_ICD}")
 MESSAGE(STATUS "ENABLE_TCE: ${ENABLE_TCE}")
 MESSAGE(STATUS "ENABLE_TCEMC: ${ENABLE_TCEMC}")
 MESSAGE(STATUS "ENABLE_HSA: ${ENABLE_HSA}")
+MESSAGE(STATUS "ENABLE_CUDA: ${ENABLE_CUDA}")
 MESSAGE(STATUS "ENABLE_ASAN (address sanitizer): ${ENABLE_ASAN}")
+MESSAGE(STATUS "ENABLE_LSAN (leak sanitizer): ${ENABLE_LSAN}")
+MESSAGE(STATUS "ENABLE_TSAN (thread sanitizer): ${ENABLE_TSAN}")
+MESSAGE(STATUS "ENABLE_UBSAN (UB sanitizer): ${ENABLE_UBSAN}")
+MESSAGE(STATUS "ENABLE_VECMATHLIB: ${ENABLE_VECMATHLIB}")
+MESSAGE(STATUS "ENABLE_SLEEF: ${ENABLE_SLEEF}")
+MESSAGE(STATUS "ENABLE_POCL_BUILDING: ${ENABLE_POCL_BUILDING}")
 MESSAGE(STATUS "INSTALL_OPENCL_HEADERS (Install our headers): ${INSTALL_OPENCL_HEADERS}")
 MESSAGE(STATUS "OCL_DRIVERS (Drivers built): ${OCL_DRIVERS}")
 MESSAGE(STATUS "OCL_TARGETS (Targets built): ${OCL_TARGETS}")
 MESSAGE(STATUS "OCS_AVAILABLE: ${OCS_AVAILABLE}")
+MESSAGE(STATUS "POCL_ICD_ABSOLUTE_PATH: ${POCL_ICD_ABSOLUTE_PATH}")
 MESSAGE(STATUS "SINGLE_LLVM_LIB: ${SINGLE_LLVM_LIB}")
 MESSAGE(STATUS "TESTS_USE_ICD: ${TESTS_USE_ICD}")
-MESSAGE(STATUS "USE_VECMATHLIB: ${USE_VECMATHLIB}")
 MESSAGE(STATUS "Available testsuites: ${ALL_TESTSUITES}")
 MESSAGE(STATUS "Enabled testsuites: ${ACTUALLY_ENABLED_TESTSUITES}")
 MESSAGE(STATUS "Disabled testsuites: ${DISABLED_TESTSUITES}")
+MESSAGE(STATUS "Testsuites are built from git master: ${EXAMPLES_USE_GIT_MASTER}")
 MESSAGE(STATUS "Kernel caching: ${KERNEL_CACHE_DEFAULT}")
 MESSAGE(STATUS "Kernel library CPU variants: ${KERNELLIB_HOST_CPU_VARIANTS}")
 MESSAGE(STATUS "Kernel library distro build: ${KERNELLIB_HOST_DISTRO_VARIANTS}")
 MESSAGE(STATUS "Use fake address space IDs: ${POCL_USE_FAKE_ADDR_SPACE_IDS}")
+MESSAGE(STATUS "Use pocl custom memory allocator: ${USE_POCL_MEMMANAGER}")
+MESSAGE(STATUS "L1d cacheline size: ${HOST_CPU_CACHELINE_SIZE}")
diff --git a/CREDITS b/CREDITS
index b18b722..8db22bb 100644
--- a/CREDITS
+++ b/CREDITS
@@ -50,4 +50,12 @@ Matthias Noack <ma.noack.pr at gmail.com>
 Sam McKelvie <sammck at gmail.com>
 Tom Gall <tom_gall at mac.com>
 Arda Coskunses <ardacoskunses at yahoo.com>
-
+Minh Quan HO <mqho at kalray.eu>
+Matt Wala <wala1 at illinois.edu>
+Jonas Hahnfeld <Hahnfeld at itc.rwth-aachen.de>
+Ronan Keryell <ronan.keryell at xilinx.com>
+Rodrigo Tobar <rtobar at icrar.org>
+Martin Krastev <blu.dark at gmail.com>
+Tom Stellard <tstellar at redhat.com>
+Nick Curtis <nicholas.curtis at uconn.edu>
+Konstantin Bakanov <kostik_b at zoho.com>
diff --git a/README.ARM b/README.ARM
index 36d8301..0ba2afc 100644
--- a/README.ARM
+++ b/README.ARM
@@ -1,15 +1,20 @@
-pocl works (as of 2012-11-21) quite well on a Panda board with
-Ubuntu.
-
-Current known issues:
-
-- You might need to install libhwloc from the sources if
-  the Ubuntu in Panda doesn't ship a new enough one.
-  http://www.open-mpi.org/projects/hwloc/
-- ABI/ISA compatibility issue (soft float or not). This is
-  producible in the example1 which takes float4 buffers as args.
-  example2, which uses scalar buffers works ok.
-- On Ubuntu 12.04, clang 3.2 and 3.3 cannot compile a simple 
-  C program due to wrong internal paths. This leads to all tests
-  failing. Description on how to fix this issue is here:
-  http://sourceforge.net/mailarchive/forum.php?thread_name=20130723124515.7995bb36%40fx8&forum_name=pocl-devel
+pocl builds (as of Aug 2017) on ODROID XU3 and ODROID C2
+but some tests fail.
+
+How to build:
+
+* get a clang / llvm. DO NOT use the ones downloaded from llvm.org, they only work
+  on the distro where they were compiled. Ubuntu LTS these days ships multiple llvm
+  versions even quite recent ones; get the clang+llvm from your distro's packages.
+
+* read the pocl build instructions in docs
+
+* LLVM will likely not recognize your cpu, and running cmake will give you a warning.
+  run cmake with -DLLC_HOST_CPU=<yourcpu>. "yourcpu" must be something LLVM recognizes,
+  usually it's simply "cortex-aXX" like cortex-a15 etc. You can get the full list by
+  running `llc -mcpu=help`.
+
+* example for building pocl on Ubuntu 16.04 + ARM:
+
+  apt install ocl-icd-libopencl1 ocl-icd-opencl-dev cmake libltdl-dev libhwloc-dev pkg-config
+  build-essential llvm-4.0-dev llvm-4.0 clang-4.0 libclang-4.0-dev
diff --git a/README.packaging b/README.packaging
index de3fce5..8362786 100644
--- a/README.packaging
+++ b/README.packaging
@@ -2,22 +2,28 @@ This file contains notes for making distribution packages of pocl.
 
 ICD
 ---
-Pocl should probably be built with ICD enabled for desktop
-distributions. Pocl does not have an ICD loader, so a dependancy
-on one would be beneficial.
+Pocl should probably be built with ICD enabled (``-DENABLE_ICD=ON`` CMake
+option) for desktop distributions. Pocl does not have an ICD loader,
+so a dependancy on one would be beneficial.
 
-The pocl.icd file (which the ICD loader uses to load the pocl lib)
-has a full path to the installed libpocl.so file. When building
-for a multiarch, remove the "@libdir@/" from pocl.icd.in before
-building. This way the system library loader can pick up the correct
-architecture library.
+CMake options for a distribution build
+--------------------------------------
+- ``-DKERNELLIB_HOST_CPU_VARIANTS=distro``
+  Note: this note only works for x86(-64) platform currently,
+  on other platforms, it has zero effect.
+  Enables runtime detection of CPU and builds separate
+  kernel libraries for most common x86 CPUs.
 
+- ``-DPOCL_ICD_ABSOLUTE_PATH=OFF``
+  The pocl.icd file (which the ICD loader uses to load the pocl lib)
+  by default has a full path to the installed libpocl.so file.
+  Set this option to OFF and pocl will only put the dynamic library
+  name into pocl.icd.
 
-Target(host) CPU
-----------------
-If not overridden, pocl uses LLVM (llc) to detect the host CPU, which
-is used to determine processor ISA extensions (like SSE and AVX).
-See: http://portablecl.org/docs/html/env_variables.html
+- ``-DENABLE_POCL_BUILDING=OFF``
+  When OFF, POCL_BUILDING option (which causes pocl to look for required
+  files in build / source directories) will be ignored
+  and pocl will always look in installed paths only.
 
 Mesa (OpenGL) interoperability
 ------------------------------
diff --git a/TODO b/TODO
index ce5f266..59e18c7 100644
--- a/TODO
+++ b/TODO
@@ -4,14 +4,9 @@ Version roadmap
 High priority (1.0 blockers):
     * make NVIDIA OpenCL SDK examples to work 
     * make Intel OpenCL SDK examples to work
-    * fix issues when calling kernels with struct or vector 
-      value parameters: https://github.com/pocl/pocl/issues/1
 
 Medium priority:
-    * complete the kernel runtime library.   
-    * complete the host runtime library.
     * device supporting AMD GPU cards.
-    * Check all the function pointers in the ICD dispatch struct.
 
 Known ambiguous OpenCL 1.2 features
 -----------------------------------
@@ -27,92 +22,25 @@ within a context that only holds their parent device, or not. This
 might even depend on whether the context was created "from type"
 or not.
 
-The experimental implementation in pocl currently assumes that
-sub-devices are to be treated independently from their parent
-device. This means, for example, that sub-devices cannot be used
-in a context that does not contain them (but contains their parent
-device). Note that this is different from the AMD behavior (which
-is tested in the DeviceFission AMD APP SDK example), but follows
-e.g. Intel's behavior. Clarification from the standard body is
-needed on which behavior is correct.
-
-There is room for optimizations in the current implementation,
-particularly for what concerns the program build system, since
-sub-devices share the bitcode with their parent device and
-building could be done only once. Such an optimization will
-actually become necessary if the other behavior (sub-devices as
-slaves of their parent device) is ever implemented in the future.
+The implementation of subdevices in pocl currently converts
+subdevices to their parents in most places, with the exception
+being clEnqueueNDRangeKernel. This means, for example, that
+sub-devices can be used in a context that does not contain
+them (but contains their parent device). Note this is equivalent
+to the AMD behavior (which is tested in the DeviceFission AMD APP
+SDK example), but differs from e.g. Intel's behavior. Clarification
+from the standard body is needed on which behavior is correct.
 
 Known missing OpenCL 1.2 features
 ---------------------------------
 
 Missing APIs used by the tested OpenCL example suites are
-entered here. This is not a complete list of unimplemented
-APIs in pocl, but one that has been updated whenever 
-missing APIs have been encountered in the test cases.
-
-(*) == Used by the opencl-book-samples. 
-(R) == Used by the Rodinia benchmark suite.
-(P) == Used by pyopencl
-(B) == Used by the Parboil benchmarks
-
-  4. THE OPENCL PLATFORM LAYER
-  
-* 4.1 Querying platform info (properly)
-* 4.3 Partitioning device
-* 4.4 Contexts
-  
-  5. THE OPENCL RUNTIME
-
-* 5.1 Command queues
-* 5.2.1 Creating buffer objects
-* 5.2.4 Mapping buffer objects
-* 5.3 Image objects
-* 5.3.3 Reading, Writing and Copying Image Objects
-* 5.4 Querying, Umapping, Migrating, ... Mem objects
-* 5.4.1 Retaining and Releasing Memory Objects
-* 5.4.2 Unmapping Mapped Memory Objects
-* 5.5 Sampler objects
-* 5.5.1 Creating Sampler Objects
-* 5.6.1 Creating Program Objects
-* 5.7.1 Creating Kernel Objects
-* 5.9 Event objects
-  * clWaitForEvents (*)
-* 5.10 Markers, Barriers and Waiting for Events
-  * clEnqueueMarker (deprecated in OpenCL 1.2) (*, B)
-* 5.12 Profiling 
-
-  6. THE OPENCL C PROGRAMMING LANGUAGE
-
-* 6.12.11 Atomic functions
-  * cl_khr_local_int32_base_atomics (Chapter_14/histogram)
-
-* 6.12.14.2 Built-in Image Read Functions
-  * read_imagef (R[particlefilter])
-  * read_imageui (B[sad])
+entered here.
 
   OpenCL 1.2 Extensions
 
 * 9.7 Sharing Memory Objects with OpenGL / OpenGL
   ES Buffer, Texture and Renderbuffer Objects
+
 * 9.7.6 Sharing memory objects that map to GL objects 
   between GL and CL contexts
-  * clEnqueueAcquireGLObjects (*)
-
-  Miscellaneous
-
-Other
------
-* configure should check for 'clang'
-* build system should use $(CXX) everywhere,
-  now some parts assume g++ and it fails if 
-  only c++ is installed
-
-Optimization opportunities
---------------------------
-* Even when using an in-order queue, schedule kernels
-  in parallel in case their input buffers are not depending
-  on the unfinished ones (should be legal per OpenCL 1.2 5.11).
-
-  
-  
diff --git a/TODO.piglit b/TODO.piglit
deleted file mode 100644
index 984292c..0000000
--- a/TODO.piglit
+++ /dev/null
@@ -1,29 +0,0 @@
-Problems found by Piglit testing framework
-==========================================
-
-by: Victor Oliveira
-
-http://people.freedesktop.org/~nh/piglit/
-
-API
----
-
-- clCreateKernel with non-existent kernel name
-- build options support is incomplete
-- clang accepts invalid opencl version in command-line
-- unimplemented parts in clReleaseProgram
-- clGetProgramBuildInfo returns an empty string
-- clSetKernelArg various error checks
-
-Execute
--------
-
-- increment and decrement vector types (e.g. int4)
-- implicit conversion between vector types (e.g. int4 -> float4)
-
-Unimplemented
--------------
-
-- clGetProgramInfo
-- clCreateKernelsInProgram
-
diff --git a/android/CLONE_POCL_PREBUILTS_HERE b/android/CLONE_POCL_PREBUILTS_HERE
deleted file mode 100644
index 17973fc..0000000
--- a/android/CLONE_POCL_PREBUILTS_HERE
+++ /dev/null
@@ -1 +0,0 @@
-clone pocl-android-prebuilts from https://github.com/krrishnarraj/pocl-android-prebuilts
diff --git a/android/androideabi.cmake b/android/androideabi.cmake
deleted file mode 100644
index b0efa41..0000000
--- a/android/androideabi.cmake
+++ /dev/null
@@ -1,10 +0,0 @@
-SET(CMAKE_SYSTEM_NAME Linux)
-SET(CMAKE_SYSTEM_VERSION 1)
-SET(CMAKE_SYSTEM_PROCESSOR arm)
-
-SET(CMAKE_C_COMPILER   arm-linux-androideabi-gcc)
-SET(CMAKE_CXX_COMPILER arm-linux-androideabi-g++)
-
-SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
-SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
-SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
diff --git a/android/build-arm.sh b/android/build-arm.sh
deleted file mode 100755
index a09cc45..0000000
--- a/android/build-arm.sh
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/bash
-#
-# Build script for Android
-#
-#   Copyright (c) 2014 Krishnaraj R Bhat (krrishnarraj at gmail.com)
-#
-#   Permission is hereby granted, free of charge, to any person obtaining a copy
-#   of this software and associated documentation files (the "Software"), to deal
-#   in the Software without restriction, including without limitation the rights
-#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-#   copies of the Software, and to permit persons to whom the Software is
-#   furnished to do so, subject to the following conditions:
-#
-#   The above copyright notice and this permission notice shall be included in
-#   all copies or substantial portions of the Software.
-#
-#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-#   THE SOFTWARE.
-#
-# Usage: build-arm.sh [release]
-# default - builds debug version for quick testing
-# release - builds release version with flto options. Much Much slower
-
-PWD=`pwd`
-I_AM=`id -un`
-MY_GROUP=`id -gn`
-ANDROID_TOOLCHAIN=/tmp/android-toolchain/
-
-echo "NDK standalone toolchain setup..."
-if [ ! -e $ANDROID_NDK/build/tools/make-standalone-toolchain.sh ]; then
-    echo "Install Android NDK and set environment variable ANDROID_NDK to its root"
-    return
-fi
-$ANDROID_NDK/build/tools/make-standalone-toolchain.sh \
-				--toolchain=arm-linux-androideabi-4.9 \
-				--arch=arm \
-				--platform=android-16 \
-				--install-dir=$ANDROID_TOOLCHAIN
-
-INSTALL_PREFIX=/data/data/org.pocl.libs/files/
-# Create directories for PREFIX, target location in android
-if [ ! -e $INSTALL_PREFIX ]; then
-    sudo mkdir -p $INSTALL_PREFIX
-    sudo mkdir -p $INSTALL_PREFIX/lib/pkgconfig/
-    sudo chown -R $I_AM:$MY_GROUP $INSTALL_PREFIX
-    sudo chmod 755 -R $INSTALL_PREFIX
-fi
-
-# Prebuilt llvm that runson(android) -> target(android)
-LLVM_HOST_ANDROID_TARGET_ANDROID=$PWD/pocl-android-prebuilts/arm/llvm/android
-if [ ! -e $LLVM_HOST_ANDROID_TARGET_ANDROID/lib/libclangFrontend.a  ]; then
-    echo "Build and place llvm(android) at " $LLVM_HOST_ANDROID_TARGET_ANDROID
-    return
-fi
-
-if [ ! -e $ANDROID_TOOLCHAIN/sysroot/usr/lib/libclangFrontend.a  ]; then
-echo "Copying llvm libs(android) to sysroot..."
-cp -rf $LLVM_HOST_ANDROID_TARGET_ANDROID/* $ANDROID_TOOLCHAIN/sysroot/usr/
-fi
-
-# Prebuilt llvm that runon(x64) -> target(android)
-LLVM_HOST_x64_TARGET_ANDROID=$PWD/pocl-android-prebuilts/arm/llvm/cross_compiler_for_android
-if [ ! -e $LLVM_HOST_x64_TARGET_ANDROID/bin/clang ]; then
-    echo "Build and place llvm runson(x64) -> target(android) at " $LLVM_HOST_x64_TARGET_ANDROID
-    return
-fi
-
-if [ ! -e $ANDROID_TOOLCHAIN/sysroot/usr/bin/clang ]; then
-echo "copying llvm(host) to sysroot...."
-cp -rf $LLVM_HOST_x64_TARGET_ANDROID/* $ANDROID_TOOLCHAIN/sysroot/usr/
-fi
-
-PREBUILT_NCURSES=$PWD/pocl-android-prebuilts/arm/ncurses
-if [ ! -e $PREBUILT_NCURSES/lib/libncurses.a ]; then
-    echo "Build and place ncurses for android at " $PREBUILT_NCURSES
-    return
-fi
-echo "copying ncurses to sysroot...."
-cp -rf $PREBUILT_NCURSES/* $ANDROID_TOOLCHAIN/sysroot/usr/
-ln -sf $ANDROID_TOOLCHAIN/sysroot/usr/lib/libncurses.a $ANDROID_TOOLCHAIN/sysroot/usr/lib/libcurses.a
-
-
-PREBUILT_LTDL=$PWD/pocl-android-prebuilts/arm/libtool
-if [ ! -e $PREBUILT_LTDL/lib/libltdl.a ]; then
-    echo "Build and place libltdl for android at " $PREBUILT_LTDL
-    return
-fi
-echo "copying ltdl to sysroot...."
-cp -rf $PREBUILT_LTDL/* $ANDROID_TOOLCHAIN/sysroot/usr/
-
-PREBUILT_HWLOC=$PWD/pocl-android-prebuilts/arm/hwloc
-if [ ! -e $PREBUILT_HWLOC/lib/libhwloc.a ]; then
-    echo "Build and place libhwloc for android at " $PREBUILT_HWLOC
-    return
-fi
-echo "copying hwloc to sysroot...."
-cp -rf $PREBUILT_HWLOC/* $ANDROID_TOOLCHAIN/sysroot/usr/
-
-PREBUILT_BINUTILS=$PWD/pocl-android-prebuilts/arm/binutils
-if [ ! -e $PREBUILT_BINUTILS/bin/ld ]; then
-    echo "Build and place binutils for android at " $PREBUILT_BINUTILS
-    return
-fi
-echo "copying ld to "$INSTALL_PREFIX
-cp -rf $PREBUILT_BINUTILS/* $INSTALL_PREFIX/
-
-ln -sf $ANDROID_TOOLCHAIN/sysroot/usr/lib/libc.so $ANDROID_TOOLCHAIN/sysroot/usr/lib/libpthread.so
-ln -sf $ANDROID_TOOLCHAIN/sysroot/usr/lib/libc.so $ANDROID_TOOLCHAIN/sysroot/usr/lib/librt.so
-ln -sf $ANDROID_TOOLCHAIN/sysroot/usr/include/GLES $ANDROID_TOOLCHAIN/sysroot/usr/include/GL
-rm $ANDROID_TOOLCHAIN/sysroot/usr/lib/libstdc++.*
-
-export PATH=$ANDROID_TOOLCHAIN/bin:$ANDROID_TOOLCHAIN/sysroot/usr/bin/:$PATH
-export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ANDROID_TOOLCHAIN/sysroot/usr/lib/
-export HOST=arm-linux-androideabi
-export PREFIX=$INSTALL_PREFIX
-export SYSROOT=$ANDROID_TOOLCHAIN/sysroot/usr/
-export TARGET_CPU="cortex-a9"
-
-# flto option in gcc 4.8 eats all memory & eventually /tmp. Better to place tmp file in disk
-export TMPDIR=$HOME/tmp/junk/
-if [ ! -e $TMPDIR ]; then
-    mkdir -p $TMPDIR
-fi
-
-
-#if [ ! -e $PWD/../configure ]; then
-#    cd ..; ./autogen.sh; cd -
-#fi
-
-DEBUG_BUILD=1
-if [ $# -gt 0 ]  && [ $1 = "release" ] ; then
-    DEBUG_BUILD=0
-fi
-
-if [ $DEBUG_BUILD == 1 ] ; then
-#CC="arm-linux-androideabi-gcc  -static-libstdc++ " CXX="arm-linux-androideabi-g++  -static-libstdc++ "  ac_cv_c_bigendian=no LLC_HOST_CPU=$TARGET_CPU HWLOC_CFLAGS="-I"$ANDROID_TOOLCHAIN"/sysroot/usr/include" HWLOC_LIBS="-L"$ANDROID_TOOLCHAIN"/sysroot/usr/lib -lhwloc" CFLAGS=" -Os " CPPFLAGS=" -Os " LDFLAGS=" "  SYSROOTDIR=$ANDROID_TOOLCHAIN/sysroot/ ../configure --prefix=$PREFIX --host=$HOST --disable-icd --with-sysroot=$ANDROID_TOOLCHAIN/sysroot/ --enable-debug --verbose
-
-cmake -DCMAKE_TOOLCHAIN_FILE=androideabi.cmake -DCMAKE_BUILD_TYPE:STRING=Debug -DCMAKE_AR:FILEPATH=$HOST-gcc-ar -DCMAKE_RANLIB:FILEPATH=$HOST-gcc-ranlib -DCMAKE_CXX_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -fno-lto" -DCMAKE_C_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -fno-lto" -DCMAKE_EXE_LINKER_FLAGS:STRING='-fno-lto -fuse-linker-plugin -Wl,--gc-sections' -DCMAKE_MODULE_LINKER_FLAGS:STRING='-fno-lto -fuse-linker-plugin -Wl,--gc-sections'  -DCMAKE_SHARED_LINKER_F [...]
-make -j4
-
-else
-#ac_cv_c_bigendian=no LLC_HOST_CPU=$TARGET_CPU HWLOC_CFLAGS="-I"$ANDROID_TOOLCHAIN"/sysroot/usr/include" HWLOC_LIBS="-L"$ANDROID_TOOLCHAIN"/sysroot/usr/lib -lhwloc" CFLAGS=" -ffunction-sections -fdata-sections -Os -flto " CPPFLAGS=" -ffunction-sections -fdata-sections -Os -flto " LDFLAGS=" -Wl,--gc-sections -flto " SYSROOTDIR=$ANDROID_TOOLCHAIN/sysroot/ ../configure --prefix=$PREFIX --host=$HOST --disable-icd --with-sysroot=$ANDROID_TOOLCHAIN/sysroot/
-
-cmake -DCMAKE_TOOLCHAIN_FILE=androideabi.cmake -DCMAKE_BUILD_TYPE:STRING=Release -DCMAKE_AR:FILEPATH=$HOST-gcc-ar -DCMAKE_RANLIB:FILEPATH=$HOST-gcc-ranlib -DCMAKE_CXX_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -flto" -DCMAKE_C_FLAGS:STRING="-Os -ffunction-sections -fdata-sections -flto" -DCMAKE_EXE_LINKER_FLAGS:STRING='-flto -fuse-linker-plugin -Wl,--gc-sections' -DCMAKE_MODULE_LINKER_FLAGS:STRING='-flto -fuse-linker-plugin -Wl,--gc-sections'  -DCMAKE_SHARED_LINKER_FLAGS:STRIN [...]
-make
-
-fi
-
-make install
-
-# Copy license files to install folder
-cp -f $ANDROID_TOOLCHAIN/sysroot/usr/share/LICENSE* $INSTALL_PREFIX/share/
-cp -f ../LICENSE $INSTALL_PREFIX/share/LICENSE.pocl
-
-echo -e "\n\nBuild completed...\nBuilt files are at "$PREFIX"\n"
-
diff --git a/cmake/LLVM.cmake b/cmake/LLVM.cmake
index 2b4d95d..7a92bc9 100644
--- a/cmake/LLVM.cmake
+++ b/cmake/LLVM.cmake
@@ -36,9 +36,11 @@ else()
   # search for any version
   find_program(LLVM_CONFIG
     NAMES "llvm-config"
+      "llvm-config-mp-5.0" "llvm-config-5.0" "llvm-config50"
+      "llvm-config-mp-4.0" "llvm-config-4.0" "llvm-config40"
+      "llvm-config-mp-3.9" "llvm-config-3.9" "llvm-config39"
       "llvm-config-mp-3.8" "llvm-config-3.8" "llvm-config38"
       "llvm-config-mp-3.7" "llvm-config-3.7" "llvm-config37"
-      "llvm-config-mp-3.9" "llvm-config-3.9" "llvm-config39"
     DOC "llvm-config executable")
 endif()
 
@@ -158,26 +160,52 @@ if(LLVM_VERSION MATCHES "3[.]([0-9]+)")
   if(LLVM_MINOR STREQUAL "6")
     set(LLVM_3_6 1)
     set(LLVM_OLDER_THAN_3_9 1)
-    set(LLVM_OLDER_THAN_4_0 1)
   elseif(LLVM_MINOR STREQUAL "7")
     set(LLVM_3_7 1)
     set(LLVM_OLDER_THAN_3_9 1)
-    set(LLVM_OLDER_THAN_4_0 1)
   elseif(LLVM_MINOR STREQUAL "8")
     set(LLVM_3_8 1)
     set(LLVM_OLDER_THAN_3_9 1)
-    set(LLVM_OLDER_THAN_4_0 1)
   elseif(LLVM_MINOR STREQUAL "9")
     set(LLVM_3_9 1)
-    set(LLVM_OLDER_THAN_4_0 1)
   else()
     message(FATAL_ERROR "Unknown/unsupported llvm version: 3.${LLVM_MINOR}")
   endif()
+  set(LLVM_OLDER_THAN_4_0 1)
+  set(LLVM_OLDER_THAN_5_0 1)
 elseif(LLVM_VERSION MATCHES "4[.]0")
     set(LLVM_MAJOR 4)
     set(LLVM_4_0 1)
+    set(LLVM_OLDER_THAN_5_0 1)
+elseif(LLVM_VERSION MATCHES "5[.]0")
+    set(LLVM_MAJOR 5)
+    set(LLVM_5_0 1)
 else()
-  message(FATAL_ERROR "LLVM version 3.7+ required, found: ${LLVM_VERSION}")
+  message(FATAL_ERROR "LLVM version between 3.7 and 5.0 required, found: ${LLVM_VERSION}")
+endif()
+
+#############################################################
+
+if(NOT LLVM_OLDER_THAN_4_0)
+
+  run_llvm_config(LLVM_HAS_RTTI --has-rtti)
+
+  run_llvm_config(LLVM_LIB_IS_SHARED --shared-mode)
+
+  if(LLVM_LIB_IS_SHARED MATCHES "shared")
+    set(LLVM_LIB_MODE --link-shared)
+  else()
+    set(LLVM_LIB_MODE --link-static)
+  endif()
+
+  unset(LLVM_LIBS)
+  run_llvm_config(LLVM_LIBS --libs ${LLVM_LIB_MODE})
+  # Convert LLVM_LIBS from string -> list format to make handling them easier
+  separate_arguments(LLVM_LIBS)
+
+  run_llvm_config(LLVM_SYSLIBS --system-libs ${LLVM_LIB_MODE})
+  string(STRIP "${LLVM_SYSLIBS}" LLVM_SYSLIBS)
+
 endif()
 
 ####################################################################
@@ -565,6 +593,10 @@ endif()
 
 set_cache_var(LLC_TRIPLE "LLC_TRIPLE")
 
+# FIXME: The cpu name printed by llc --version is the same cpu that will be
+# targeted if ypu pass -mcpu=native to llc, so we could replace this auto-detection
+# with just: set(LLC_HOST_CPU "native"), however, we can't do this at the moment
+# because of the work-around for arm1176jz-s.
 if(NOT DEFINED LLC_HOST_CPU AND NOT CMAKE_CROSSCOMPILING)
   message(STATUS "Find out LLC host CPU with ${LLVM_LLC}")
   execute_process(COMMAND ${LLVM_LLC} "--version" RESULT_VARIABLE RES_VAR OUTPUT_VARIABLE OUTPUT_VAR)
@@ -587,7 +619,7 @@ if(NOT DEFINED LLC_HOST_CPU AND NOT CMAKE_CROSSCOMPILING)
 endif()
 
 if(LLC_HOST_CPU MATCHES "unknown")
-  message(FATAL_ERROR "LLVM could not recognize your CPU model automatically. Please rerun cmake with -DLLC_HOST_CPU=<model> (to see a list of models, try: llc -mcpu help)")
+  message(FATAL_ERROR "LLVM could not recognize your CPU model automatically. Please run CMake with -DLLC_HOST_CPU=<cpu> (you can find valid names with: llc -mcpu=help)")
 endif()
 
 set(LLC_HOST_CPU "${LLC_HOST_CPU}" CACHE STRING "The Host CPU to use with llc")
@@ -620,7 +652,7 @@ if(NOT DEFINED ${CACHE_VAR_NAME})
        if( !module )
          exit(1);
        else
-         module->dump();
+         printf(\"DataLayout = %s\\n\", module->getDataLayoutStr().c_str());
 
        return 0;
     }")
@@ -695,34 +727,47 @@ if(LLVM_OLDER_THAN_3_9)
     message(FATAL_ERROR "Detected a Clang <3.9 patched with the SPIR address space arg metadata. Unsupported mode. ")
   endif()
 else()
-  if(NOT AS_CHECK_RESULT MATCHES "!1 = !{i32 1, i32 2, i32 3}")
+  if(NOT AS_CHECK_RESULT MATCHES "= !{i32 1, i32 2, i32 3}")
     set(POCL_USE_FAKE_ADDR_SPACE_IDS 1)
   else()
     set(POCL_USE_FAKE_ADDR_SPACE_IDS 0)
   endif()
 endif()
 
-######################################################################################
-# Test for presence of Clang calling convention patch from
-# https://github.com/pocl/pocl/issues/1
+if(LLVM_OLDER_THAN_5_0)
+  set(CLANG_RESOURCE_DIR "${LLVM_LIBDIR}/clang/${LLVM_VERSION_FULL}")
+else()
+  execute_process(COMMAND "${CLANG}" "--print-resource-dir" OUTPUT_VARIABLE CLANG_RESOURCE_DIR)
+  string(STRIP "${CLANG_RESOURCE_DIR}" CLANG_RESOURCE_DIR)
+endif()
+
+if(LLVM_OLDER_THAN_5_0)
+
+  ######################################################################################
+  # Test for presence of Clang calling convention patch from
+  # https://github.com/pocl/pocl/issues/1
 
-execute_process(
-  COMMAND
+  execute_process(
+    COMMAND
     "${CLANG}" "-S" "-xcl" "-emit-llvm" "${CMAKE_SOURCE_DIR}/cmake/spir-cc-test-kernel.cl" "-o" "-"
     OUTPUT_VARIABLE SPIR_PATCH_TEST_IR
     ERROR_VARIABLE _DUMMY
     RESULT_VARIABLE SPIR_CC_RES)
 
-if(SPIR_CC_RES)
-  message(FATAL_ERROR "Clang exited with non-zero status when trying to compile calling convention test")
-endif()
+  if(SPIR_CC_RES)
+    message(FATAL_ERROR "Clang exited with non-zero status when trying to compile calling convention test")
+  endif()
 
-string(FIND "${SPIR_PATCH_TEST_IR}" "spir_kernel" SPIR_CC_RES)
-if("${SPIR_CC_RES}" MATCHES "-1")
-  set(CLANG_IS_PATCHED_FOR_SPIR_CC 0)
-  message(STATUS "Clang is NOT patched for SPIR CC")
+  string(FIND "${SPIR_PATCH_TEST_IR}" "spir_kernel" SPIR_CC_RES)
+  if("${SPIR_CC_RES}" MATCHES "-1")
+    set(CLANG_IS_PATCHED_FOR_SPIR_CC 0)
+    message(STATUS "Clang is NOT patched for SPIR CC")
+  else()
+    set(CLANG_IS_PATCHED_FOR_SPIR_CC 1)
+    set(POCL_KCACHE_SALT "${POCL_KCACHE_SALT}-spirccpatch")
+    message(STATUS "Clang IS patched for SPIR CC")
+  endif()
 else()
   set(CLANG_IS_PATCHED_FOR_SPIR_CC 1)
-  set(POCL_KCACHE_SALT "${POCL_KCACHE_SALT}-spirccpatch")
-  message(STATUS "Clang IS patched for SPIR CC")
+  message(STATUS "Clang 5.0+ use SPIR CC by default")
 endif()
diff --git a/tests/CMakeLists.txt b/cmake/add_test_pocl.cmake
similarity index 78%
copy from tests/CMakeLists.txt
copy to cmake/add_test_pocl.cmake
index b7d490e..f2a3f38 100644
--- a/tests/CMakeLists.txt
+++ b/cmake/add_test_pocl.cmake
@@ -1,7 +1,7 @@
 #=============================================================================
-#   CMake build system files
+#   CMake build system files - add_test_pocl() test wrapper
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2014-2017 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,12 +23,6 @@
 #
 #=============================================================================
 
-#function(add_test_custom RUN_CMD TEST_NAME RESULT_FILE)
-#  foreach(LOOPVAR ${ARGN})
-#    set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
-#  endforeach()
-#endfunction()
-
 include(CMakeParseArguments)
 
 # This is a wrapper around add_test
@@ -45,9 +39,6 @@ function(add_test_pocl)
   cmake_parse_arguments(POCL_TEST "${options}" "${oneValueArgs}"
                         "${multiValueArgs}" ${ARGN})
 
-  #message(STATUS "POCL_TEST_NAME: ${POCL_TEST_NAME}")
-  #message(STATUS "POCL_TEST_COMMAND: ${POCL_TEST_COMMAND}")
-
   unset(RUN_CMD)
   foreach(LOOPVAR ${POCL_TEST_COMMAND})
     if(NOT RUN_CMD)
@@ -79,22 +70,3 @@ function(add_test_pocl)
                        FAIL_REGULAR_EXPRESSION "FAIL")
 
 endfunction()
-
-
-add_test("pocl_version_check" "runtime/test_version")
-set_tests_properties("pocl_version_check"
-  PROPERTIES
-  ENVIRONMENT "POCL_DEVICES=basic"
-  PASS_REGULAR_EXPRESSION "basic"
-  LABELS "internal")
-
-#######################################################################
-
-add_subdirectory("kernel")
-add_subdirectory("regression")
-add_subdirectory("runtime")
-add_subdirectory("workgroup")
-if(ENABLE_TCE)
-  add_subdirectory("tce")
-endif()
-
diff --git a/cmake/bitcode_rules.cmake b/cmake/bitcode_rules.cmake
index a46bd3b..dce9076 100644
--- a/cmake/bitcode_rules.cmake
+++ b/cmake/bitcode_rules.cmake
@@ -25,11 +25,10 @@
 
 # cmake version of lib/kernel/rules.mk
 
+separate_arguments(KERNEL_C_FLAGS)
 separate_arguments(KERNEL_CL_FLAGS)
-separate_arguments(KERNEL_CLANGXX_FLAGS)
+separate_arguments(KERNEL_CXX_FLAGS)
 
-#/usr/bin/clang --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -D__OPENCL_VERSION__=120 -DPOCL_VECMATHLIB_BUILTIN -D__CBUILD__ -o get_local_id.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/get_local_id.c -include ${CMAKE_SOURCE_DIR}/include/_kernel_c.h
-#	  @CLANG@ ${CLANG_FLAGS} ${KERNEL_CL_FLAGS} -D__CBUILD__ -c -o $@ -include ${abs_top_srcdir}/include/_kernel_c.h $< 
 function(compile_c_to_bc FILENAME SUBDIR BC_FILE_LIST)
     get_filename_component(FNAME "${FILENAME}" NAME)
     set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
@@ -40,56 +39,108 @@ function(compile_c_to_bc FILENAME SUBDIR BC_FILE_LIST)
         DEPENDS "${FULL_F_PATH}"
         "${CMAKE_SOURCE_DIR}/include/pocl_types.h"
         "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
-        ${KERNEL_DEPEND_HEADERS}
+        ${VML_KERNEL_DEPEND_HEADERS}
         COMMAND "${CLANG}" ${CLANG_FLAGS} ${DEVICE_CL_FLAGS}
-        "-xc" "-D__CBUILD__" "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
+        ${KERNEL_C_FLAGS} "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
         "-include" "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
         COMMENT "Building C to LLVM bitcode ${BC_FILE}"
         VERBATIM)
 endfunction()
 
-# /usr/bin/clang++ --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -DVML_NO_IOSTREAM -DPOCL_VECMATHLIB_BUILTIN -o trunc.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib-pocl/trunc.cc
-# 	@CLANGXX@ ${CLANG_FLAGS} ${KERNEL_CLANGXX_FLAGS} -c -o $@ $<
 function(compile_cc_to_bc FILENAME SUBDIR BC_FILE_LIST)
     get_filename_component(FNAME "${FILENAME}" NAME)
     set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
-    #MESSAGE(STATUS "BC_FILE: ${BC_FILE}")
-
     add_custom_command(OUTPUT "${BC_FILE}"
         DEPENDS "${FULL_F_PATH}"
-          ${KERNEL_DEPEND_HEADERS}
-        COMMAND  "${CLANGXX}" ${CLANG_FLAGS} ${KERNEL_CLANGXX_FLAGS}
-        ${DEVICE_CL_FLAGS} "-std=c++11" "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
+          ${VML_KERNEL_DEPEND_HEADERS}
+        COMMAND  "${CLANGXX}" ${CLANG_FLAGS} ${KERNEL_CXX_FLAGS}
+        ${DEVICE_CL_FLAGS} "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
         COMMENT "Building C++ to LLVM bitcode ${BC_FILE}"
         VERBATIM)
 endfunction()
 
-# /usr/bin/clang --target=x86_64-pc-linux-gnu -march=bdver1 -Xclang -ffake-address-space-map -emit-llvm -ffp-contract=off -x cl -D__OPENCL_VERSION__=120 -DPOCL_VECMATHLIB_BUILTIN -fsigned-char -o atan2pi.bc -c ${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib-pocl/atan2pi.cl -include ${CMAKE_SOURCE_DIR}/include/_kernel.h
-function(compile_cl_to_bc FILENAME SUBDIR BC_FILE_LIST)
+function(compile_cl_to_bc FILENAME SUBDIR BC_FILE_LIST EXTRA_CONFIG)
     get_filename_component(FNAME "${FILENAME}" NAME)
+    get_filename_component(FNAME_WE "${FILENAME}" NAME_WE)
     set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${FNAME}.bc")
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
-    #MESSAGE(STATUS "BC_FILE: ${BC_FILE}")
+    set(DEPENDLIST
+          "${CMAKE_SOURCE_DIR}/include/_kernel.h"
+          "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
+          "${CMAKE_SOURCE_DIR}/include/pocl_types.h")
+    set(INCLUDELIST
+        "-include" "${CMAKE_SOURCE_DIR}/include/_kernel.h"
+        "-include" "${CMAKE_SOURCE_DIR}/include/_enable_all_exts.h")
+
+    if(FILENAME MATCHES "sleef")
+      list(APPEND DEPENDLIST
+          "${EXTRA_CONFIG}"
+          )
+      list(APPEND DEPENDLIST ${SLEEF_CL_KERNEL_DEPEND_HEADERS})
+      list(APPEND INCLUDELIST
+        "-DMAX_PRECISION"
+        "-I" "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/include" # for sleef_cl.h
+        "-include" "${EXTRA_CONFIG}")
+    endif()
+
+    if(FILENAME MATCHES "vecmathlib")
+      list(APPEND DEPENDLIST ${VML_KERNEL_DEPEND_HEADERS})
+    endif()
+
+    if(FILENAME MATCHES "libclc")
+      list(APPEND DEPENDLIST ${LIBCLC_KERNEL_DEPEND_HEADERS})
+
+      set(I32 "${CMAKE_SOURCE_DIR}/lib/kernel/libclc/${FNAME_WE}_fp32.cl")
+      if(EXISTS "${I32}")
+        list(APPEND DEPENDLIST "${I32}")
+      endif()
+
+      set(I64 "${CMAKE_SOURCE_DIR}/lib/kernel/libclc/${FNAME_WE}_fp64.cl")
+      if(EXISTS "${I64}")
+        list(APPEND DEPENDLIST "${I64}")
+      endif()
+
+      list(APPEND INCLUDELIST
+        "-I" "${CMAKE_SOURCE_DIR}/lib/kernel/libclc")
+    endif()
 
     add_custom_command( OUTPUT "${BC_FILE}"
         DEPENDS "${FULL_F_PATH}"
-          "${CMAKE_SOURCE_DIR}/include/_kernel.h"
-          "${CMAKE_SOURCE_DIR}/include/_kernel_c.h"
-          "${CMAKE_SOURCE_DIR}/include/pocl_types.h"
-          ${KERNEL_DEPEND_HEADERS}
-        COMMAND "${CLANG}" ${CLANG_FLAGS} "-x" "cl" ${KERNEL_CL_FLAGS} ${DEVICE_CL_FLAGS}
+          ${DEPENDLIST}
+        COMMAND "${CLANG}" ${CLANG_FLAGS}
+        ${KERNEL_CL_FLAGS} ${DEVICE_CL_FLAGS}
         "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
-        "-include" "${CMAKE_SOURCE_DIR}/include/_kernel.h"
-        "-include" "${CMAKE_SOURCE_DIR}/include/_enable_all_exts.h"
+        ${INCLUDELIST}
         COMMENT "Building CL to LLVM bitcode ${BC_FILE}"
         VERBATIM)
 endfunction()
 
+# ARGN - extra defines / arguments to clang
+# can't use c_to_bc, since SLEEF's C files need to be prefixed with EXT
+# (because the same files are compiled multiple times)
+function(compile_sleef_c_to_bc EXT FILENAME SUBDIR BCLIST)
+    get_filename_component(FNAME "${FILENAME}" NAME)
+    set(BC_FILE "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}/${EXT}_${FNAME}.bc")
+    list(APPEND ${BCLIST} "${BC_FILE}")
+    set(${BCLIST} ${${BCLIST}} PARENT_SCOPE)
+    set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
+
+    add_custom_command( OUTPUT "${BC_FILE}"
+        DEPENDS "${FULL_F_PATH}"
+        ${SLEEF_C_KERNEL_DEPEND_HEADERS}
+        COMMAND "${CLANG}" ${CLANG_FLAGS} ${KERNEL_C_FLAGS} ${ARGN}
+        "-I" "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/arch"
+        "-I" "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/libm"
+        "-I" "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/include"
+        "-O1" "-o" "${BC_FILE}" "-c" "${FULL_F_PATH}"
+        COMMENT "Building SLEEF to LLVM bitcode ${BC_FILE}"
+        VERBATIM)
+endfunction()
 
 
 function(compile_ll_to_bc FILENAME SUBDIR BC_FILE_LIST)
@@ -98,23 +149,23 @@ function(compile_ll_to_bc FILENAME SUBDIR BC_FILE_LIST)
     set(${BC_FILE_LIST} ${${BC_FILE_LIST}} ${BC_FILE} PARENT_SCOPE)
     set(FULL_F_PATH "${CMAKE_SOURCE_DIR}/lib/kernel/${FILENAME}")
 
-
     add_custom_command( OUTPUT "${BC_FILE}"
         DEPENDS ""
-        COMMAND "${LLVM_AS}" "-o" "${BC_FILE}" "${CMAKE_CURRENT_SOURCE_DIR}/../${FILENAME}"
+        COMMAND "${LLVM_AS}" "-o" "${BC_FILE}"
+                "${CMAKE_CURRENT_SOURCE_DIR}/../${FILENAME}"
         COMMENT "Building LL to LLVM bitcode ${BC_FILE}" 
         VERBATIM)
 endfunction()
 
 
-macro(compile_to_bc SUBDIR OUTPUT_FILE_LIST)
+macro(compile_to_bc SUBDIR OUTPUT_FILE_LIST EXTRA_CONFIG)
   foreach(FILENAME ${ARGN})
   if(FILENAME MATCHES "[.]c$")
     compile_c_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   elseif(FILENAME MATCHES "[.]cc$")
     compile_cc_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   elseif(FILENAME MATCHES "[.]cl$")
-    compile_cl_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
+    compile_cl_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST} "${EXTRA_CONFIG}")
   elseif(FILENAME MATCHES "[.]ll$")
     compile_ll_to_bc("${FILENAME}" "${SUBDIR}" ${OUTPUT_FILE_LIST})
   else()
@@ -125,30 +176,39 @@ endmacro()
 
 
 
-function(make_kernel_bc OUTPUT_VAR NAME SUBDIR)
+function(make_kernel_bc OUTPUT_VAR NAME SUBDIR USE_SLEEF EXTRA_BC EXTRA_CONFIG)
   set(KERNEL_BC "${CMAKE_CURRENT_BINARY_DIR}/kernel-${NAME}.bc")
   set(${OUTPUT_VAR} "${KERNEL_BC}" PARENT_SCOPE)
 
   file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${SUBDIR}")
-  compile_to_bc("${SUBDIR}" BC_LIST ${ARGN})
+  compile_to_bc("${SUBDIR}" BC_LIST "${EXTRA_CONFIG}" ${ARGN})
 
+  set(DEPENDLIST ${BC_LIST})
   # fix too long commandline with cat and xargs
   set(BC_LIST_FILE_TXT "")
   foreach(FILENAME ${BC_LIST})
     # straight parsing semicolon separated list with xargs -d didn't work on windows.. no such switch available
     set(BC_LIST_FILE_TXT "${BC_LIST_FILE_TXT} \"${FILENAME}\"")
   endforeach()
+  if(USE_SLEEF)
+    set(BC_LIST_FILE_TXT "${BC_LIST_FILE_TXT} \"${EXTRA_BC}\"")
+    list(APPEND DEPENDLIST ${EXTRA_BC})
+  endif()
   set(BC_LIST_FILE "${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/kernel_${NAME}_linklist.txt")
   file(WRITE "${BC_LIST_FILE}" "${BC_LIST_FILE_TXT}")
 
-  set(OPT_CMD "${LLVM_OPT}" ${LLC_FLAGS} "-O3" "-fp-contract=off" "-o" "${KERNEL_BC}" "kernel-${NAME}-unoptimized.bc")
+  # don't waste time optimizing the kernels IR when in developer mode
+  if(DEVELOPER_MODE)
+    set(LINK_OPT_COMMAND COMMAND "${XARGS_EXEC}" "${LLVM_LINK}" "-o" "${KERNEL_BC}" < "${BC_LIST_FILE}")
+  else()
+    set(LINK_CMD COMMAND "${XARGS_EXEC}" "${LLVM_LINK}" "-o" "kernel-${NAME}-unoptimized.bc" < "${BC_LIST_FILE}")
+    set(OPT_CMD COMMAND "${LLVM_OPT}" ${LLC_FLAGS} "-O3" "-fp-contract=off" "-o" "${KERNEL_BC}" "kernel-${NAME}-unoptimized.bc")
+    set(LINK_OPT_COMMAND ${LINK_CMD} ${OPT_CMD})
+  endif()
 
   add_custom_command( OUTPUT "${KERNEL_BC}"
-# ${KERNEL_BC}: ${OBJ}
-        DEPENDS ${BC_LIST}
-#	    @LLVM_LINK@ $^ -o - | @LLVM_OPT@ ${LLC_FLAGS} ${KERNEL_LIB_OPT_FLAGS} -O3 -fp-contract=off -o $@
-        COMMAND "${XARGS_EXEC}" "${LLVM_LINK}" "-o" "kernel-${NAME}-unoptimized.bc" < "${BC_LIST_FILE}"
-        COMMAND ${OPT_CMD}
+        DEPENDS ${DEPENDLIST}
+        ${LINK_OPT_COMMAND}
         COMMENT "Linking & optimizing Kernel bitcode ${KERNEL_BC}"
         VERBATIM)
 
diff --git a/config.h.in.cmake b/config.h.in.cmake
index cd03447..2def6b5 100644
--- a/config.h.in.cmake
+++ b/config.h.in.cmake
@@ -1,4 +1,3 @@
-
 /* The normal alignment of `double16', in bytes. */
 #define ALIGNOF_DOUBLE16 @ALIGNOF_DOUBLE16@
 
@@ -6,39 +5,32 @@
 #define ALIGNOF_FLOAT16 @ALIGNOF_FLOAT16@
 
 #cmakedefine BUILD_HSA
-
-#define POCL_BUILT_WITH_CMAKE
+#cmakedefine BUILD_CUDA
 
 #define BUILDDIR "@BUILDDIR@"
 
 /* "Build with ICD" */
 #cmakedefine BUILD_ICD
 
-#ifndef LLVM_VERSION
-#define LLVM_VERSION "@LLVM_VERSION_FULL@"
-#endif
-
-#define CLANG "@CLANG@"
+#define HOST_CPU_CACHELINE_SIZE @HOST_CPU_CACHELINE_SIZE@
 
-/* clang++ executable */
-#define CLANGXX "@CLANGXX@"
+#cmakedefine CLANG_IS_PATCHED_FOR_SPIR_CC
 
-#define HSAIL_ASM "@HSAIL_ASM@"
+#define CLANG "@CLANG@"
 
-/* clang++ compiler flags */
-/* TODO in sources */
-#define KERNEL_CLANGXX_FLAGS "@KERNEL_CLANGXX_FLAGS@"
+#define CLANG_RESOURCE_DIR "@CLANG_RESOURCE_DIR@"
 
 /* "Using a SPIR generator Clang from Khronos." */
 #cmakedefine CLANG_SPIR
 
+/* clang++ executable */
+#define CLANGXX "@CLANGXX@"
 
-/* TODO in sources */
-#define KERNEL_CL_FLAGS  "@KERNEL_CL_FLAGS@"
-
+#define FORCED_CLFLAGS  "@FORCED_CLFLAGS@"
 
+#cmakedefine ENABLE_CONFORMANCE
 
-#define FORCED_CLFLAGS  "@FORCED_CLFLAGS@"
+#cmakedefine ENABLE_POCL_BUILDING
 
 #cmakedefine HAVE_FORK
 
@@ -55,7 +47,6 @@
 
 #cmakedefine HAVE_HSA_EXT_AMD_H
 
-
 #define HOST  "@HOST@"
 
 #define HOST_AS_FLAGS  "@HOST_AS_FLAGS@"
@@ -64,7 +55,7 @@
 
 #define HOST_DEVICE_EXTENSIONS "@HOST_DEVICE_EXTENSIONS@"
 
-#define HOST_CPU  "@HOST_CPU@"
+#define HOST_CPU  "@LLC_HOST_CPU@"
 
 #define HOST_LD_FLAGS  "@HOST_LD_FLAGS@"
 
@@ -76,6 +67,9 @@
 
 #define HSA_DEVICE_EXTENSIONS "@HSA_DEVICE_EXTENSIONS@"
 
+#define HSAIL_ASM "@HSAIL_ASM@"
+
+
 #define KERNELLIB_HOST_CPU_VARIANTS "@KERNELLIB_HOST_CPU_VARIANTS@"
 
 #cmakedefine KERNELLIB_HOST_DISTRO_VARIANTS
@@ -97,15 +91,20 @@
 /* "Using LLVM 4.0" */
 #cmakedefine LLVM_4_0
 
-#cmakedefine POCL_USE_FAKE_ADDR_SPACE_IDS
+/* "Using LLVM 5.0" */
+#cmakedefine LLVM_5_0
+
+#cmakedefine LLVM_BUILD_MODE_DEBUG
+
+#ifndef LLVM_VERSION
+#define LLVM_VERSION "@LLVM_VERSION_FULL@"
+#endif
 
 #define LINK_COMMAND "@LINK_COMMAND@"
 
 /* Defined to greatest expected alignment for extended types, in bytes. */
 #define MAX_EXTENDED_ALIGNMENT @MAX_EXTENDED_ALIGNMENT@
 
-
-
 /* used in lib/CL/devices/basic */
 #define OCL_KERNEL_TARGET  "@OCL_KERNEL_TARGET@"
 #define OCL_KERNEL_TARGET_CPU  "@OCL_KERNEL_TARGET_CPU@"
@@ -114,6 +113,7 @@
 
 #define PACKAGE_VERSION "@PACKAGE_VERSION@"
 
+#define POCL_KCACHE_SALT "@POCL_KCACHE_SALT@"
 
 #define POCL_KERNEL_CACHE_DEFAULT @POCL_KERNEL_CACHE_DEFAULT@
 
@@ -125,13 +125,9 @@
 
 #define POCL_INSTALL_PRIVATE_DATADIR "@POCL_INSTALL_PRIVATE_DATADIR@"
 
-/* these are *host* values */
-
-/* The size of `__fp16', as computed by sizeof. */
-#define SIZEOF___FP16  @SIZEOF___FP16@
+#cmakedefine POCL_USE_FAKE_ADDR_SPACE_IDS
 
-#cmakedefine CLANG_IS_PATCHED_FOR_SPIR_CC
-#cmakedefine POCL_KCACHE_SALT "@POCL_KCACHE_SALT@"
+/* these are *host* values */
 
 /* used in tce_common.c & pocl_llvm_api.cc  */
 #define SRCDIR  "@SRCDIR@"
@@ -142,26 +138,31 @@
 
 #define TCE_DEVICE_EXTENSIONS "@TCE_DEVICE_EXTENSIONS@"
 
-/* "Use vecmathlib if available for the target." */
-#cmakedefine USE_VECMATHLIB
-
-
 /* Defined on big endian systems */
 #define WORDS_BIGENDIAN @WORDS_BIGENDIAN@
 
-/* Disable cl_khr_int64 when a clang bug is present */
+/* Disable 64bit ints when a clang bug is present */
 #cmakedefine _CL_DISABLE_LONG
 
 /* Disable cl_khr_fp16 because fp16 is not supported */
 #cmakedefine _CL_DISABLE_HALF
 
-#define POCL_CL_VERSION "2.0"
+/* Disable cl_khr_fp64 because fp64 is not supported */
+#cmakedefine _CL_DISABLE_DOUBLE
+
+#define POCL_CL_VERSION "1.2"
 
 #define HSA_DEVICE_CL_VERSION_MAJOR 2
 #define HSA_DEVICE_CL_VERSION_MINOR 0
 
-#define HOST_DEVICE_CL_VERSION_MAJOR 2
-#define HOST_DEVICE_CL_VERSION_MINOR 0
+#define CUDA_DEVICE_CL_VERSION_MAJOR 1
+#define CUDA_DEVICE_CL_VERSION_MINOR 2
+
+#define HOST_DEVICE_CL_VERSION_MAJOR 1
+#define HOST_DEVICE_CL_VERSION_MINOR 2
 
 #define TCE_DEVICE_CL_VERSION_MAJOR 1
 #define TCE_DEVICE_CL_VERSION_MINOR 2
+
+
+#cmakedefine USE_POCL_MEMMANAGER
diff --git a/config/xclang b/config/xclang
deleted file mode 100755
index a264f9c..0000000
--- a/config/xclang
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/bin/sh
-
-CFLAGS=""
-
-while getopts h:b:o:cSEgO: o
-do
-    case "$o" in
-        h)   host="${OPTARG}";;
-        b)   build="${OPTARG}";;
-        o)   output="-o ${OPTARG}";;
-        c)   mode=-c;;
-        S)   mode=-S;;
-        E)   mode=-E;;
-        O)   CFLAGS="$CFLAGS -O${OPTARG}";;
-        g)   CFLAGS="$CFLAGS -g";;
-        [?]) echo >&2 "Usage: $0 [-h <host>] [-b <build>] [-c | -S | -E] [-o <output_file>] <input_file>" && exit 1;;
-    esac
-done
-shift $((${OPTIND}-1))
-
-# CLANG and llc should be defined and exported in configure script
-# but in case of direct call, do something not too bad
-CLANG="${CLANG:-clang}"
-LLC="${LLC:-llc}"
-
-if [ "x$host" = x ]
-then
-    exec $CLANG $CFLAGS $mode $output $1
-fi
-
-config=`dirname $0`
-build="$build"
-if [ -z "$build" ]
-then
-    build=`$config/config.guess`
-fi
-
-if [ -z $TMPDIR ]
-then
-    TMPDIR=/tmp
-fi
-tmpdir=$TMPDIR/xclang$PPID
-mkdir $tmpdir
-
-if [ "$mode" = -c ]
-then
-    $CLANG $CFLAGS -target $host -c -emit-llvm -o $tmpdir/xclang.bc $@
-    $LLC -mtriple=$build -o $tmpdir/xclang.s $tmpdir/xclang.bc
-    exec $CLANG -target $build -c $output $tmpdir/xclang.s
-fi
-
-if [ "$mode" = -S ]
-then
-    $CLANG $CFLAGS -target $host -c -emit-llvm -o $tmpdir/xclang.bc $@
-    exec $LLC -mtriple=$build $output $tmpdir/xclang.bc
-fi
-
-if [ "$mode" = -E ]
-then
-    exec $CLANG $CFLAGS -E $output $1
-fi
-
-$CLANG $CFLAGS -target $host -c -emit-llvm -o $tmpdir/xclang.bc $@
-
-# Adjust $LLC call for hard-float in case the build is that
-case "$build" in
-arm*gnueabihf)
-   $LLC -mtriple=$build -float-abi=hard -o $tmpdir/xclang.s $tmpdir/xclang.bc;;
-*)
-   $LLC -mtriple=$build -o $tmpdir/xclang.s $tmpdir/xclang.bc;;
-esac
-
-exec $CLANG -target $build $output $tmpdir/xclang.s
diff --git a/config2.h.in.cmake b/config2.h.in.cmake
new file mode 100644
index 0000000..e0f4380
--- /dev/null
+++ b/config2.h.in.cmake
@@ -0,0 +1,5 @@
+/* this config file is for values NOT escaped for C/C++
+ * required e.g. for values with doublequotes, like C string arrays */
+
+#define HOST_LD_FLAGS_ARRAY "@HOST_LD_FLAGS_ARRAY@"
+
diff --git a/depcomp b/depcomp
deleted file mode 100755
index aeba4e8..0000000
--- a/depcomp
+++ /dev/null
@@ -1,632 +0,0 @@
-#! /bin/sh
-# depcomp - compile a program generating dependencies as side-effects
-
-scriptversion=2009-04-28.21; # UTC
-
-# Copyright (C) 1999, 2000, 2003, 2004, 2005, 2006, 2007, 2009 Free
-# Software Foundation, Inc.
-
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 2, or (at your option)
-# any later version.
-
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-
-# You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
-# 02110-1301, USA.
-
-# As a special exception to the GNU General Public License, if you
-# distribute this file as part of a program that contains a
-# configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that program.
-
-# Originally written by Alexandre Oliva <oliva at dcc.unicamp.br>.
-
-case $1 in
-  '')
-     echo "$0: No command.  Try \`$0 --help' for more information." 1>&2
-     exit 1;
-     ;;
-  -h | --h*)
-    cat <<\EOF
-Usage: depcomp [--help] [--version] PROGRAM [ARGS]
-
-Run PROGRAMS ARGS to compile a file, generating dependencies
-as side-effects.
-
-Environment variables:
-  depmode     Dependency tracking mode.
-  source      Source file read by `PROGRAMS ARGS'.
-  object      Object file output by `PROGRAMS ARGS'.
-  DEPDIR      directory where to store dependencies.
-  depfile     Dependency file to output.
-  tmpdepfile  Temporary file to use when outputing dependencies.
-  libtool     Whether libtool is used (yes/no).
-
-Report bugs to <bug-automake at gnu.org>.
-EOF
-    exit $?
-    ;;
-  -v | --v*)
-    echo "depcomp $scriptversion"
-    exit $?
-    ;;
-esac
-
-if test -z "$depmode" || test -z "$source" || test -z "$object"; then
-  echo "depcomp: Variables source, object and depmode must be set" 1>&2
-  exit 1
-fi
-
-# Dependencies for sub/bar.o or sub/bar.obj go into sub/.deps/bar.Po.
-depfile=${depfile-`echo "$object" |
-  sed 's|[^\\/]*$|'${DEPDIR-.deps}'/&|;s|\.\([^.]*\)$|.P\1|;s|Pobj$|Po|'`}
-tmpdepfile=${tmpdepfile-`echo "$depfile" | sed 's/\.\([^.]*\)$/.T\1/'`}
-
-rm -f "$tmpdepfile"
-
-# Some modes work just like other modes, but use different flags.  We
-# parameterize here, but still list the modes in the big case below,
-# to make depend.m4 easier to write.  Note that we *cannot* use a case
-# here, because this file can only contain one case statement.
-if test "$depmode" = hp; then
-  # HP compiler uses -M and no extra arg.
-  gccflag=-M
-  depmode=gcc
-fi
-
-if test "$depmode" = dashXmstdout; then
-   # This is just like dashmstdout with a different argument.
-   dashmflag=-xM
-   depmode=dashmstdout
-fi
-
-cygpath_u="cygpath -u -f -"
-if test "$depmode" = msvcmsys; then
-   # This is just like msvisualcpp but w/o cygpath translation.
-   # Just convert the backslash-escaped backslashes to single forward
-   # slashes to satisfy depend.m4
-   cygpath_u="sed s,\\\\\\\\,/,g"
-   depmode=msvisualcpp
-fi
-
-case "$depmode" in
-gcc3)
-## gcc 3 implements dependency tracking that does exactly what
-## we want.  Yay!  Note: for some reason libtool 1.4 doesn't like
-## it if -MD -MP comes after the -MF stuff.  Hmm.
-## Unfortunately, FreeBSD c89 acceptance of flags depends upon
-## the command line argument order; so add the flags where they
-## appear in depend2.am.  Note that the slowdown incurred here
-## affects only configure: in makefiles, %FASTDEP% shortcuts this.
-  for arg
-  do
-    case $arg in
-    -c) set fnord "$@" -MT "$object" -MD -MP -MF "$tmpdepfile" "$arg" ;;
-    *)  set fnord "$@" "$arg" ;;
-    esac
-    shift # fnord
-    shift # $arg
-  done
-  "$@"
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  mv "$tmpdepfile" "$depfile"
-  ;;
-
-gcc)
-## There are various ways to get dependency output from gcc.  Here's
-## why we pick this rather obscure method:
-## - Don't want to use -MD because we'd like the dependencies to end
-##   up in a subdir.  Having to rename by hand is ugly.
-##   (We might end up doing this anyway to support other compilers.)
-## - The DEPENDENCIES_OUTPUT environment variable makes gcc act like
-##   -MM, not -M (despite what the docs say).
-## - Using -M directly means running the compiler twice (even worse
-##   than renaming).
-  if test -z "$gccflag"; then
-    gccflag=-MD,
-  fi
-  "$@" -Wp,"$gccflag$tmpdepfile"
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  alpha=ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
-## The second -e expression handles DOS-style file names with drive letters.
-  sed -e 's/^[^:]*: / /' \
-      -e 's/^['$alpha']:\/[^:]*: / /' < "$tmpdepfile" >> "$depfile"
-## This next piece of magic avoids the `deleted header file' problem.
-## The problem is that when a header file which appears in a .P file
-## is deleted, the dependency causes make to die (because there is
-## typically no way to rebuild the header).  We avoid this by adding
-## dummy dependencies for each header file.  Too bad gcc doesn't do
-## this for us directly.
-  tr ' ' '
-' < "$tmpdepfile" |
-## Some versions of gcc put a space before the `:'.  On the theory
-## that the space means something, we add a space to the output as
-## well.
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-hp)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-sgi)
-  if test "$libtool" = yes; then
-    "$@" "-Wp,-MDupdate,$tmpdepfile"
-  else
-    "$@" -MDupdate "$tmpdepfile"
-  fi
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-
-  if test -f "$tmpdepfile"; then  # yes, the sourcefile depend on other files
-    echo "$object : \\" > "$depfile"
-
-    # Clip off the initial element (the dependent).  Don't try to be
-    # clever and replace this with sed code, as IRIX sed won't handle
-    # lines with more than a fixed number of characters (4096 in
-    # IRIX 6.2 sed, 8192 in IRIX 6.5).  We also remove comment lines;
-    # the IRIX cc adds comments like `#:fec' to the end of the
-    # dependency line.
-    tr ' ' '
-' < "$tmpdepfile" \
-    | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' | \
-    tr '
-' ' ' >> "$depfile"
-    echo >> "$depfile"
-
-    # The second pass generates a dummy entry for each header file.
-    tr ' ' '
-' < "$tmpdepfile" \
-   | sed -e 's/^.*\.o://' -e 's/#.*$//' -e '/^$/ d' -e 's/$/:/' \
-   >> "$depfile"
-  else
-    # The sourcefile does not contain any dependencies, so just
-    # store a dummy comment line, to avoid errors with the Makefile
-    # "include basename.Plo" scheme.
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile"
-  ;;
-
-aix)
-  # The C for AIX Compiler uses -M and outputs the dependencies
-  # in a .u file.  In older versions, this file always lives in the
-  # current directory.  Also, the AIX compiler puts `$object:' at the
-  # start of each line; $object doesn't have directory information.
-  # Version 6 uses the directory in both cases.
-  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-  test "x$dir" = "x$object" && dir=
-  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
-  if test "$libtool" = yes; then
-    tmpdepfile1=$dir$base.u
-    tmpdepfile2=$base.u
-    tmpdepfile3=$dir.libs/$base.u
-    "$@" -Wc,-M
-  else
-    tmpdepfile1=$dir$base.u
-    tmpdepfile2=$dir$base.u
-    tmpdepfile3=$dir$base.u
-    "$@" -M
-  fi
-  stat=$?
-
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-    exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  if test -f "$tmpdepfile"; then
-    # Each line is of the form `foo.o: dependent.h'.
-    # Do two passes, one to just change these to
-    # `$object: dependent.h' and one to simply `dependent.h:'.
-    sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
-    # That's a tab and a space in the [].
-    sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
-  else
-    # The sourcefile does not contain any dependencies, so just
-    # store a dummy comment line, to avoid errors with the Makefile
-    # "include basename.Plo" scheme.
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile"
-  ;;
-
-icc)
-  # Intel's C compiler understands `-MD -MF file'.  However on
-  #    icc -MD -MF foo.d -c -o sub/foo.o sub/foo.c
-  # ICC 7.0 will fill foo.d with something like
-  #    foo.o: sub/foo.c
-  #    foo.o: sub/foo.h
-  # which is wrong.  We want:
-  #    sub/foo.o: sub/foo.c
-  #    sub/foo.o: sub/foo.h
-  #    sub/foo.c:
-  #    sub/foo.h:
-  # ICC 7.1 will output
-  #    foo.o: sub/foo.c sub/foo.h
-  # and will wrap long lines using \ :
-  #    foo.o: sub/foo.c ... \
-  #     sub/foo.h ... \
-  #     ...
-
-  "$@" -MD -MF "$tmpdepfile"
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-    rm -f "$tmpdepfile"
-    exit $stat
-  fi
-  rm -f "$depfile"
-  # Each line is of the form `foo.o: dependent.h',
-  # or `foo.o: dep1.h dep2.h \', or ` dep3.h dep4.h \'.
-  # Do two passes, one to just change these to
-  # `$object: dependent.h' and one to simply `dependent.h:'.
-  sed "s,^[^:]*:,$object :," < "$tmpdepfile" > "$depfile"
-  # Some versions of the HPUX 10.20 sed can't process this invocation
-  # correctly.  Breaking it into two sed invocations is a workaround.
-  sed 's,^[^:]*: \(.*\)$,\1,;s/^\\$//;/^$/d;/:$/d' < "$tmpdepfile" |
-    sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-hp2)
-  # The "hp" stanza above does not work with aCC (C++) and HP's ia64
-  # compilers, which have integrated preprocessors.  The correct option
-  # to use with these is +Maked; it writes dependencies to a file named
-  # 'foo.d', which lands next to the object file, wherever that
-  # happens to be.
-  # Much of this is similar to the tru64 case; see comments there.
-  dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-  test "x$dir" = "x$object" && dir=
-  base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
-  if test "$libtool" = yes; then
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir.libs/$base.d
-    "$@" -Wc,+Maked
-  else
-    tmpdepfile1=$dir$base.d
-    tmpdepfile2=$dir$base.d
-    "$@" +Maked
-  fi
-  stat=$?
-  if test $stat -eq 0; then :
-  else
-     rm -f "$tmpdepfile1" "$tmpdepfile2"
-     exit $stat
-  fi
-
-  for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2"
-  do
-    test -f "$tmpdepfile" && break
-  done
-  if test -f "$tmpdepfile"; then
-    sed -e "s,^.*\.[a-z]*:,$object:," "$tmpdepfile" > "$depfile"
-    # Add `dependent.h:' lines.
-    sed -ne '2,${
-	       s/^ *//
-	       s/ \\*$//
-	       s/$/:/
-	       p
-	     }' "$tmpdepfile" >> "$depfile"
-  else
-    echo "#dummy" > "$depfile"
-  fi
-  rm -f "$tmpdepfile" "$tmpdepfile2"
-  ;;
-
-tru64)
-   # The Tru64 compiler uses -MD to generate dependencies as a side
-   # effect.  `cc -MD -o foo.o ...' puts the dependencies into `foo.o.d'.
-   # At least on Alpha/Redhat 6.1, Compaq CCC V6.2-504 seems to put
-   # dependencies in `foo.d' instead, so we check for that too.
-   # Subdirectories are respected.
-   dir=`echo "$object" | sed -e 's|/[^/]*$|/|'`
-   test "x$dir" = "x$object" && dir=
-   base=`echo "$object" | sed -e 's|^.*/||' -e 's/\.o$//' -e 's/\.lo$//'`
-
-   if test "$libtool" = yes; then
-      # With Tru64 cc, shared objects can also be used to make a
-      # static library.  This mechanism is used in libtool 1.4 series to
-      # handle both shared and static libraries in a single compilation.
-      # With libtool 1.4, dependencies were output in $dir.libs/$base.lo.d.
-      #
-      # With libtool 1.5 this exception was removed, and libtool now
-      # generates 2 separate objects for the 2 libraries.  These two
-      # compilations output dependencies in $dir.libs/$base.o.d and
-      # in $dir$base.o.d.  We have to check for both files, because
-      # one of the two compilations can be disabled.  We should prefer
-      # $dir$base.o.d over $dir.libs/$base.o.d because the latter is
-      # automatically cleaned when .libs/ is deleted, while ignoring
-      # the former would cause a distcleancheck panic.
-      tmpdepfile1=$dir.libs/$base.lo.d   # libtool 1.4
-      tmpdepfile2=$dir$base.o.d          # libtool 1.5
-      tmpdepfile3=$dir.libs/$base.o.d    # libtool 1.5
-      tmpdepfile4=$dir.libs/$base.d      # Compaq CCC V6.2-504
-      "$@" -Wc,-MD
-   else
-      tmpdepfile1=$dir$base.o.d
-      tmpdepfile2=$dir$base.d
-      tmpdepfile3=$dir$base.d
-      tmpdepfile4=$dir$base.d
-      "$@" -MD
-   fi
-
-   stat=$?
-   if test $stat -eq 0; then :
-   else
-      rm -f "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
-      exit $stat
-   fi
-
-   for tmpdepfile in "$tmpdepfile1" "$tmpdepfile2" "$tmpdepfile3" "$tmpdepfile4"
-   do
-     test -f "$tmpdepfile" && break
-   done
-   if test -f "$tmpdepfile"; then
-      sed -e "s,^.*\.[a-z]*:,$object:," < "$tmpdepfile" > "$depfile"
-      # That's a tab and a space in the [].
-      sed -e 's,^.*\.[a-z]*:[	 ]*,,' -e 's,$,:,' < "$tmpdepfile" >> "$depfile"
-   else
-      echo "#dummy" > "$depfile"
-   fi
-   rm -f "$tmpdepfile"
-   ;;
-
-#nosideeffect)
-  # This comment above is used by automake to tell side-effect
-  # dependency tracking mechanisms from slower ones.
-
-dashmstdout)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout, regardless of -o.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  # Remove `-o $object'.
-  IFS=" "
-  for arg
-  do
-    case $arg in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    *)
-      set fnord "$@" "$arg"
-      shift # fnord
-      shift # $arg
-      ;;
-    esac
-  done
-
-  test -z "$dashmflag" && dashmflag=-M
-  # Require at least two characters before searching for `:'
-  # in the target name.  This is to cope with DOS-style filenames:
-  # a dependency such as `c:/foo/bar' could be seen as target `c' otherwise.
-  "$@" $dashmflag |
-    sed 's:^[  ]*[^: ][^:][^:]*\:[    ]*:'"$object"'\: :' > "$tmpdepfile"
-  rm -f "$depfile"
-  cat < "$tmpdepfile" > "$depfile"
-  tr ' ' '
-' < "$tmpdepfile" | \
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-dashXmstdout)
-  # This case only exists to satisfy depend.m4.  It is never actually
-  # run, as this mode is specially recognized in the preamble.
-  exit 1
-  ;;
-
-makedepend)
-  "$@" || exit $?
-  # Remove any Libtool call
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-  # X makedepend
-  shift
-  cleared=no eat=no
-  for arg
-  do
-    case $cleared in
-    no)
-      set ""; shift
-      cleared=yes ;;
-    esac
-    if test $eat = yes; then
-      eat=no
-      continue
-    fi
-    case "$arg" in
-    -D*|-I*)
-      set fnord "$@" "$arg"; shift ;;
-    # Strip any option that makedepend may not understand.  Remove
-    # the object too, otherwise makedepend will parse it as a source file.
-    -arch)
-      eat=yes ;;
-    -*|$object)
-      ;;
-    *)
-      set fnord "$@" "$arg"; shift ;;
-    esac
-  done
-  obj_suffix=`echo "$object" | sed 's/^.*\././'`
-  touch "$tmpdepfile"
-  ${MAKEDEPEND-makedepend} -o"$obj_suffix" -f"$tmpdepfile" "$@"
-  rm -f "$depfile"
-  cat < "$tmpdepfile" > "$depfile"
-  sed '1,2d' "$tmpdepfile" | tr ' ' '
-' | \
-## Some versions of the HPUX 10.20 sed can't process this invocation
-## correctly.  Breaking it into two sed invocations is a workaround.
-    sed -e 's/^\\$//' -e '/^$/d' -e '/:$/d' | sed -e 's/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile" "$tmpdepfile".bak
-  ;;
-
-cpp)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  # Remove `-o $object'.
-  IFS=" "
-  for arg
-  do
-    case $arg in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    *)
-      set fnord "$@" "$arg"
-      shift # fnord
-      shift # $arg
-      ;;
-    esac
-  done
-
-  "$@" -E |
-    sed -n -e '/^# [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' \
-       -e '/^#line [0-9][0-9]* "\([^"]*\)".*/ s:: \1 \\:p' |
-    sed '$ s: \\$::' > "$tmpdepfile"
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  cat < "$tmpdepfile" >> "$depfile"
-  sed < "$tmpdepfile" '/^$/d;s/^ //;s/ \\$//;s/$/ :/' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-msvisualcpp)
-  # Important note: in order to support this mode, a compiler *must*
-  # always write the preprocessed file to stdout.
-  "$@" || exit $?
-
-  # Remove the call to Libtool.
-  if test "$libtool" = yes; then
-    while test "X$1" != 'X--mode=compile'; do
-      shift
-    done
-    shift
-  fi
-
-  IFS=" "
-  for arg
-  do
-    case "$arg" in
-    -o)
-      shift
-      ;;
-    $object)
-      shift
-      ;;
-    "-Gm"|"/Gm"|"-Gi"|"/Gi"|"-ZI"|"/ZI")
-	set fnord "$@"
-	shift
-	shift
-	;;
-    *)
-	set fnord "$@" "$arg"
-	shift
-	shift
-	;;
-    esac
-  done
-  "$@" -E 2>/dev/null |
-  sed -n '/^#line [0-9][0-9]* "\([^"]*\)"/ s::\1:p' | $cygpath_u | sort -u > "$tmpdepfile"
-  rm -f "$depfile"
-  echo "$object : \\" > "$depfile"
-  sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::	\1 \\:p' >> "$depfile"
-  echo "	" >> "$depfile"
-  sed < "$tmpdepfile" -n -e 's% %\\ %g' -e '/^\(.*\)$/ s::\1\::p' >> "$depfile"
-  rm -f "$tmpdepfile"
-  ;;
-
-msvcmsys)
-  # This case exists only to let depend.m4 do its work.  It works by
-  # looking at the text of this script.  This case will never be run,
-  # since it is checked for above.
-  exit 1
-  ;;
-
-none)
-  exec "$@"
-  ;;
-
-*)
-  echo "Unknown depmode $depmode" 1>&2
-  exit 1
-  ;;
-esac
-
-exit 0
-
-# Local Variables:
-# mode: shell-script
-# sh-indentation: 2
-# eval: (add-hook 'write-file-hooks 'time-stamp)
-# time-stamp-start: "scriptversion="
-# time-stamp-format: "%:y-%02m-%02d.%02H"
-# time-stamp-time-zone: "UTC"
-# time-stamp-end: "; # UTC"
-# End:
diff --git a/doc/build-envs.txt b/doc/build-envs.txt
deleted file mode 100644
index 3e0ab33..0000000
--- a/doc/build-envs.txt
+++ /dev/null
@@ -1,88 +0,0 @@
-pocl's build system is influenced by the following environment
-variables:
-
-Since pocl is a compiler, it both compiles (producing code) and is
-compiled (it consists of code). This distinction typically called
-"host" and "target": The host is where pocl is running, the target is
-where the OpenCL code will be running. These two systems can be wildly
-different.
-
-
-
-=== Building pocl itself, i.e. host flags ===
-
-Since pocl extends Clang/LLVM, these flags probably need to be
-compatible with the way Clang/LLVM was built. Since Clang/LLVM are
-written in C++, this is in particular the case for C++ code. C code is
-fairly portable and could be compiled with different settings. Not
-that Clang/LLVM may not be built by Clang, but may e.g. be built by
-GCC -- in this case, you probably want to use GCC for building pocl as
-well.
-
-Note that there are no flags for building OpenCL code. While it would
-theoretically be possible to write parts of pocl in OpenCL, it would
-be very strange to do so since pocl itself is not running on a device.
-
-Compile C:
-   CC
-   CPPFLAGS
-   CFLAGS
-
-Compile C++:
-   CXX
-   CXXCPPFLAGS
-   CXXFLAGS
-
-Link:
-   LD (???)
-   LDFLAGS
-
-
-
-=== Building kernels and the kernel library, i.e. target flags ===
-
-Since pocl extends Clang for building kernels, and since this heavily
-relies on bytecode files, the kernel library needs to be built by
-Clang as well. While part of the kernel library is written in OpenCL,
-there are other parts written in C, in bytecode, or in C++.
-
-Compile C to bytecode (why are these the same as OpenCL?):
-   CLANG
-   CLFLAGS (should this be CLANG_CFLAGS instead?)
-
-Compile C++ to bytecode:
-   CLANGXX
-   CLANGXX_FLAGS (should this be CLANG_CXXFLAGS instead?)
-
-Compile OpenCL to bytecode:
-   CLANG
-   CLFLAGS (should this be CLANG_CLFLAGS instead?)
-
-Optimize bytecode:
-   OPT
-
-Convert bytecode to assembler:
-   LLC
-   HOST_LLC_FLAGS
-
-Convert assembler to object file:
-   CLANG
-   HOST_CLANG_FLAGS
-
-Post-process object file:
-   LINK_CMD
-   HOST_LD_FLAGS
-
-Fine-tune the host CPU:
-   LLC_HOST_CPU 
-    This overrides LLVM's autodetected host CPU at configure time. The CPU is used
-    to fine-tune the host-based devices' ('basic' and 'pthread') kernel code.
-    Useful when llc makes a mistake. See 'llvm-as < /dev/null -o - | llc -mcpu=help'
-    for valid variables.
-    If the host cpu is "(unknown)", then "-march=" is not passed to clang, and 
-    "-mcpu=" is not passed to llc.
-
-
-
-
-TARGET_CLANG_FLAGS (???)
diff --git a/doc/luxmark.txt b/doc/luxmark.txt
deleted file mode 100644
index 712692c..0000000
--- a/doc/luxmark.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-All of the Luxmark v2.0 scenes work with pocl. 
-
-Tested with an LLVM 3.3 (trunk at 2013-03-14).
-
-There's a small issue in the Luxmark itself:
-
-http://www.luxrender.net/forum/viewtopic.php?f=34&t=7769&p=93888#p93888
-
-To circumvent this, use the 'basic' device driver which
-reports only CL_DEVICE_TYPE_CPU. It doesn't multithread so
-the best performance is not reached, but should be OK for
-verification.
-
diff --git a/doc/sphinx/source/benchmarks.rst b/doc/sphinx/source/benchmarks.rst
new file mode 100644
index 0000000..a8171dc
--- /dev/null
+++ b/doc/sphinx/source/benchmarks.rst
@@ -0,0 +1,33 @@
+CLPeak
+------
+
+Currently (Dec 2017) does not work. First, there's a global memory size
+detection bug in CLPeak which makes it fail on all OpenCL calls (this
+can be workarounded by using POCL_MEMORY_LIMIT=1). Second, compilation
+takes forever - this can't be fixed in pocl and needs to be fixed in
+either CLPeak or LLVM. CLPeak sources use recursive macros to create
+a giant stream of instructions. Certain optimization passes
+in LLVM seem to explode exponentially on this code. The second
+consequence of giant instruction stream is, it easily overflows the
+instruction caches of a CPU, therefore CLPeak results are highly
+dependent on whether the compiler manages to fit the code into icache,
+and as such are not a reliable measure of peak device FLOPS.
+
+Luxmark
+-------
+
+* Using the binary downloaded from www.luxmark.info might lead to pocl
+  abort on creating cache directory. This is not a bug in Pocl, it's a
+  consequence of the two programs (pocl & luxmark) having been compiled
+  with different libstdc++. Using a distribution packaged Luxmark
+  fixes this problem.
+
+* It's recommended to remove luxmark cache (~/.config/luxrender.net)
+  after updating pocl version.
+
+* There's another bug (http://www.luxrender.net/mantis/view.php?id=1640)
+  - it crashes after compiling kernels, because it doesn't recognize
+  an OpenCL device. This requires editing scenes/<name>/render.cfg,
+  you must add ``opencl.cpu.use = 0`` and ``film.opencl.device = 0``
+
+* Microphone and Luxball scenes work, Hotel scene fails to compile
diff --git a/doc/sphinx/source/conf.py b/doc/sphinx/source/conf.py
index dc3690f..254a7d6 100644
--- a/doc/sphinx/source/conf.py
+++ b/doc/sphinx/source/conf.py
@@ -45,9 +45,9 @@ copyright = u'2010-2017 pocl developers'
 # built documents.
 #
 # The short X.Y version.
-version = '0.14'
+version = '1.0'
 # The full version, including alpha/beta/rc tags.
-release = '0.14'
+release = '1.0-pre'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/doc/sphinx/source/conformance.rst b/doc/sphinx/source/conformance.rst
new file mode 100644
index 0000000..e83d0dc
--- /dev/null
+++ b/doc/sphinx/source/conformance.rst
@@ -0,0 +1,225 @@
+.. _pocl-conformance:
+
+=======================
+Pocl OpenCL conformance
+=======================
+
+Conformance related CMake options
+---------------------------------
+
+- ``-DENABLE_CONFORMANCE=ON/OFF``
+  This is mostly related to the kernel library (the runtime is always conformant
+  on x86). Defaults to ON.
+  Non-conformant kernel library might be somewhat faster, at the expense of
+  precision and/or range. Note that conformance was tested **only** on certain
+  hardware and software (Linux, x86-64, CPU with AVX & FMA instructions).
+
+How to run the conformance test suite on your hardware
+------------------------------------------------------
+
+First you need to enable the suite in the pocl's external test suite set.
+This is done by adding switch ``-DENABLE_TESTSUITES=conformance``
+to the cmake command line. After this ``make prepare_examples`` fetches and
+prepares the conformance suite for testing.
+
+To run a shortened version of the conformance suite, run: ``ctest -L conformance_suite_mini``
+This might take a few hours on slow hardware. There is also a ``conformance_suite_micro``
+label, which takes about 20-30 minutes on slow hardware.
+
+To run the full conformance testsuite, run: ``ctest -L conformance_suite_full``
+Note that this can take a week to finish on slow hardware, and about a day
+on fast hardware (6C/12T Intel or equivalent).
+
+Known issues with the conformance testsuite
+-------------------------------------------
+
+- the "not" operator test (``math_brute_force/bruteforce not``) may fail to
+  compile with LLVM 4.0 with certain vector sizes on some hardware.
+  This does not seem to affect the rest of the testsuite in any way, and
+  appears to be fixed with LLVM 5.0
+
+- a few tests from ``basic/test_basic`` may fail / segfault because they
+  request a huge amount of memory for buffers.
+
+- a few tests from ``conversions/test_conversions`` may report failures. This
+  is likely a bug in the test or miscompilation; the same test from branch
+  cl20_trunk of CTS passes.
+
+- a few tests may run much faster if you limit the reported Global memory size
+  with POCL_MEMORY_LIMIT env var. In particular, "kernel_image_methods" test
+  with "max_images" argument.
+
+- two tests in ``api/test_api`` fail with LLVM 5.0 because of
+  LLVM commit 1c1154229a41b688f9:
+
+    ``[OpenCL] Do not generate "kernel_arg_type_qual" metadata for non-pointer args``
+
+  This is a bug in CTS, which tests for non-pointer type qualifiers, not in pocl.
+  See:
+
+  https://www.khronos.org/registry/OpenCL/specs/opencl-1.2.pdf page 169:
+
+  ``CL_KERNEL_ARG_TYPE_VOLATILE`` is returned if the **argument is a pointer**
+  and the referenced type is declared with the volatile qualifier.
+  Similarly, ``CL_KERNEL_ARG_TYPE_RESTRICT`` or ``CL_KERNEL_ARG_TYPE_CONST`` is
+  returned if the **argument is a pointer** and the referenced type is declared with
+  the restrict or const qualifier
+
+.. _sigfpe-handler:
+
+Known issues in pocl / things to be aware of
+--------------------------------------------
+
+- Integer division by zero. OpenCL 1.2 specification requires that division by
+  zero on integers results in undefined values, instead of raising exceptions.
+  This requires pocl to install a handler of SIGFPE. Unfortunately signal
+  handlers are per-process not per-thread, and pocl drivers do not run in a
+  separate process, which means that integer division by zero will not raise
+  SIGFPE for the entire pocl library and also the user's program. The handler
+  may be disabled by setting the env variable POCL_SIGFPE_HANDLER to 0.
+  Note that this is currently only relevant for x86(-64) + Linux, on all other
+  systems this issue is not handled in any way (thus Pocl is likely
+  non-conformant there).
+
+- Several options to clBuildProgram() are accepted but currently have no effect.
+  This is related mostly to optimization options like `-cl-fast-relaxed-math`.
+  The `-cl-denorms-are-zero` and `-cl-fp32-correctly-rounded-divide-sqrt`
+  options are honored.
+
+- Many of ``native_`` and ``half_`` variants of kernel library functions are mapped
+  to the "full" variants.
+
+- the optional OpenGL / D3D / SPIR extensions are not supported
+
+- clUnloadCompiler() only actually unload LLVM after all programs & kernels
+  have been released.
+
+- clSetUserEventStatus() called with negative status. The Spec leaves the behaviour
+  in this case as "implementation defined", and this part of pocl is
+  only very lightly tested by the conformance tests. clSetUserEventStatus()
+  called with CL_COMPLETE works as expected, and is heavily used by
+  the conversions conformance test.
+
+Conformance tests results (kernel library precision) on tested hardware
+-----------------------------------------------------------------------
+
+Note that it's impossible to test double precision on the entire range,
+therefore the results may vary.
+
+x86-64 CPU with AVX2+FMA, LLVM 4.0, tested on Nov 1, 2017
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+====================   =========================   ===========================================================
+        NAME                Worst ULP                 WHERE
+====================   =========================   ===========================================================
+             add            0.00                      {0x0p+0, 0x0p+0}
+             addD           0.00                      {0x0p+0, 0x0p+0}
+      assignment            0.00                      0x0p+0
+      assignmentD           0.00                      0x0p+0
+            cbrt            0.50                      -0x1.5629d2p+116
+            cbrtD           0.59                      0x1.0000000000136p+1022
+            ceil            0.00                      0x0p+0
+            ceilD           0.00                      0x0p+0
+        copysign            0.00                      {0x0p+0, 0x0p+0}
+        copysignD           0.00                      {0x0p+0, 0x0p+0}
+             cos            2.37                      0x1.1338ccp+20
+             cosD           2.27                      -0x1.d10000000074p+380
+            cosh            2.41                      -0x1.602166p+2
+            coshD           1.43                      -0x1.98000000003efp+5
+           cospi            1.94                      0x1.d73b56p-2
+           cospiD           2.46                      -0x1.adffffffffa91p-2
+          divide            0.00                      {0x0p+0, 0x0p+0}
+          divideD           0.00                      {0x0p+0, 0x0p+0}
+             exp            0.95                      -0x1.762532p+2
+             expD           0.94                      0x1.2f0000000023dp+7
+           exp10            0.79                      -0x1.309022p+5
+           exp10D           0.64                      -0x1.34ffffffffcc9p+8
+            exp2            0.79                      -0x1.fa3d0ep+6
+            exp2D           0.75                      -0x1.ff00000000417p+9
+           expm1            1.00                      -0x1.7a0002p-25
+           expm1D           0.99                      -0x1.26p+5
+            fabs            0.00                      0x0p+0
+            fabsD           0.00                      0x0p+0
+            fdim            0.00                      {0x0p+0, 0x0p+0}
+            fdimD           0.00                      {0x0p+0, 0x0p+0}
+           floor            0.00                      0x0p+0
+           floorD           0.00                      0x0p+0
+             fma            0.00                      {0x0p+0, 0x0p+0, 0x0p+0}
+             fmaD           0.00                      {0x0p+0, 0x0p+0, 0x0p+0}
+            fmax            0.00                      {0x0p+0, 0x0p+0}
+            fmaxD           0.00                      {0x0p+0, 0x0p+0}
+            fmin            0.00                      {0x0p+0, 0x0p+0}
+            fminD           0.00                      {0x0p+0, 0x0p+0}
+            fmod            0.00                      {0x0p+0, 0x0p+0}
+            fmodD           0.00                      {0x0p+0, 0x0p+0}
+           fract            { 0.00, 0.00}             {0x0p+0, 0x0p+0}
+           fractD           { 0.00, 0.00}             {0x0p+0, 0x0p+0}
+           frexp            { 0.00, 0}                 0x0p+0
+           frexpD           { 0.00, 0}                 0x0p+0
+           hypot            1.93                      {0x1.17c998p-127, -0x1.5fedb8p-127}
+           hypotD           1.73                      {0x1.5d2ebeed7663cp-1022, 0x1.67457048a2318p-1022}
+           ldexp            0.00                      {0x0p+0, 0}
+           ldexpD           0.00                      {0x0p+0, 0}
+           log10            0.50                      0x1.7fee2ep-1
+           log10D           0.50                      0x1.9100000000639p+1022
+             log            0.63                      0x1.7fcb3ep-1
+             logD           0.75                      0x1.7d00000000381p+0
+           log1p            1.00                      -0x1.fa0002p-126
+           log1pD           1.00                      -0x1.e000000000001p-1022
+            log2            0.59                      0x1.1107a2p+0
+            log2D           0.72                      0x1.120000000063dp+0
+            logb            0.00                      0x0p+0
+            logbD           0.00                      0x0p+0
+             mad            0.00                      {0x0p+0, 0x0p+0, 0x0p+0} no ULP check
+             madD           0.00                      {0x0p+0, 0x0p+0, 0x0p+0} no ULP check
+          maxmag            0.00                      {0x0p+0, 0x0p+0}
+          maxmagD           0.00                      {0x0p+0, 0x0p+0}
+          minmag            0.00                      {0x0p+0, 0x0p+0}
+          minmagD           0.00                      {0x0p+0, 0x0p+0}
+            modf        { 0.00, 0.00}                 {0x0p+0, 0x0p+0}
+            modfD       { 0.00, 0.00}                 {0x0p+0, 0x0p+0}
+        multiply            0.00                      {0x0p+0, 0x0p+0}
+        multiplyD           0.00                      {0x0p+0, 0x0p+0}
+             nan            0.00                      0x0p+0
+             nanD           0.00                      0x0p+0
+       nextafter            0.00                      {0x0p+0, 0x0p+0}
+       nextafterD           0.00                      {0x0p+0, 0x0p+0}
+             pow            0.82                      {0x1.91237cp-1, 0x1.4da146p+8}
+             powD           0.80                      {0x1.2bfb4b18164c9p+65, -0x1.b78438ae9c3bdp-8}
+            pown            0.65                      {-0x1.9p+6, -2}
+            pownD           0.62                      {-0x1.7ffffffffffffp+1, 3}
+            powr            0.82                      {0x1.91237cp-1, 0x1.4da146p+8}
+            powrD           0.80                      {0x1.2bfb4b18164c9p+65, -0x1.b78438ae9c3bdp-8}
+       remainder            0.00                      {0x0p+0, 0x0p+0}
+       remainderD           0.00                      {0x0p+0, 0x0p+0}
+          remquo        { 0.00, 0}                    0x0p+0
+          remquoD       { 0.00, 0}                    0x0p+0
+            rint            0.00                      0x0p+0
+            rintD           0.00                      0x0p+0
+           rootn            0.69                      {-0x1.e2fe6ep-74, -141}
+           rootnD           0.68                      {-0x1.8000000000001p+1, 3}
+           round            0.00                      0x0p+0
+           roundD           0.00                      0x0p+0
+           rsqrt            1.49                      0x1.019566p+124
+           rsqrtD           1.49                      0x1.01ffffffffa39p+1016
+             sin            2.48                      -0x1.09f07ap+21
+             sinD           1.87                      -0x1.f2fffffffffbap+32
+          sincos        { 2.48, 2.37}                 {0x1.09f07ap+21, 0x1.1338ccp+20}
+          sincosD       { 1.87, 2.27}                 {0x1.f2fffffffffbap+32, 0x1.d10000000074p+380}
+            sinh            2.32                      0x1.e76078p+2
+            sinhD           1.53                      -0x1.3100000000278p+4
+           sinpi            2.13                      -0x1.45f3ep-9
+           sinpiD           2.50                      -0x1.46000000000dap-7
+            sqrt            0.00                      0x0p+0
+            sqrtD           0.00                      0x0p+0
+        subtract            0.00                      {0x0p+0, 0x0p+0}
+        subtractD           0.00                      {0x0p+0, 0x0p+0}
+             tan            4.35                      -0x1.b4eba2p+22
+             tanD           4.00                      -0x1.2f000000003edp+333
+            tanh            1.18                      -0x1.ca742ap-1
+            tanhD           1.19                      0x1.f400000000395p-1
+           tanpi            4.21                      -0x1.f99d16p-3
+           tanpiD           4.09                      0x1.f6000000001d3p-3
+           trunc            0.00                      0x0p+0
+           truncD           0.00                      0x0p+0
+====================   =========================   ===========================================================
diff --git a/doc/sphinx/source/cuda.rst b/doc/sphinx/source/cuda.rst
new file mode 100644
index 0000000..5aa5fa7
--- /dev/null
+++ b/doc/sphinx/source/cuda.rst
@@ -0,0 +1,137 @@
+==================
+NVIDIA GPU support
+==================
+
+NOTE: Support for NVIDIA GPUs via the CUDA backend is currently experimental
+and many features may be missing or incomplete.
+
+The experimental CUDA backend provides support for CUDA-capable NVIDIA GPUs
+under Linux or macOS.
+The goal of this backend is to provide an open-source alternative to the
+proprietary NVIDIA OpenCL implementation.
+This makes use of the NVPTX backend in LLVM and the CUDA driver API.
+
+Building pocl with CUDA support
+-------------------------------
+
+1) Install prerequisites
+~~~~~~~~~~~~~~~~~~~~~~~~
+  Aside from the usual pocl dependencies, you will also need the CUDA toolkit.
+  Currently this backend has only been tested against CUDA 8.0, but it may also
+  be possible to build against other versions.
+
+  If you experience build failures regarding missing CUDA headers or libraries,
+  you may need to add the include directory containing ``cuda.h`` to your header
+  search path, and/or the library directory containing ``libcuda.{so,dylib}`` to
+  your library search path.
+
+  The CUDA backend requires LLVM 4.0 or newer, and LLVM must have been built
+  with the NVPTX backend enabled.
+
+2) Build pocl
+~~~~~~~~~~~~~
+  To enable the CUDA backend, add ``-DENABLE_CUDA=ON`` to your CMake
+  configuration command line.
+
+  Otherwise, build and install pocl as normal.
+
+3) Run tests
+~~~~~~~~~~~~
+  After building pocl, you can smoke test the CUDA backend by executing the
+  subset of pocl's tests that are known to pass on NVIDIA GPUs::
+
+    ../tools/scripts/run_cuda_tests
+
+4) Configuration
+~~~~~~~~~~~~~~~~
+  Use ``POCL_DEVICES=CUDA`` to select only CUDA devices. If the system has more
+  than one GPU, specify the ``CUDA`` device multiple times (e.g.
+  ``POCL_DEVICES=CUDA,CUDA`` for two GPUs).
+
+  The CUDA backend currently has a runtime dependency on the CUDA toolkit. If
+  you receive errors regarding a failure to load ``libdevice``, you may need
+  to set the ``POCL_CUDA_TOOLKIT_PATH`` environment variable to tell pocl
+  where the CUDA toolkit is installed.
+  Set this variable to the root of the toolkit installation (the directory
+  containing the ``nvvm`` directory).
+
+  The ``POCL_CUDA_GPU_ARCH`` environment variable can be set to override the
+  target GPU architecture (e.g. ``POCL_CUDA_GPU_ARCH=sm_35``), which may be
+  necessary in cases where LLVM doesn't yet support the architecture.
+
+  The ``POCL_CUDA_VERIFY_MODULE`` environment variable can be set to ``1`` to
+  verify that the LLVM module produced by the CUDA backend is well formed.
+
+  The ``POCL_CUDA_DUMP_NVVM`` environment variable can be set to ``1`` to
+  dump the LLVM IR that is fed into the NVPTX backend for debugging purposes
+  (requires ``POCL_DEBUG=1``).
+
+  The ``POCL_CUDA_DISABLE_QUEUE_THREADS`` environment variable can be set to
+  ``1`` to disable background threads for handling command submission. This can
+  potentially reduce command launch latency, but can cause problems if using
+  user events or sharing a context with a non-CUDA device.
+
+CUDA backend status
+-------------------
+
+(last updated: 2017-06-02)
+
+The CUDA backend currently passes 73 tests from pocl's internal testsuite, and
+is capable of running various real OpenCL codes.
+Unlike NVIDIA's proprietary OpenCL implementation, pocl supports SPIR
+consumption, and so this backend has also been able to run (for example) SYCL
+codes using Codeplay's ComputeCpp implementation on NVIDIA GPUs.
+Since it uses CUDA under-the-hood, this backend also works with all of the
+NVIDIA CUDA profiling and debugging tools, many of which don't work with
+NVIDIA's own OpenCL implementation.
+
+Conformance status
+~~~~~~~~~~~~~~~~~~
+
+The Khronos OpenCL 1.2 conformance tests are
+`available here <https://github.com/KhronosGroup/OpenCL-CTS/tree/cl12_trunk>`_.
+The following test categories are known to pass on at least one NVIDIA GPU using
+pocl's CUDA backend:
+
+* allocations
+* api
+* atomics
+* basic
+* commonfns
+* computeinfo
+* contractions
+* events
+* profiling
+* relationals
+* thread_dimensions
+* vec_step
+
+Tested platforms
+~~~~~~~~~~~~~~~~
+The CUDA backend has been tested on Linux (CentOS 7.3) with SM_35, SM_52,
+SM_60, and SM_61 capable NVIDIA GPUs.
+
+The backend is also functional on macOS, with just one additional test failure
+compared to Linux (``test_event_cycle``).
+
+Known issues
+~~~~~~~~~~~~
+The following is a non-comprehensive list of known issues in the CUDA backend:
+
+* image types and samplers are unimplemented
+* printf format support is incomplete
+
+Additionally, there has been little effort to optimize the performance of this
+backend so far - the current effort is on implementing remaining functionality.
+Once the core functionality is completed, optimization of the code generation
+and runtime can begin.
+
+Support
+~~~~~~~
+For bug reports and questions, please use pocl's `GitHub issue tracker
+<https://github.com/pocl/pocl/issues>`_.
+Pull requests and other contributions are also very welcome.
+
+This work has primarily been done by James Price from the
+`University of Bristol's High Performance Computing Group
+<http://uob-hpc.github.io>`_.
diff --git a/doc/sphinx/source/development.rst b/doc/sphinx/source/development.rst
index 481ec0c..19fbfce 100644
--- a/doc/sphinx/source/development.rst
+++ b/doc/sphinx/source/development.rst
@@ -4,8 +4,10 @@ Information for Developers
 Using cmake to build & install pocl
 -----------------------------------
 
-Most of the important stuff on using cmake is in the INSTALL file. A few
-additional items:
+Most of the important stuff on using cmake is in the install document,
+see :ref:pocl-install
+
+A few additional items:
 
 The
 
@@ -14,15 +16,18 @@ The
 command must point to ocl-vendors in the  cmake *build* directory, not the
 pocl source directory.
 
-Testing is done using either "make test" or invoking "ctest" directly;
-"make check" does not work. Invoke ctest with -jX option to run X tests
-in parallel.
+You can run the tests or built examples using "ctest" directly;
+``ctest --print-labels`` prints the available labels (testsuites);
+Invoke ctest with -jX option to run X tests in parallel.
+
+"make check" will invoke ctest with tier-1 testsuites.
+See :ref:`maintenance-policy` for details.
 
 Testsuite
 ----------
 
 Before changes are committed to the mainline, all tests in the 'make
-check' suite should pass::
+check' tier-1 suite should pass::
 
    make check
 
@@ -140,10 +145,8 @@ By default, pocl build system compiles the kernel libraries for
 the host CPU architecture, to be used by 'basic' and 'pthread' devices.
 
 LLVM is used to detect the CPU variant to be used as target. This 
-can be overridden by passing LLC_HOST_CPU to './configure'.
-Valid options are best documented in the output of::
-
-  llvm-as /dev/null | llc -mcpu=help
+can be overridden by passing -DLLC_HOST_CPU=... to CMake. See the
+documentation for LLC_HOST_CPU build option.
 
 Cross-compilation where 'build' is different from 'host' has not been
 tested.
diff --git a/doc/sphinx/source/docker.rst b/doc/sphinx/source/docker.rst
new file mode 100644
index 0000000..72cdc27
--- /dev/null
+++ b/doc/sphinx/source/docker.rst
@@ -0,0 +1,32 @@
+=======================
+running Pocl in Docker
+=======================
+
+Install Docker
+----------------
+
+* install docker for your distribution
+* start the docker daemon
+* make sure you have enough space (default location is usually ``/var/lib/docker``,
+  required storage for pocl is about 1.5 GB per container)
+
+start Pocl container
+----------------------
+
+* create an empty directory <D>
+* copy Dockerfile of your choice (any file from tools/docker/) to ``<D>/Dockerfile``
+* ``cd <D> ; sudo docker build -t TAG .`` .. where TAG is a name you can choose for the build.
+* ``sudo docker run -t TAG``
+* this will by default use master branch of pocl git; to use a different branch/commit,
+  run docker build with ``--build-arg GIT_COMMIT=<branch/commit>``
+
+
+Dockerfiles:
+--------------
+* `default`: builds pocl, then runs the internal tests from build dir.
+   Uses latest release of a distribution, with whatever is the default version of LLVM.
+* `<release>`: same as above, except uses specific release and specific LLVM version
+  (the latest available in that release).
+* `default.32bit`: same as default but sets up i386 environment
+* `test_install`: builds & installs pocl into system path, then runs the internal tests
+* `distro`: does a distribution-friendly build (enables runtime detection of CPU, etc)
diff --git a/doc/sphinx/source/env_variables.rst b/doc/sphinx/source/env_variables.rst
index 7c370aa..d30bdc5 100644
--- a/doc/sphinx/source/env_variables.rst
+++ b/doc/sphinx/source/env_variables.rst
@@ -5,18 +5,18 @@ The behavior of pocl can be controlled with multiple environment variables
 listed below. The variables are helpful both when using and when developing
 pocl.
 
+- **POCL_AFFINITY**
+ Linux-only, specific to pthread driver. If set to 1, each thread of
+ the pthread CPU driver sets its affinity to its index. This may be
+ useful with very long running kernels, or when using subdevices
+ (lets any idle cores enter deeper sleep). Defaults to 0 (most
+ people don't need this).
+
 - **POCL_BUILDING**
 
  If  set, the pocl helper scripts, kernel library and headers are 
  searched first from the pocl build directory.
 
-- **POCL_BBVECTORIZE**
-
- If set to 1, makes the pocl kernel compiler execute the LLVM BBVectorizer in
- addition to the SLP vectorizer and the inner loop vectorizer. BBVectorizer
- has known stability issues, therefore it's disabled by default, but it can
- provide performance improvements. See: https://github.com/pocl/pocl/issues/251
-
 - **POCL_CACHE_DIR**
 
  If this is set to an existing directory, pocl uses it as the cache
@@ -33,6 +33,11 @@ pocl.
  CL_INVALID_VALUE. If clock_gettime is available, messages
  will include a timestamp.
 
+ The old way (setting POCL_DEBUG to 1) has been updated to support categories.
+ Using this limits the amount of debug messages produced. Current options are:
+ error,warning,general,memory,llvm,events,cache,locking,refcounts,timing,hsa,tce,all.
+ Note: setting POCL_DEBUG to 1 still works and equals error+warning+general.
+
 - **POCL_DEBUG_LLVM_PASSES**
 
  When set to 1, enables debug output from LLVM passes during optimization.
@@ -105,6 +110,13 @@ pocl.
  Forces the maximum WG size returned by the device or kernel work group queries
  to be at most this number.
 
+- **POCL_MEMORY_LIMIT**
+
+ Integer option, unit: gigabytes. Limits the total global memory size
+ reported by pocl for the pthread/basic devices (this will also affect
+ local/constant/max-alloc-size numbers, since these are derived from
+ global mem size).
+
 - **POCL_OFFLINE_COMPILE**
 
  Bool. When enabled(==1), some drivers will create virtual devices which are only
@@ -153,6 +165,11 @@ pocl.
               However, the code bloat is increased with larger
               WG sizes.
 
+- **POCL_SIGFPE_HANDLER**
+
+ Defaults to 1. If set to 0, pocl will not install the SIGFPE handler.
+ See :ref:`sigfpe-handler`
+
 - **POCL_TRACE_EVENT**, **POCL_TRACE_EVENT_OPT** and **POCL_TRACE_EVENT_FILTER**
 
  If POCL_TRACE_EVENT is set to some tracer name, then all events
diff --git a/doc/sphinx/source/faq.rst b/doc/sphinx/source/faq.rst
index 71ff76e..446e7a9 100644
--- a/doc/sphinx/source/faq.rst
+++ b/doc/sphinx/source/faq.rst
@@ -99,9 +99,6 @@ the performance in each release, so if you encounter performance
 regressions (an older pocl/LLVM version used to run an app faster), 
 please report a bug.
 
-Also you might want to try to set the `POCL_BBVECTORIZER` environment
-variable to 1. More info :doc:`here </env_variables>`.
-
 pocl source code
 ----------------
 
diff --git a/doc/sphinx/source/features.rst b/doc/sphinx/source/features.rst
index 381d56f..338957b 100644
--- a/doc/sphinx/source/features.rst
+++ b/doc/sphinx/source/features.rst
@@ -9,8 +9,8 @@ Frontend/Clang
 
 * OpenCL 1.x
 
-  * OpenGL interoperability
-  * Image support is incomplete
+  * OpenGL interoperability extension
+  * SPIR extension
 
 * OpenCL 2.0
 
@@ -18,17 +18,12 @@ Frontend/Clang
   * pipes (WIP)
   * device-side enqueue
 
-* cl_khr_f16: half precision float literals
-
-  Compiling "3434.0h" fails with:
-  error: invalid suffix 'h' on floating constant
-
-  Tested with Clang 3.4 on 2014-07-10.
-
+* cl_khr_f16: half precision support (with the exception of  vload_half / vstore_half)
 
 Unimplemented host side functions
 ---------------------------------
 
-The list of unimplemented host-side API functions can be seen as the NULLs in the ICD dispatch struct in
+All 1.2 API call should be implemented. The list of unimplemented
+2.0 calls can be seen as the NULLs in the ICD dispatch struct in
 https://github.com/pocl/pocl/blob/master/lib/CL/clGetPlatformIDs.c
 
diff --git a/doc/sphinx/source/index.rst b/doc/sphinx/source/index.rst
index fc3f42f..22a2a68 100644
--- a/doc/sphinx/source/index.rst
+++ b/doc/sphinx/source/index.rst
@@ -17,9 +17,14 @@ Contents:
    faq
    development
    releasing
+   maintainer-policy
    design
    features
    hsa
+   hsa_status
+   cuda
+   conformance
+   docker
 
 Back to `pocl home page <http://pocl.sourceforge.net>`_.
 
diff --git a/doc/sphinx/source/install.rst b/doc/sphinx/source/install.rst
index 60c5efc..ea983ce 100644
--- a/doc/sphinx/source/install.rst
+++ b/doc/sphinx/source/install.rst
@@ -1,3 +1,5 @@
+.. _pocl-install:
+
 ============
 Installation
 ============
@@ -9,21 +11,30 @@ In order to build pocl, you need the following support libraries and
 tools:
 
   * Latest released version of LLVM & Clang
-  * GNU make
+  * development files for LLVM & Clang + their transitive dependencies
+    (e.g. libclang-dev, libllvm-dev, zlib1g-dev, libtinfo-dev...)
+  * GNU make or ninja
   * libtool dlopen wrapper files (e.g. libltdl3-dev in Debian)
   * pthread (should be installed by default)
   * hwloc v1.0 or newer (e.g. libhwloc-dev)
   * pkg-config
   * cmake
 
+
+There are Dockerfiles available for a few most common linux
+distributions in ``tools/docker``, looking into them might be helpful.
+
 Clang / LLVM Notes
 ------------------
 
-**IMPORTANT NOTE!** Some platforms (TCE and possibly HSA) require that
+**IMPORTANT NOTE!** Some targets (TCE and possibly HSA) require that
 you compile & build LLVM with RTTI on. It can be enabled on cmake command
 line, as follows:
 
-**Supported versions**
+    cmake .... -DLLVM_ENABLE_RTTI=ON -DLLVM_ENABLE_EH=ON ....
+
+Supported LLVM versions
+~~~~~~~~~~~~~~~~~~~~~~~~~
 
   Note that pocl aims to support **the latest LLVM version** at the time
   of pocl release, **plus the previous** LLVM version. All older LLVM
@@ -47,38 +58,122 @@ The build+install is the usual CMake way::
 To see the default detected values, run ``cmake ..`` without any options,
 it will produce a summary.
 
+CMake variables
+===============
+
+Since pocl is a compiler, it both compiles (producing code) and is
+compiled (it consists of code). This distinction typically called
+"host" and "target": The host is where pocl is running, the target is
+where the OpenCL code will be running. These two systems can be wildly
+different.
+
+Host compiler used to compile pocl can be GCC or Clang; the target
+compiler is always Clang+LLVM since pocl uses Clang/LLVM internally.
+For host compiler, you should use the one which your LLVM was compiled
+with (because the LLVM-related parts of pocl take LLVM's CXXFLAGS from
+llvm-config and pass them to the host compiler).
+
+CMake host flags
+----------------
+
+Compile C:
+  CMAKE_C_FLAGS
+  CMAKE_C_FLAGS_<build-type>
+
+Compile C++:
+  CMAKE_CXX_FLAGS
+  CMAKE_CXX_FLAGS_<build-type>
+
+Building kernels and the kernel library, i.e. target flags
+------------------------------------------------------------
+
+All of these empty by default. There are hardcoded defaults which may
+be overriden by setting these variables (rarely needed).
+
+Extra parameters to llc
+   EXTRA_HOST_LLC_FLAGS
+
+Extra parameters to clang
+   EXTRA_HOST_CLANG_FLAGS
+
+Extra parameters to linker (links kernel to shared library
+which is then dlopened):
+   EXTRA_HOST_LD_FLAGS
+
+EXTRA_KERNEL_FLAGS
+  is applied to all kernel library compilation commands, IOW it's for
+  language-independent options
+
+EXTRA_KERNEL_{C,CL,CXX}_FLAGS
+  cmake variables for per-language options for kernel library compilation
+
+
 
-CMake: important options & features
+CMake: other options & features
 -------------------------------------
 
-For multiple-item options, use ";" as separator (you'll have to escape it for bash).
+Note that there are a few more packaging-related options described
+in ``README.packaging``.
+
+For multiple-item options like KERNELLIB_HOST_CPU_VARIANTS,
+use ";" as separator (you'll have to escape it for bash).
 
 - ``-DWITH_LLVM_CONFIG=<path-to-llvm-config>``
   **IMPORTANT** Path to a llvm-config binary.
   This determines the LLVM installation used by pocl.
   If not specified, pocl will try to find and link against
   llvm-config in PATH env var (usually means your system LLVM).
+
 - ``-DSTATIC_LLVM`` enable this to link LLVM statically into pocl.
   Note that you need LLVM built with static libs. This option might result
   in much longer build/link times and much larger pocl library, but the
   resulting libpocl will not require an LLVM installation to run.
-- ``-DENABLE_ICD`` and ``-DDIRECT_LINKAGE`` By default pocl's
-  buildsystem will try to find an ICD and build pocl as a dynamic library
-  named "libpocl". These options are useful if you want to avoid ICD and
-  build pocl directly as libOpenCL library. See also :ref:`linking-with-icd`
+
+- ``-DENABLE_ICD`` By default pocl's buildsystem will try to find an ICD
+  and build pocl as a dynamic library named "libpocl". This option is useful
+  if you want to avoid ICD and build pocl directly as libOpenCL library.
+  See also :ref:`linking-with-icd`
+
 - ``-DPOCL_INSTALL_<something>_DIR`` The equivalent of ``--bindir``,
   ``--sbindir`` etc fine-tuning of paths for autotools. See the beginning
   of toplevel CMakeLists.txt for all the variables.
-- ``-DKERNELLIB_HOST_CPU_VARIANTS`` You can control which CPUs the
-  kernel library will be built for. Defaults to "native" which will be
-  converted to the build machine's CPU at buildtime. Available CPUs are
-  listed by ``llc -mcpu=help``; you can specify multiple CPUs, and pocl will
-  look for a kernel library for the runtime-detected CPU.
 
-  For x86(64) there is another possibility, ``distro``, which builds a few
-  preselected sse/avx variants covering 99.99% of x86 processors, and pocl
-  will use the most appropriate one at runtime, based on detected CPU features.
-  With ``distro``, the minimum requirement on CPU is SSE2.
+  Note that if ``CMAKE_INSTALL_PREFIX`` equals ``/usr`` then pocl.icd is
+  installed to ``/etc/OpenCL/vendors``, otherwise it's installed to
+  ``${CMAKE_INSTALL_PREFIX}/etc/OpenCL/vendors``.
+
+- ``-DLLC_HOST_CPU=<something>``
+  Defaults to auto-detection via ``llc``. Run ``llc -mcpu=help``
+  for valid values. The CPU type is required to compile
+  the "target" (kernel library) part of CPU backend.
+
+  This variable overrides LLVM's autodetected host CPU at configure time.
+  Useful when llc fails to detect the CPU (often happens on non-x86
+  platforms, or x86 with CPU newer than LLVM).
+
+  Note that when this is set (set by default) and the
+  KERNELLIB_HOST_CPU_VARIANTS variable is not ``distro``,
+  pocl will first try to find compiled kernel library
+  for runtime-detected CPU then fallback to LLC_HOST_CPU.
+  This works well if pocl is run where it was built,
+  or the actual CPU is in the KERNELLIB_HOST_CPU_VARIANTS list,
+  or the actual CPU is >= LLC_HOST_CPU feature-wise;
+  otherwise it will likely fail with illegal instruction at runtime.
+
+- ``-DKERNELLIB_HOST_CPU_VARIANTS`` You can control which CPUs the
+  "target" part of CPU backend will be built for.
+  Unlike LLC_HOST_CPU, this variable is useful if you plan
+  to build for multiple CPUs. Defaults to "native" which is
+  automagically replaced by LLC_HOST_CPU.
+  Available CPUs are listed by ``llc -mcpu=help``. See above for
+  runtime CPU detection rules.
+
+  Note that there's another valid value on x86(64) platforms.
+  If set to ``distro``, the KERNELLIB_HOST_CPU_VARIANTS variable will be
+  set up with a few preselected sse/avx variants covering 99.99% of x86
+  processors, and the runtime CPU detection is slightly altered: pocl
+  will find the suitable compiled library based on detected CPU features,
+  so it cannot fail (at worst it'll degrade to SSE2 library).
 
 - ``-DENABLE_TESTSUITES`` Which external (source outside pocl) testsuites to enable.
   For the list of testsuites, see examples/CMakeLists.txt or the ``examples``
@@ -92,19 +187,30 @@ For multiple-item options, use ";" as separator (you'll have to escape it for ba
   with ``-DTESTSUITE_BASEDIR=/home/pocltest-build -DTESTSUITE_SOURCE_BASEDIR=/home/pocltest-src``,
   place the ``AMD-APP-SDK-v2.9-RC-lnx64.tgz`` file into ``/home/pocltest-src/AMDSDK2.9`` directory.
 
+- ``-DENABLE_CONFORMANCE=ON/OFF``
+  Builds Pocl as a fully conformant OpenCL implementation. Defaults to ON.
+  See :ref:`pocl-conformance` for details.
+
+- ``-DENABLE_{A,L,T,UB}SAN`` - compiles pocl's host code (and tests
+  + examples) with various sanitizers. Using more than one sanitizer at
+  a time is untested. Using together with ``-DENABLE_ICD=OFF`` is highly
+  recommended to avoid issues with loading order of sanitizer libraries.
+
+- ``-DENABLE_{CUDA,TCE,HSA}=ON/OFF`` - enable various (non-CPU) backends.
+  Usually requires some extra setup; see their documentation.
+
+- ``-DPOCL_DEBUG_MESSAGES=ON`` - when disabled, pocl is compiled without
+  debug messages (POCL_DEBUG env var) support.
+
+- ``-DEXAMPLES_USE_GIT_MASTER=ON`` - when enabled, examples (external
+  programs in ``examples/`` directory) are built from their git branches
+  (if available), as opposed to default: building from release tars.
+
 LLVM-less build
 ---------------
  See :ref:`pocl-without-llvm`
 
 
-Building on Ubuntu 16.04 LTS
-----------------------------
-
-The Clang/LLVM 3.8 shipped with Ubuntu 16.04 should work with pocl.
-Be sure to install also the 'libclang-3.8-dev' package in addition
-to the 'clang-3.8 and llvm-3.8-dev' packages, otherwise cmake will
-fail.
-
 Known build-time issues
 -----------------------
 
diff --git a/doc/sphinx/source/kernel_compiler.rst b/doc/sphinx/source/kernel_compiler.rst
index a79e2f4..998100a 100644
--- a/doc/sphinx/source/kernel_compiler.rst
+++ b/doc/sphinx/source/kernel_compiler.rst
@@ -2,24 +2,17 @@ Kernel compiler
 ---------------
 
 The compilation of kernels in pocl is performed roughly as follows.
-In release 0.9 the scripts (referred to below) were replaced by direct
-LLVM API calls. The structure remains, e.g. calling script ``pocl-build`` was
-replaced with calling function ``call_pocl_build()``. See ``lib/CL/pocl_llvm_api.cc``
 
-#. Produce an LLVM bitcode of the single kernel function.
+#. Produce an LLVM bitcode of the entire program.
 
-   The kernel compiler of pocl relies on the OpenCL C frontend of the Clang 
-   for parsing the kernel descriptions to LLVM bytecode. The output from
-   Clang is a description of the kernel function for a single work-item.
+   This is done using 'preprocess' and 'emit-llvm' Clang actions. This
+   happens at clBuildProgram() time.
 
-   Done with the help of ``pocl-build`` script that invokes the Clang. See
-   ``clBuildProgram.c``.
-
-#. Link in the built-in functions.
+#. Link in the built-in kernel library functions.
 
    The OpenCL C builtin functions are precompiled to LLVM *bitcode* libraries
    residing under ``lib/kernel/$TARGET``. These are linked to the kernel using
-   the ``llvm-link`` tool when the helper script ``pocl-workgroup`` (see the next item).
+   link() from lib/llvmopencl/linker.cpp. This too happens in clBuildProgram()
 
 #. Produce the work-group function.
 
@@ -32,19 +25,21 @@ replaced with calling function ``call_pocl_build()``. See ``lib/CL/pocl_llvm_api
    description and take care of the parallel execution of multiple kernel instances 
    using their scheduling hardware.
 
-   This part is performed when a kernel execution command is executed (see 
-   ``clEnqueueNDRangeKernel.c``).  Only at this point the work-group dimensions are 
-   known, after which it is possible to produce functions of the single kernel functions 
-   that execute the whole work-group.
+   This part is performed by target-specific code when a kernel execution
+   command is scheduled. Only at this point the work-group dimensions are
+   known, after which it is possible to produce functions of the single
+   kernel functions that execute the whole work-group.
 
 #. Code generation for the target.
 
    The work-group function (which is still in LLVM IR) of the kernel along with the launcher 
    functions are finally converted to the machine code of the target device. This is done in
-   the device layer's implementation of the kernel run command. For example, see ``llvm_codegen()``
-   in ``lib/CL/devices/common.c``. This function generates a dynamically loaded object of the
-   work-group function for actually launching the kernel. The function is called from the CPU
-   device layer implementations (``pocl_basic_run()`` of ``lib/CL/devices/basic/basic.c``).
+   the device layer's implementation of the kernel run command (same as generating wg
+   function). For example, see ``llvm_codegen()`` in ``lib/CL/devices/common.c``.
+   This function generates a dynamically loaded object of the work-group
+   function for actually launching the kernel. The function is called
+   from the CPU device layer implementations
+   (``pocl_basic_run()`` of ``lib/CL/devices/basic/basic.c``).
    
 
 Work group function generation
diff --git a/doc/sphinx/source/maintainer-policy.rst b/doc/sphinx/source/maintainer-policy.rst
index 19ac3fc..d4d9002 100644
--- a/doc/sphinx/source/maintainer-policy.rst
+++ b/doc/sphinx/source/maintainer-policy.rst
@@ -23,13 +23,16 @@ new pull requests (PR) to the master branch, and some of them are additionally
 executed with multiple continuous integration (buildbot) servers on
 different platforms. Active developers are also assumed to run them locally
 before submitting PRs. Thus, regressions on these suites should be detected
-early.
+early. The required testsuites can be enabled at buildtime with
+``-DENABLE_TESTSUITES=tier1`` cmake option.
 
 Currently (2017-03-16) the following are included in the tier-1 test suites:
 
 * The standard test suite of pocl.
 * AMD SDK 3.0 test suite
 * PyOpenCL test suite
+* piglit test suite
+* conformance_suite_micro test suite
 * HSA test suite (uses the LLVM 3.7 with an HSAIL backend and targets an AMD Kaveri GPU)
 * TCE short smoke test suite (against the latest TCE open source release)
 
diff --git a/doc/sphinx/source/memory_management.rst b/doc/sphinx/source/memory_management.rst
index 785434b..388d0b8 100644
--- a/doc/sphinx/source/memory_management.rst
+++ b/doc/sphinx/source/memory_management.rst
@@ -59,3 +59,18 @@ When passing buffer pointers to the kernel/work-group launchers, the memory addr
 passed as integer values. The values passed from the host are casted to the actual
 address-space qualified LLVM IR pointers for calling the kernels with correct types
 by the work-group function (see :ref:`wg-functions`).
+
+Custom memory management for pthread device
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Enabled by CMake option USE_POCL_MEMMANAGER. This is only useful for certain
+uncommon setups, where pocl is expected to allocate a huge number of queue or
+event objects. For most available OpenCL programs / tests / benchmarks, there
+is no measurable difference in speed.
+
+Advantages:
+* allocation of queues/events/command objects can be a lot faster
+
+Disadvantages:
+* memory allocated for those objects is never free()d; it's only returned to allocation pool
+* debugging tools will not detect use-after-free bugs on said objects
diff --git a/doc/sphinx/source/pocl_binary.rst b/doc/sphinx/source/pocl_binary.rst
index f67cc81..1de1f74 100644
--- a/doc/sphinx/source/pocl_binary.rst
+++ b/doc/sphinx/source/pocl_binary.rst
@@ -18,11 +18,7 @@ To do this
 The string after "HSTR:" is the device build hash.
 
 * now build the LLVM-less pocl. You will need the device build hash from
-  previous step
-
-  ``./configure --disable-ocs HOST_DEVICE_BUILD_HASH=<something> ...``
-
-  or cmake:
+  previous step:
 
   ``cmake -DOCS_AVAILABLE=0 -DHOST_DEVICE_BUILD_HASH=<something> ...``
 
@@ -47,7 +43,7 @@ As mentionned in the khronos_ documentation, the parameter ``binaries`` of
 ``clCreateProgramWithBinary`` can consist of either or both of device-specific
 code and/or implementation-specific intermediate representation.
 
-In POCL, both representations can be use.
+In POCL, both representations can be used.
 
 Implementation-specific intermediate representation
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/doc/sphinx/source/releasing.rst b/doc/sphinx/source/releasing.rst
index 366170b..4739ba6 100644
--- a/doc/sphinx/source/releasing.rst
+++ b/doc/sphinx/source/releasing.rst
@@ -13,38 +13,44 @@ See the :ref:`maintenance-policy` for the current release criteria.
 
 A checklist and hints for testing and making a release successfully:
 
-* Create a release branch in github. After branching the release, only
-  bug fixes should be committed to the branch. The bug fixes are merged
-  *from* the release branch to the *master*. Now development towards the next
-  release can go on in *master* while the release branch is being stabilized.
-* Set the correct version number without -pre or -rc in the release branch
-  (configure.ac). Increment the version in the master branch. Do not include
-  an -rcX in the revision number in the source base so it is possible to
-  release an approved release candidate tar ball by just renaming the tar
-  ball file name.
-* Update the new dynamic library version in the *master* branch. This
-  can be done in configure.ac.
-  Search for "4:0:3" to see the place where it's set. It includes more info
-  in comments.
 * Check that CHANGES has the most interesting updates done during the release
   cycle. Add missing notable changes from git log.
-* Disallow support for the unreleased LLVM version from the release branch
-  because it will most likely stop working before the new LLVM is released
-  because LLVM API lives.  That is, modify configure.ac in the release branch to not
-  allow the currently unreleased development version of LLVM.
+
 * Update the release notes in *doc/notes-VERNUM.txt*.
-* Add a git tag for the release candidate: 'git tag 0.14-rc1'
-* Create and test the tar ball package with command along the lines of 'git archive --format=tar.gz --prefix=pocl-0.14/ 0.14-rc1 -o pocl-0.14-rc1.tar.gz'.
+
+* Create a single commit in master branch: change the version to the
+  release one (without -pre), in all relevant places (CHANGES, docs,
+  CMakeLists.txt, etc); update the .so version (if required);
+  check that supported LLVM versions in cmake/LLVM.cmake are correct.
+  Create the release branch from this commit and push it to github.
+
+* In the master branch, create a new commit: increase version
+  number (with -pre) in all relevant places; update the .so version;
+  increase the supported LLVM versions in cmake/LLVM.cmake.
+  Commit, push master to github. Now development can go on in master
+  while the release branch is being stabilized.
+
+* The previous two steps ensure that merge-base of release & master is
+  the start of release branch, which ensures that merging release
+  to the master will not screw up the version numbers in the master.
+  Bugs which need to be fixed in both branches, should be comitted to
+  the release branch, then release branch merged to master.
+
+* Create a new release on Github. Mark it as pre-release. This should
+  create both a tarball and a git tag.
+
 * Upload the package to portablecl.org/downloads via SFTP or to the
   sourceforge file listing for the pocl project.
+
 * Request for testers in Twitter and/or mailing list. Point the testers to
   send their test reports to you privately or by adding them to the wiki.
   A good way is to create a wiki page for the release schedule and a test
   log. See https://github.com/pocl/pocl/wiki/pocl-0.10-release-testing for
   an example.
-* To publish a release, after testing it thoroughly, rename the latest RC
-  tar ball to omit the rcX tag, e.g.,
-  pocl-0.10.tar.gz. Upload the tar ball to the sourceforge download page and
+
+* To publish a release, create a new release on Github without the
+  checking the pre-release checkbox.
+  Upload the tar ball to the sourceforge download page and
   to http://portablecl.org/downloads.
 * Update the CHANGES and ANNOUNCEMENT text files in these directories.
   ANNOUNCEMENT is a copy of the latest release notes. A direct link to it can
diff --git a/doc/sphinx/source/using.rst b/doc/sphinx/source/using.rst
index 0182fe5..20b1915 100644
--- a/doc/sphinx/source/using.rst
+++ b/doc/sphinx/source/using.rst
@@ -36,14 +36,9 @@ to have several OpenCL implementations concurrently on your computer, and
 select the one to use at runtime by selecting the corresponding cl_platform. 
 ICD support can be disabled by adding the flag::
 
-  --disable-icd
+  -DENABLE_ICD=OFF
 
-to the ./configure script.
-
-In case you also give the --prefix=$INSTALL option to ./configure, you need to 
-copy the icd file to where your ICD loader finds it, e.g.::
-
-  cp $INSTALL/etc/OpenCL/vendors/pocl.icd /etc/OpenCL/vendors/pocl.icd
+to the CMake invocation.
 
 The ocl-icd ICD loader allows to use the OCL_ICD_VENDORS environment variable
 to specify a (non-standard) replacement for the /etc/OpenCL/vendors directory.
@@ -61,13 +56,6 @@ Linking your program directly with pocl
 Passing the appropriate linker flags is enough to use pocl in your
 program. However, please bear in mind that:
 
-#. The current distribution only supports one device, "native",
-   which runs the kernels in the host system.
-#. Current implementation of both host and kernel runtime libraries
-   is not complete. If your program uses any of the unimplemented
-   API calls, it will not work. Please implement the mssing APIs
-   when you need them and submit us a patch :)
-
 The pkg-config tool is used to locate the libraries and headers in
 the installation directory. 
 
@@ -79,9 +67,6 @@ the pkg-config::
 In this link mode, your program will always require the pocl OpenCL library. It
 wont be able to run with another OpenCL implementation without recompilation.
 
-Pocl needs to be configured with the --enable-direct-linkage option (enabled
-by default)
-
 Using pocl on Android
 ---------------------
 
@@ -92,20 +77,6 @@ OpenCL function symbols from it.
 Refer examples/pocl-android-sample/ for hello-world android app that uses pocl.
 This app uses a third-party stub OpenCL library that does dlopen/dlsym on its behalf
 
-Vecmathlib
-----------
-
-Vecmathlib (aka VML)
-`<https://bitbucket.org/eschnett/vecmathlib/wiki/Home>`_ provides
-optimized implementations for math builtins such as sqrt, sin, cos,
-etc. These are highly recommended as they can be inlined to the call
-site and lead to better optimized kernels. A copy of Vecmathlib is
-distributed with pocl for convenience in the directory
-`lib/kernel/vecmathlib`.
-
-To use VML, you need to have a functional clang++ installed.
-Currently, VML is enabled only for x86_64.
-
 Wiki
 ----
 
diff --git a/examples/AMD/CMakeLists.txt b/examples/AMD/CMakeLists.txt
index 6684eff..52b6c6d 100644
--- a/examples/AMD/CMakeLists.txt
+++ b/examples/AMD/CMakeLists.txt
@@ -70,16 +70,17 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     FloydWarshall
     HelloWorld
     Histogram
-    ImageBandwidth
     ImageOverlap
     KernelLaunch
     LUDecomposition
+    Mandelbrot
     MatrixMulImage
     MatrixMultiplication
     MatrixTranspose
     MemoryModel
     MonteCarloAsian
     MonteCarloAsianMultiGPU
+    NBody
     PrefixSum
     QuasiRandomSequence
     RadixSort
@@ -95,13 +96,13 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     TransferOverlap
     URNG)
 
-    # disabled tests:  Mandelbrot NBody
+    # disabled tests:
     #   SimpleGL BoxFilterGL FluidSimulation2D GaussianNoiseGL
-    #   MonteCarloAsianDP AdvancedMultiGPU BasicDebug
+    #   MonteCarloAsianDP AdvancedMultiGPU BasicDebug ImageBandwidth
 
     foreach(SAMPLE IN LISTS AMD_SAMPLES)
       add_test(NAME "AMD_28_${SAMPLE}"
-        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.8-RC-lnx64/samples/opencl/bin/x86_64/${SAMPLE}"
+        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.8-RC-lnx64/samples/opencl/bin/x86_64/${SAMPLE}" "-q" "-t"
         WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.8-RC-lnx64/samples/opencl/bin/x86_64")
       set_tests_properties("AMD_28_${SAMPLE}" PROPERTIES LABELS "amdsdk_28")
     endforeach()
diff --git a/examples/AMDSDK2.9/CMakeLists.txt b/examples/AMDSDK2.9/CMakeLists.txt
index 4c9d694..deb2524 100644
--- a/examples/AMDSDK2.9/CMakeLists.txt
+++ b/examples/AMDSDK2.9/CMakeLists.txt
@@ -59,6 +59,7 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     BlackScholes
     BlackScholesDP
     DCT
+    DeviceFission
     DwtHaar1D
     DynamicOpenCLDetection
     FastWalshTransform
@@ -69,11 +70,13 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     HistogramAtomics
     ImageOverlap
     LUDecomposition
+    Mandelbrot
     MatrixMulImage
     MatrixMultiplication
     MatrixTranspose
     MemoryModel
     MonteCarloAsianMultiGPU
+    NBody
     PrefixSum
     QuasiRandomSequence
     RadixSort
@@ -88,17 +91,24 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     Template
     TransferOverlap
     URNG
-    URNGNoiseGL)
+    )
 
-    # disabled c++ tests: DwtHaar1DCPPKernel EigenValue FFT IntroStaticCPPKernel MatrixMultiplicationCPPKernel MersenneTwister SoAversusAoS TransferOverlapCPP
-    # disabled but fixable: CplusplusWrapper DeviceFission FluidSimulation2D GaussianNoise HDRToneMapping ImageBandwidth KernelLaunch MatrixMulDouble MonteCarloAsian MonteCarloAsianDP SobelFilterImage UnsharpMask
+    # disabled c++ tests: DwtHaar1DCPPKernel EigenValue FFT IntroStaticCPPKernel
+    # MatrixMultiplicationCPPKernel MersenneTwister SoAversusAoS TransferOverlapCPP
+    #
+    # disabled but fixable: CplusplusWrapper FluidSimulation2D GaussianNoise HDRToneMapping
+    # ImageBandwidth KernelLaunch MatrixMulDouble MonteCarloAsian
+    # MonteCarloAsianDP SobelFilterImage UnsharpMask
+    #
     # disabled tests: NBody AtomicCounters BasicDebug DeviceFission11Ext
-    # disabled graphics tests: Mandelbrot  KmeansAutoclustering GaussianNoiseGL SimpleGL
+    #
+    # disabled graphics tests: KmeansAutoclustering GaussianNoiseGL SimpleGL
+    #
     # very slow: LDSBandwidth ConstantBandwidth MemoryOptimizations
 
     foreach(SAMPLE IN LISTS AMD_SAMPLES)
       add_test(NAME "AMD_29_${SAMPLE}"
-        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/bin/x86_64/${SAMPLE}"
+        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/bin/x86_64/${SAMPLE}" "-q" "-t"
         WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/bin/x86_64")
       set_tests_properties("AMD_29_${SAMPLE}" PROPERTIES LABELS "amdsdk_29")
     endforeach()
diff --git a/examples/AMDSDK3.0/CMakeLists.txt b/examples/AMDSDK3.0/CMakeLists.txt
index d604967..155efa8 100644
--- a/examples/AMDSDK3.0/CMakeLists.txt
+++ b/examples/AMDSDK3.0/CMakeLists.txt
@@ -61,9 +61,7 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     BlackScholes
     BlackScholesDP
     BoxFilter
-    BufferImageInterop
     BuiltInScan
-    CalcPie
     ConcurrentKernel
     DCT
     DeviceFission
@@ -71,6 +69,7 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     DwtHaar1D
     FastWalshTransform
     FloydWarshall
+    FluidSimulation2D
     GaussianNoise
     HDRToneMapping
     HelloWorld
@@ -80,6 +79,7 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     ImageOverlap
     KernelLaunch
     LUDecomposition
+    Mandelbrot
     MatrixMulImage
     MatrixMultiplication
     MatrixTranspose
@@ -107,7 +107,7 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     #     IntroStaticCPPKernel FFT MatrixMultiplicationCPPKernel
     #     MersenneTwister SoAversusAoS TransferOverlapCPP
     # disabled but fixable: BuiltInScan CalcPie CplusplusWrapper
-    #     DeviceFission FluidSimulation2D GaussianNoise HDRToneMapping
+    #     GaussianNoise HDRToneMapping
     #     HeatPDE ImageBandwidth KernelLaunch MatrixMulDouble
     #     MonteCarloAsianDP SobelFilterImage UnsharpMask DynamicOpenCLDetection
     #     FineGrainSVMCAS RecursiveGaussian_ProgramScope SobelFilterImage
@@ -115,7 +115,7 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     # disabled tests: AsyncDataTransfer AtomicCounters BasicDebug
     #     DeviceFission11Ext RangeMinimumQuery SimpleDepthImage
     #     SimpleGenericAddressSpace
-    # disabled graphics tests: Mandelbrot  KmeansAutoclustering
+    # disabled graphics tests: KmeansAutoclustering
     #     GaussianNoiseGL SimpleGL BoxFilterGL URNGNoiseGL
     # very slow: BufferBandwidth LDSBandwidth ConstantBandwidth
     #     MemoryOptimizations TransferOverlap
@@ -123,11 +123,23 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     #     DeviceEnqueueBFS ExtractPrimes GlobalMemoryBandwidth
     #     RegionGrowingSegmentation SimpleSPIR
     # disabled - requires pipe: PipeProducerConsumerKernels SimplePipe
+    # requires OpenCL 2.0: BufferImageInterop
 
+    set(HSA_LABELED_TESTS
+        AMD_30_BinomialOption AMD_30_DCT AMD_30_BlackScholes
+        AMD_30_FastWalshTransform AMD_30_FloydWarshall AMD_30_HelloWorld
+        AMD_30_Histogram AMD_30_MatrixMultiplication AMD_30_MatrixTranspose
+        AMD_30_PrefixSum AMD_30_QuasiRandomSequence AMD_30_ScanLargeArrays
+        AMD_30_SimpleConvolution AMD_30_URNG)
+
+    if(ENABLE_HSA)
+      list(APPEND AMD_SAMPLES CalcPie)
+      list(APPEND HSA_LABELED_TESTS AMD_30_CalcPie)
+    endif()
 
     foreach(SAMPLE IN LISTS AMD_SAMPLES)
       add_test(NAME "AMD_30_${SAMPLE}"
-        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-3.0/samples/opencl/bin/x86_64/${SAMPLE}"
+        COMMAND "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-3.0/samples/opencl/bin/x86_64/${SAMPLE}" "-q" "-t"
         WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/AMD-APP-SDK-3.0/samples/opencl/bin/x86_64")
       set_tests_properties("AMD_30_${SAMPLE}" PROPERTIES LABELS "amdsdk_30")
     endforeach()
@@ -135,22 +147,17 @@ if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Linux" AND X86_64)
     # AMD_30_HeatPDE
     # AMD_30_FineGrainSVMCAS
     set_tests_properties(
-        AMD_30_BinomialOption AMD_30_CalcPie AMD_30_DCT AMD_30_BlackScholes
-        AMD_30_FastWalshTransform AMD_30_FloydWarshall AMD_30_HelloWorld
-        AMD_30_Histogram AMD_30_MatrixMultiplication AMD_30_MatrixTranspose
-        AMD_30_PrefixSum AMD_30_QuasiRandomSequence AMD_30_ScanLargeArrays
-        AMD_30_SimpleConvolution AMD_30_URNG
+      ${HSA_LABELED_TESTS}
       PROPERTIES
         LABELS "hsa")
 
     # XFAILs as of 2016-05-17:
     # AMD_30_BuiltInScan: undefined symbol: work_group_scan_inclusive_add
-    # AMD_30_DeviceFission: Invalid device.
     # AMD_30_GaussianNoise, AMD_30_HDRToneMapping: Invalid platform.
     # AMD_30_ImageBinarization: work_group_barrier missing
     # AMD_30_HDRToneMapping: no output
     # AMD_30_UnsharpMask: clCreateContextFromType failed
-    set_tests_properties(AMD_30_BuiltInScan AMD_30_DeviceFission
+    set_tests_properties(AMD_30_BuiltInScan
       AMD_30_GaussianNoise AMD_30_HDRToneMapping
       AMD_30_ImageBinarization AMD_30_KernelLaunch
       AMD_30_UnsharpMask
diff --git a/examples/ASL/CMakeLists.txt b/examples/ASL/CMakeLists.txt
index 0ea4608..bdc53e3 100644
--- a/examples/ASL/CMakeLists.txt
+++ b/examples/ASL/CMakeLists.txt
@@ -34,6 +34,12 @@ find_package(Boost 1.55 QUIET)
 find_package(VTK QUIET)
 set(VTK_VER "${VTK_MAJOR_VERSION}.${VTK_MINOR_VERSION}")
 
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/AvtechScientific/ASL")
+else()
+  set(FETCH_SOURCE URL "https://github.com/AvtechScientific/ASL/archive/v0.1.6.tar.gz")
+endif()
+
 if ((NOT "${CMAKE_VERSION}" VERSION_LESS "3.0.2")
     AND VTK_FOUND AND (NOT VTK_VER VERSION_LESS "6.1")
     AND Boost_FOUND AND (NOT Boost_VERSION VERSION_LESS "1.55"))
@@ -45,8 +51,8 @@ if ((NOT "${CMAKE_VERSION}" VERSION_LESS "3.0.2")
   ExternalProject_Add(
     ${TS_NAME}
     PREFIX "${TS_BASEDIR}"
-    # GIT_REPOSITORY "https://github.com/AvtechScientific/ASL"
-    URL "https://github.com/AvtechScientific/ASL/archive/v0.1.6.tar.gz"
+    ${FETCH_SOURCE}
+    PATCH_COMMAND pwd && patch -p1 -i ${CMAKE_SOURCE_DIR}/examples/ASL/asl.patch
     CMAKE_ARGS
       -DCMAKE_BUILD_TYPE=RelWithDebInfo
       -DWITH_API_DOC:BOOL=OFF
@@ -73,8 +79,9 @@ if ((NOT "${CMAKE_VERSION}" VERSION_LESS "3.0.2")
            COMMAND "${TS_BUILDDIR}/test/testACL/testOperators")
   add_test(NAME ASL_testKernelMerger
            COMMAND "${TS_BUILDDIR}/test/testACL/testKernelMerger")
-  add_test(NAME ASL_testPrivateVar
-           COMMAND "${TS_BUILDDIR}/test/testACL/testPrivateVar")
+# takes too long
+#  add_test(NAME ASL_testPrivateVar
+#           COMMAND "${TS_BUILDDIR}/test/testACL/testPrivateVar")
   add_test(NAME ASL_testASLData
            COMMAND "${TS_BUILDDIR}/test/testMath/testASLData")
   add_test(NAME ASL_testDistanceFunction
@@ -82,61 +89,64 @@ if ((NOT "${CMAKE_VERSION}" VERSION_LESS "3.0.2")
   add_test(NAME ASL_testReductionFunction
            COMMAND "${TS_BUILDDIR}/test/testMath/testReductionFunction")
 
-
-  add_test(NAME ASL_example_bus_wind
-           COMMAND "${TS_BUILDDIR}/examples/flow/bus_wind/asl-bus_wind")
-  add_test(NAME ASL_example_compressor
-           COMMAND "${TS_BUILDDIR}/examples/flow/compressor/asl-compressor")
-  add_test(NAME ASL_example_flow
-           COMMAND "${TS_BUILDDIR}/examples/flow/flow/asl-flow")
-  add_test(NAME ASL_example_flow2
-           COMMAND "${TS_BUILDDIR}/examples/flow/flow2/asl-flow2")
-  add_test(NAME ASL_example_flow3
-           COMMAND "${TS_BUILDDIR}/examples/flow/flow3/asl-flow3")
-  add_test(NAME ASL_example_flowKDPGrowth
-           COMMAND "${TS_BUILDDIR}/examples/flow/flowKDPGrowth/asl-flowKDPGrowth")
-  add_test(NAME ASL_example_flowRotatingCylinders
-           COMMAND "${TS_BUILDDIR}/examples/flow/flowRotatingCylinders/asl-flowRotatingCylinders")
-  add_test(NAME ASL_example_locomotive
-           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive/asl-locomotive")
-  add_test(NAME ASL_example_locomotive_laminar
-           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive_laminar/asl-locomotive_laminar")
-  add_test(NAME ASL_example_locomotive_stability
-           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive_stability/asl-locomotive_stability")
-  add_test(NAME ASL_example_multicomponent_flow
-           COMMAND "${TS_BUILDDIR}/examples/flow/multicomponent_flow/asl-multicomponent_flow")
-  add_test(NAME ASL_example_multiphase_flow
-           COMMAND "${TS_BUILDDIR}/examples/flow/multiphase_flow/asl-multiphase_flow")
-  add_test(NAME ASL_example_pitot_tube_ice
-           COMMAND "${TS_BUILDDIR}/examples/flow/pitot_tube_ice/asl-pitot_tube_ice")
-  add_test(NAME ASL_example_acousticWaves
-           COMMAND "${TS_BUILDDIR}/examples/elastic/acousticWaves/asl-acousticWaves")
-  add_test(NAME ASL_example_cubeGravity
-           COMMAND "${TS_BUILDDIR}/examples/elastic/cubeGravity/asl-cubeGravity")
-  add_test(NAME ASL_example_cubeIncompressibleGravity
-           COMMAND "${TS_BUILDDIR}/examples/elastic/cubeIncompressibleGravity/asl-cubeIncompressibleGravity")
-  add_test(NAME ASL_example_cubePoroelasticGravity
-           COMMAND "${TS_BUILDDIR}/examples/elastic/cubePoroelasticGravity/asl-cubePoroelasticGravity")
-  add_test(NAME ASL_example_poroelastic
-           COMMAND "${TS_BUILDDIR}/examples/elastic/poroelastic/asl-poroelastic")
-  add_test(NAME ASL_example_levelSetBasic
-           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetBasic/asl-levelSetBasic")
-  add_test(NAME ASL_example_levelSetFacetedGrowth
-           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetFacetedGrowth/asl-levelSetFacetedGrowth")
-  add_test(NAME ASL_example_levelSetNormalGrowth
-           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetNormalGrowth/asl-levelSetNormalGrowth")
-  add_test(NAME ASL_example_jumpingBox
-           COMMAND "${TS_BUILDDIR}/examples/jumpingObjects/jumpingBox/asl-jumpingBox")
-  add_test(NAME ASL_example_surfaceFlux
-           COMMAND "${TS_BUILDDIR}/examples/heatTransfer/surfaceFlux/asl-surfaceFlux")
-  add_test(NAME ASL_example_testSMDiff
-           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMDiff/asl-testSMDiff")
-  add_test(NAME ASL_example_testSMDiff3C
-           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMDiff3C/asl-testSMDiff3C")
-  add_test(NAME ASL_example_testSMPhi
-           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMPhi/asl-testSMPhi")
-  add_test(NAME ASL_example_testSMPhiBV
-           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMPhiBV/asl-testSMPhiBV")
+# Tests are disabled until ASL issue #31 is resolved.
+# https://github.com/AvtechScientific/ASL/issues/31
+#
+#
+#  add_test(NAME ASL_example_bus_wind
+#           COMMAND "${TS_BUILDDIR}/examples/flow/bus_wind/asl-bus_wind")
+#  add_test(NAME ASL_example_compressor
+#           COMMAND "${TS_BUILDDIR}/examples/flow/compressor/asl-compressor")
+#  add_test(NAME ASL_example_flow
+#           COMMAND "${TS_BUILDDIR}/examples/flow/flow/asl-flow")
+#  add_test(NAME ASL_example_flow2
+#           COMMAND "${TS_BUILDDIR}/examples/flow/flow2/asl-flow2")
+#  add_test(NAME ASL_example_flow3
+#           COMMAND "${TS_BUILDDIR}/examples/flow/flow3/asl-flow3")
+#  add_test(NAME ASL_example_flowKDPGrowth
+#           COMMAND "${TS_BUILDDIR}/examples/flow/flowKDPGrowth/asl-flowKDPGrowth")
+#  add_test(NAME ASL_example_flowRotatingCylinders
+#           COMMAND "${TS_BUILDDIR}/examples/flow/flowRotatingCylinders/asl-flowRotatingCylinders")
+#  add_test(NAME ASL_example_locomotive
+#           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive/asl-locomotive")
+#  add_test(NAME ASL_example_locomotive_laminar
+#           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive_laminar/asl-locomotive_laminar")
+#  add_test(NAME ASL_example_locomotive_stability
+#           COMMAND "${TS_BUILDDIR}/examples/flow/locomotive_stability/asl-locomotive_stability")
+#  add_test(NAME ASL_example_multicomponent_flow
+#           COMMAND "${TS_BUILDDIR}/examples/flow/multicomponent_flow/asl-multicomponent_flow")
+#  add_test(NAME ASL_example_multiphase_flow
+#           COMMAND "${TS_BUILDDIR}/examples/flow/multiphase_flow/asl-multiphase_flow")
+#  add_test(NAME ASL_example_pitot_tube_ice
+#           COMMAND "${TS_BUILDDIR}/examples/flow/pitot_tube_ice/asl-pitot_tube_ice")
+#  add_test(NAME ASL_example_acousticWaves
+#           COMMAND "${TS_BUILDDIR}/examples/elastic/acousticWaves/asl-acousticWaves")
+#  add_test(NAME ASL_example_cubeGravity
+#           COMMAND "${TS_BUILDDIR}/examples/elastic/cubeGravity/asl-cubeGravity")
+#  add_test(NAME ASL_example_cubeIncompressibleGravity
+#           COMMAND "${TS_BUILDDIR}/examples/elastic/cubeIncompressibleGravity/asl-cubeIncompressibleGravity")
+#  add_test(NAME ASL_example_cubePoroelasticGravity
+#           COMMAND "${TS_BUILDDIR}/examples/elastic/cubePoroelasticGravity/asl-cubePoroelasticGravity")
+#  add_test(NAME ASL_example_poroelastic
+#           COMMAND "${TS_BUILDDIR}/examples/elastic/poroelastic/asl-poroelastic")
+#  add_test(NAME ASL_example_levelSetBasic
+#           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetBasic/asl-levelSetBasic")
+#  add_test(NAME ASL_example_levelSetFacetedGrowth
+#           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetFacetedGrowth/asl-levelSetFacetedGrowth")
+#  add_test(NAME ASL_example_levelSetNormalGrowth
+#           COMMAND "${TS_BUILDDIR}/examples/levelSet/levelSetNormalGrowth/asl-levelSetNormalGrowth")
+#  add_test(NAME ASL_example_jumpingBox
+#           COMMAND "${TS_BUILDDIR}/examples/jumpingObjects/jumpingBox/asl-jumpingBox")
+#  add_test(NAME ASL_example_surfaceFlux
+#           COMMAND "${TS_BUILDDIR}/examples/heatTransfer/surfaceFlux/asl-surfaceFlux")
+#  add_test(NAME ASL_example_testSMDiff
+#           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMDiff/asl-testSMDiff")
+#  add_test(NAME ASL_example_testSMDiff3C
+#           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMDiff3C/asl-testSMDiff3C")
+#  add_test(NAME ASL_example_testSMPhi
+#           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMPhi/asl-testSMPhi")
+#  add_test(NAME ASL_example_testSMPhiBV
+#           COMMAND "${TS_BUILDDIR}/examples/massTransferSM/testSMPhiBV/asl-testSMPhiBV")
 
 
   set_tests_properties(
@@ -146,37 +156,37 @@ if ((NOT "${CMAKE_VERSION}" VERSION_LESS "3.0.2")
     ASL_testKernel
     ASL_testOperators
     ASL_testKernelMerger
-    ASL_testPrivateVar
+#    ASL_testPrivateVar
     ASL_testASLData
     ASL_testDistanceFunction
     ASL_testReductionFunction
-    ASL_example_bus_wind
-    ASL_example_compressor
-    ASL_example_flow
-    ASL_example_flow2
-    ASL_example_flow3
-    ASL_example_flowKDPGrowth
-    ASL_example_flowRotatingCylinders
-    ASL_example_locomotive
-    ASL_example_locomotive_laminar
-    ASL_example_locomotive_stability
-    ASL_example_multicomponent_flow
-    ASL_example_multiphase_flow
-    ASL_example_pitot_tube_ice
-    ASL_example_acousticWaves
-    ASL_example_cubeGravity
-    ASL_example_cubeIncompressibleGravity
-    ASL_example_cubePoroelasticGravity
-    ASL_example_poroelastic
-    ASL_example_levelSetBasic
-    ASL_example_levelSetFacetedGrowth
-    ASL_example_levelSetNormalGrowth
-    ASL_example_jumpingBox
-    ASL_example_surfaceFlux
-    ASL_example_testSMDiff
-    ASL_example_testSMDiff3C
-    ASL_example_testSMPhi
-    ASL_example_testSMPhiBV
+#    ASL_example_bus_wind
+#    ASL_example_compressor
+#    ASL_example_flow
+#    ASL_example_flow2
+#    ASL_example_flow3
+#    ASL_example_flowKDPGrowth
+#    ASL_example_flowRotatingCylinders
+#    ASL_example_locomotive
+#    ASL_example_locomotive_laminar
+#    ASL_example_locomotive_stability
+#    ASL_example_multicomponent_flow
+#    ASL_example_multiphase_flow
+#    ASL_example_pitot_tube_ice
+#    ASL_example_acousticWaves
+#    ASL_example_cubeGravity
+#    ASL_example_cubeIncompressibleGravity
+#    ASL_example_cubePoroelasticGravity
+#    ASL_example_poroelastic
+#    ASL_example_levelSetBasic
+#    ASL_example_levelSetFacetedGrowth
+#    ASL_example_levelSetNormalGrowth
+#    ASL_example_jumpingBox
+#    ASL_example_surfaceFlux
+#    ASL_example_testSMDiff
+#    ASL_example_testSMDiff3C
+#    ASL_example_testSMPhi
+#    ASL_example_testSMPhiBV
 
     PROPERTIES
       LABELS "ASL")
diff --git a/examples/ASL/asl.patch b/examples/ASL/asl.patch
new file mode 100644
index 0000000..bf5380a
--- /dev/null
+++ b/examples/ASL/asl.patch
@@ -0,0 +1,29 @@
+--- a/src/utilities/aslParametersManager.cxx	2017-04-11 11:59:40.208368454 +0200
++++ b/src/utilities/aslParametersManager.cxx	2017-04-11 12:00:34.676369980 +0200
+@@ -332,7 +332,7 @@
+ 
+ 		try
+ 		{
+-			ifstream ifs(paramFile);
++			std::ifstream ifs(paramFile);
+ 			if (!ifs.good())
+ 				errorMessage("Can not open parameters file: " + paramFile);
+ 
+@@ -357,7 +357,7 @@
+ 
+ 	void ParametersManager::writeParametersFile(const std::string fileName)
+ 	{
+-		ofstream fo(fileName);
++		std::ofstream fo(fileName);
+ 		if (!fo.good())
+ 			errorMessage("ParametersManager::writeParametersFile() - can not open file: " + fileName);
+ 
+@@ -463,7 +463,7 @@
+ 				}
+ 				else
+ 				{
+-					ifstream ifs(p.string());
++					std::ifstream ifs(p.string());
+ 					if (ifs.good())
+ 					{
+ 						parsed_options parsed = parse_config_file(ifs, allOptions, true);
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a4c45b7..102cfa9 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -33,10 +33,14 @@
 #
 
 add_subdirectory("example1")
-add_subdirectory("example1-spir32")
-add_subdirectory("example1-spir64")
 add_subdirectory("example2")
 add_subdirectory("example2a")
+
+if(HOST_DEVICE_EXTENSIONS MATCHES "spir")
+  add_subdirectory("example1-spir32")
+  add_subdirectory("example1-spir64")
+endif()
+
 add_subdirectory("poclcc")
 add_subdirectory("scalarwave")
 add_subdirectory("trig")
@@ -46,7 +50,7 @@ add_subdirectory("EinsteinToolkit")
 set(ALL_TESTSUITES
     AMD AMDSDK2.9 AMDSDK3.0
     ASL arrayfire clBLAS clFFT
-    CloverLeaf Halide IntelSVM
+    conformance CloverLeaf Halide IntelSVM
     opencl-book-samples OpenCV
     Parboil piglit PyOpenCL
     Rodinia VexCL ViennaCL)
@@ -55,18 +59,24 @@ if("${ENABLE_TESTSUITES}" STREQUAL "all")
   set(ENABLE_TESTSUITES ${ALL_TESTSUITES})
 endif()
 
+if("${ENABLE_TESTSUITES}" MATCHES "tier1")
+  list(REMOVE_ITEM ENABLE_TESTSUITES "tier1")
+  list(APPEND ENABLE_TESTSUITES "AMDSDK3.0" "piglit" "conformance")
+endif()
+
 include(ExternalProject)
 
 # invoke this to build all examples
 add_custom_target(prepare_examples)
 
-if(ENABLE_TESTSUITES AND (NOT DEFINED ACTUALLY_ENABLED_TESTSUITES))
+find_program(BASH "bash")
+find_program(MAKE_PROGRAM NAMES "make")
+find_package(Git QUIET)
+set_expr(HAVE_GIT GIT_EXECUTABLE)
+
+if(ENABLE_TESTSUITES)
 
-    unset(CMAKE_MODULE_PATH) # Use CMake builtin find module
-    find_package(Git QUIET)
-    set_expr(HAVE_GIT GIT_EXECUTABLE)
-    find_package(Subversion QUIET)
-    set_expr(HAVE_SVN Subversion_SVN_EXECUTABLE)
+  unset(CMAKE_MODULE_PATH) # Use CMake builtin find module
 
   message(STATUS "Trying to enable testsuites: ${ENABLE_TESTSUITES}")
 
@@ -104,7 +114,6 @@ if(ENABLE_TESTSUITES AND (NOT DEFINED ACTUALLY_ENABLED_TESTSUITES))
     endif()
   endforeach()
 
-  set(ACTUALLY_ENABLED_TESTSUITES "${ACTUALLY_ENABLED_TESTSUITES}" CACHE STRING "Actually available & enabled testsuites")
 endif()
 
 set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
@@ -119,11 +128,9 @@ set(DISABLED_TESTSUITES ${DISABLED} PARENT_SCOPE)
 
 ######################################################################################
 
-if(ACTUALLY_ENABLED_TESTSUITES MATCHES "AMD")
+if((ACTUALLY_ENABLED_TESTSUITES MATCHES "AMD") AND (NOT MSVC))
 
-  if(NOT MSVC)
-    pkg_check_modules(GLEW glew)
-  endif()
+  pkg_check_modules(GLEW glew)
 
   if(GLEW_FOUND)
     set(HAVE_GLEW 1 PARENT_SCOPE)
diff --git a/examples/EinsteinToolkit/CMakeLists.txt b/examples/EinsteinToolkit/CMakeLists.txt
index 5fd1baa..1409c8f 100644
--- a/examples/EinsteinToolkit/CMakeLists.txt
+++ b/examples/EinsteinToolkit/CMakeLists.txt
@@ -43,6 +43,14 @@ if(NOT MIPS)
 
   set_tests_properties( "EinsteinToolkit"
     PROPERTIES
-    LABELS "internal;Einstein"
+    LABELS "EinsteinToolkit"
+    DEPENDS "pocl_version_check")
+
+  add_test(NAME "EinsteinToolkit_SubDev" COMMAND "EinsteinToolkit" s)
+
+  set_tests_properties( "EinsteinToolkit_SubDev"
+    PROPERTIES
+    LABELS "EinsteinToolkit"
+    ENVIRONMENT "POCL_AFFINITY=1"
     DEPENDS "pocl_version_check")
 endif()
diff --git a/examples/EinsteinToolkit/EinsteinToolkit.c b/examples/EinsteinToolkit/EinsteinToolkit.c
index 0975382..eab4431 100644
--- a/examples/EinsteinToolkit/EinsteinToolkit.c
+++ b/examples/EinsteinToolkit/EinsteinToolkit.c
@@ -51,9 +51,7 @@ int const niters = 10;
 #  include <OpenCL/opencl.h>
 #else
 #  include <CL/opencl.h>
-#endif	
-
-
+#endif
 
 // Stringify
 #define XSTR(x) #x
@@ -88,10 +86,10 @@ static inline size_t roundup(size_t const a, size_t const b)
 // Global OpenCL handles
 cl_platform_id platform_id;
 cl_device_id device_id;
+cl_device_id main_device_id;
 cl_context context;
 cl_command_queue cmd_queue;
-
-
+static int use_subdev;
 
 // Code generation choices:
 #define VECTORISE_ALIGNED_ARRAYS 0
@@ -445,9 +443,25 @@ static void allocate(cGH const* const cctkGH,
   assert(ptr->mem);
 }
 
+static void deallocate(cGH const* const cctkGH,
+                     ptr_t* const ptr,
+                     CCTK_REAL const val)
+{
+  assert(ptr->ptr);
+  clReleaseMemObject(ptr->mem);
+  free(ptr->ptr);
+}
+
+static cl_mem mem_cctkGH;
+static cl_mem mem_cctk_parameters;
 
+static cl_program program1;
+static cl_kernel kernel1;
 
-void setup()
+static cl_program program2;
+static cl_kernel kernel2;
+
+void setup(const char* program_source1, const char* program_source2)
 {
   cl_int cerr;
   
@@ -510,8 +524,22 @@ void setup()
   clGetContextInfo(context, CL_CONTEXT_DEVICES,
                    ndevice_ids*sizeof(cl_device_id), device_ids, NULL);
   assert(ndevice_ids >= 1);
-  device_id = device_ids[0];
-  
+  main_device_id = device_ids[0];
+
+  if (use_subdev)
+    {
+      const cl_device_partition_property props[]
+          = { CL_DEVICE_PARTITION_EQUALLY, 2, 0 };
+      cl_device_id subdevs[128];
+      cl_uint retval;
+      int err
+          = clCreateSubDevices (main_device_id, props, 128, subdevs, &retval);
+      assert (err == CL_SUCCESS);
+      device_id = subdevs[0];
+    }
+  else
+    device_id = main_device_id;
+
   size_t device_name_length;
   clGetDeviceInfo(device_id, CL_DEVICE_NAME, 0, NULL, &device_name_length);
   char device_name[device_name_length];
@@ -538,9 +566,76 @@ void setup()
   
   cmd_queue = clCreateCommandQueue(context, device_id, 0, NULL);
   assert(cmd_queue);
+
+
+  char const* const options =
+    "-DVECTORISE_ALIGNED_ARRAYS=" STR(VECTORISE_ALIGNED_ARRAYS) " "
+    "-DVECTOR_SIZE_I=" STR(VECTOR_SIZE_I) " "
+    "-DVECTOR_SIZE_J=" STR(VECTOR_SIZE_J) " "
+    "-DVECTOR_SIZE_K=" STR(VECTOR_SIZE_K) " "
+    "-DUNROLL_SIZE_I=" STR(UNROLL_SIZE_I) " "
+    "-DUNROLL_SIZE_J=" STR(UNROLL_SIZE_J) " "
+    "-DUNROLL_SIZE_K=" STR(UNROLL_SIZE_K) " "
+    "-DGROUP_SIZE_I=" STR(GROUP_SIZE_I) " "
+    "-DGROUP_SIZE_J=" STR(GROUP_SIZE_J) " "
+    "-DGROUP_SIZE_K=" STR(GROUP_SIZE_K) " "
+    "-DTILE_SIZE_I=" STR(TILE_SIZE_I) " "
+    "-DTILE_SIZE_J=" STR(TILE_SIZE_J) " "
+    "-DTILE_SIZE_K=" STR(TILE_SIZE_K) " ";
+
+  int ierr;
+
+  program1 =
+    clCreateProgramWithSource(context, 1, (const char**)&program_source1,
+                              NULL, NULL);
+  assert(program1);
+
+  ierr = clBuildProgram(program1, 0, NULL, options, NULL, NULL);
+  if (ierr) {
+    size_t log_size;
+    ierr = clGetProgramBuildInfo(program1, device_id,
+                                 CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+    assert(!ierr);
+    char build_log[log_size];
+    ierr = clGetProgramBuildInfo(program1, device_id,
+                                 CL_PROGRAM_BUILD_LOG,
+                                 log_size, build_log, NULL);
+    assert(!ierr);
+    printf("Build log:\n"
+           "********************************************************************************\n"
+           "%s\n"
+           "********************************************************************************\n", build_log);
+    assert(0);
+  }
+
+  kernel1 = clCreateKernel(program1, "ML_BSSN_CL_RHS1", NULL);
+  assert(kernel1);
+
+  program2 =
+    clCreateProgramWithSource(context, 1, (const char**)&program_source2,
+                              NULL, NULL);
+  assert(program2);
+
+  ierr = clBuildProgram(program2, 0, NULL, options, NULL, NULL);
+  assert(!ierr);
+
+  kernel2 = clCreateKernel(program2, "ML_BSSN_CL_RHS2", NULL);
+  assert(kernel2);
+
 }
 
+void cleanup() {
 
+  clReleaseKernel(kernel1);
+  clReleaseProgram(program1);
+
+  clReleaseKernel(kernel2);
+  clReleaseProgram(program2);
+
+  clReleaseCommandQueue(cmd_queue);
+  clUnloadPlatformCompiler(platform_id);
+  clReleaseContext(context);
+}
 
 void init(cGH              * const cctkGH,
           cctk_parameters_t* const cctk_parameters,
@@ -859,9 +954,132 @@ void init(cGH              * const cctkGH,
   allocate(cctkGH, &cctk_arguments->At22rhs, -1.0);
   allocate(cctkGH, &cctk_arguments->At23rhs, -1.0);
   allocate(cctkGH, &cctk_arguments->At33rhs, -1.0);
+
+  mem_cctkGH =
+    clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
+                   sizeof *cctkGH, (cGH*)cctkGH, NULL);
+  assert(mem_cctkGH);
+
+  mem_cctk_parameters =
+    clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
+                   sizeof *cctk_parameters, (cctk_parameters_t*)cctk_parameters, NULL);
+  assert(mem_cctk_parameters);
+
 }
 
+void deinit(cGH              * const cctkGH,
+          cctk_parameters_t* const cctk_parameters,
+          cctk_arguments_t * const cctk_arguments)
+{
 
+  clReleaseMemObject(mem_cctkGH);
+  clReleaseMemObject(mem_cctk_parameters);
+
+  deallocate(cctkGH, &cctk_arguments->x, 10.0);
+  deallocate(cctkGH, &cctk_arguments->y, 11.0);
+  deallocate(cctkGH, &cctk_arguments->z, 12.0);
+  deallocate(cctkGH, &cctk_arguments->r, 13.0);
+  deallocate(cctkGH, &cctk_arguments->At11, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At11_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At11_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At12, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At12_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At12_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At13, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At13_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At13_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At22, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At22_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At22_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At23, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At23_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At23_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At33, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At33_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->At33_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->A, 0.0);
+  deallocate(cctkGH, &cctk_arguments->A_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->A_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Arhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->B1, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B1_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B1_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B2, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B2_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B2_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B3, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B3_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B3_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->B1rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->B2rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->B3rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->Xt1, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt1_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt1_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt2, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt2_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt2_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt3, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt3_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt3_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->Xt1rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->Xt2rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->Xt3rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->alpha, 1.0);
+  deallocate(cctkGH, &cctk_arguments->alpha_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->alpha_p_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->alpharhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->phi, 0.0);
+  deallocate(cctkGH, &cctk_arguments->phi_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->phi_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->phirhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->gt11, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt11_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt11_p_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt12, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt12_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt12_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt13, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt13_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt13_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt22, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt22_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt22_p_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt23, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt23_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt23_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->gt33, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt33_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt33_p_p, 1.0);
+  deallocate(cctkGH, &cctk_arguments->gt11rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->gt12rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->gt13rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->gt22rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->gt23rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->gt33rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->beta1, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta1_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta1_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta2, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta2_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta2_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta3, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta3_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta3_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->beta1rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->beta2rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->beta3rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->trK, 0.0);
+  deallocate(cctkGH, &cctk_arguments->trK_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->trK_p_p, 0.0);
+  deallocate(cctkGH, &cctk_arguments->trKrhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->At11rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->At12rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->At13rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->At22rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->At23rhs, -1.0);
+  deallocate(cctkGH, &cctk_arguments->At33rhs, -1.0);
+}
 
 static void set_arg(cl_kernel kernel, int arg, cl_mem const* mem)
 {
@@ -871,175 +1089,114 @@ static void set_arg(cl_kernel kernel, int arg, cl_mem const* mem)
 
 
 
-int exec_ML_BSSN_CL_RHS1(char              const* const program_source,
-                         cGH               const* const cctkGH,
-                         cctk_parameters_t const* const cctk_parameters,
+int exec_ML_BSSN_CL_RHS1(cGH               const* const cctkGH,
                          cctk_arguments_t  const* const cctk_arguments)
 {
-  static int initialised = 0;
-  static cl_program program;
-  static cl_kernel kernel;
-  static cl_mem mem_cctkGH;
-  static cl_mem mem_cctk_parameters;
   
   int ierr;
-  
-  if (!initialised) {
-    initialised = 1;
-    
-    program =
-      clCreateProgramWithSource(context, 1, (const char**)&program_source,
-                                NULL, NULL);
-    assert(program);
-    
-    char const* const options =
-      "-DVECTORISE_ALIGNED_ARRAYS=" STR(VECTORISE_ALIGNED_ARRAYS) " "
-      "-DVECTOR_SIZE_I=" STR(VECTOR_SIZE_I) " "
-      "-DVECTOR_SIZE_J=" STR(VECTOR_SIZE_J) " "
-      "-DVECTOR_SIZE_K=" STR(VECTOR_SIZE_K) " "
-      "-DUNROLL_SIZE_I=" STR(UNROLL_SIZE_I) " "
-      "-DUNROLL_SIZE_J=" STR(UNROLL_SIZE_J) " "
-      "-DUNROLL_SIZE_K=" STR(UNROLL_SIZE_K) " "
-      "-DGROUP_SIZE_I=" STR(GROUP_SIZE_I) " "
-      "-DGROUP_SIZE_J=" STR(GROUP_SIZE_J) " "
-      "-DGROUP_SIZE_K=" STR(GROUP_SIZE_K) " "
-      "-DTILE_SIZE_I=" STR(TILE_SIZE_I) " "
-      "-DTILE_SIZE_J=" STR(TILE_SIZE_J) " "
-      "-DTILE_SIZE_K=" STR(TILE_SIZE_K) " ";
-    
-    ierr = clBuildProgram(program, 0, NULL, options, NULL, NULL);
-    if (ierr) {
-      size_t log_size;
-      ierr = clGetProgramBuildInfo(program, device_id,
-                                   CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
-      assert(!ierr);
-      char build_log[log_size];
-      ierr = clGetProgramBuildInfo(program, device_id,
-                                   CL_PROGRAM_BUILD_LOG,
-                                   log_size, build_log, NULL);
-      assert(!ierr);
-      printf("Build log:\n"
-             "********************************************************************************\n"
-             "%s\n"
-             "********************************************************************************\n", build_log);
-      assert(0);
-    }
-    
-    kernel = clCreateKernel(program, "ML_BSSN_CL_RHS1", NULL);
-    assert(kernel);
-    
-    mem_cctkGH =
-      clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
-                     sizeof *cctkGH, (cGH*)cctkGH, NULL);
-    assert(mem_cctkGH);
-    
-    mem_cctk_parameters =
-      clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
-                     sizeof *cctk_parameters, (cctk_parameters_t*)cctk_parameters, NULL);
-    assert(mem_cctk_parameters);
-  }
-  
+
+
   int nargs = 0;
-  set_arg(kernel, nargs++, &mem_cctkGH);
-  set_arg(kernel, nargs++, &mem_cctk_parameters);
-  set_arg(kernel, nargs++, &cctk_arguments->x.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->y.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->z.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->r.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At11.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At11_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At11_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->A.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->A_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->A_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Arhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B1.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B1_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B1_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B2.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B2_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B2_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B3.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B3_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B3_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B1rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B2rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->B3rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpha.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpha_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpha_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpharhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phi.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phi_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phi_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phirhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trK.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trK_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trK_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trKrhs.mem);
+  set_arg(kernel1, nargs++, &mem_cctkGH);
+  set_arg(kernel1, nargs++, &mem_cctk_parameters);
+  set_arg(kernel1, nargs++, &cctk_arguments->x.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->y.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->z.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->r.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At11.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At11_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At11_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At12.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At12_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At12_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At13.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At13_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At13_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At22.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At22_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At22_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At23.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At23_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At23_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At33.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At33_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->At33_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->A.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->A_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->A_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Arhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B1.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B1_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B1_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B2.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B2_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B2_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B3.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B3_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B3_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B1rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B2rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->B3rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt1.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt1_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt1_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt2.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt2_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt2_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt3.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt3_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt3_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt1rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt2rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->Xt3rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->alpha.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->alpha_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->alpha_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->alpharhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->phi.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->phi_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->phi_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->phirhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt11.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt11_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt11_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt12.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt12_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt12_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt13.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt13_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt13_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt22.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt22_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt22_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt23.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt23_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt23_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt33.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt33_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt33_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt11rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt12rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt13rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt22rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt23rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->gt33rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta1.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta1_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta1_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta2.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta2_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta2_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta3.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta3_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta3_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta1rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta2rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->beta3rhs.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->trK.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->trK_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->trK_p_p.mem);
+  set_arg(kernel1, nargs++, &cctk_arguments->trKrhs.mem);
   
   size_t const local_work_size[3] =
     { GROUP_SIZE_I, GROUP_SIZE_J, GROUP_SIZE_K };
@@ -1070,158 +1227,112 @@ int exec_ML_BSSN_CL_RHS1(char              const* const program_source,
     }
   }
   
-  ierr = clEnqueueNDRangeKernel(cmd_queue, kernel, dim,
+  ierr = clEnqueueNDRangeKernel(cmd_queue, kernel1, dim,
                                 NULL, global_work_size, local_work_size,  
                                 0, NULL, NULL);
   assert(!ierr);
   
   ierr = clFinish(cmd_queue);
   assert(!ierr);
-  
+
   return 0;
 }
 
 
 
-int exec_ML_BSSN_CL_RHS2(char              const* const program_source,
-                         cGH               const* const cctkGH,
-                         cctk_parameters_t const* const cctk_parameters,
+int exec_ML_BSSN_CL_RHS2(cGH               const* const cctkGH,
                          cctk_arguments_t  const* const cctk_arguments)
 { 
-  static int initialised = 0;
-  static cl_program program;
-  static cl_kernel kernel;
-  static cl_mem mem_cctkGH;
-  static cl_mem mem_cctk_parameters;
   
   int ierr;
-  
-  if (!initialised) {
-    initialised = 1;
-    
-    program =
-      clCreateProgramWithSource(context, 1, (const char**)&program_source,
-                                NULL, NULL);
-    assert(program);
-    
-    char const* const options =
-      "-DVECTORISE_ALIGNED_ARRAYS=" STR(VECTORISE_ALIGNED_ARRAYS) " "
-      "-DVECTOR_SIZE_I=" STR(VECTOR_SIZE_I) " "
-      "-DVECTOR_SIZE_J=" STR(VECTOR_SIZE_J) " "
-      "-DVECTOR_SIZE_K=" STR(VECTOR_SIZE_K) " "
-      "-DUNROLL_SIZE_I=" STR(UNROLL_SIZE_I) " "
-      "-DUNROLL_SIZE_J=" STR(UNROLL_SIZE_J) " "
-      "-DUNROLL_SIZE_K=" STR(UNROLL_SIZE_K) " "
-      "-DGROUP_SIZE_I=" STR(GROUP_SIZE_I) " "
-      "-DGROUP_SIZE_J=" STR(GROUP_SIZE_J) " "
-      "-DGROUP_SIZE_K=" STR(GROUP_SIZE_K) " "
-      "-DTILE_SIZE_I=" STR(TILE_SIZE_I) " "
-      "-DTILE_SIZE_J=" STR(TILE_SIZE_J) " "
-      "-DTILE_SIZE_K=" STR(TILE_SIZE_K) " ";
-    
-    ierr = clBuildProgram(program, 0, NULL, options, NULL, NULL);
-    assert(!ierr);
-    
-    kernel = clCreateKernel(program, "ML_BSSN_CL_RHS2", NULL);
-    assert(kernel);
-    
-    mem_cctkGH =
-      clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
-                     sizeof *cctkGH, (cGH*)cctkGH, NULL);
-    assert(mem_cctkGH);
-    
-    mem_cctk_parameters =
-      clCreateBuffer(context, CL_MEM_COPY_HOST_PTR | CL_MEM_READ_ONLY,
-                     sizeof *cctk_parameters, (cctk_parameters_t*)cctk_parameters, NULL);
-    assert(mem_cctk_parameters);
-  }
-  
+
   int nargs = 0;
-  set_arg(kernel, nargs++, &mem_cctkGH);
-  set_arg(kernel, nargs++, &mem_cctk_parameters);
-  set_arg(kernel, nargs++, &cctk_arguments->At11.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At11_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At11_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At11rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At12rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At13rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At22rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At23rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->At33rhs.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt1_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt2_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->Xt3_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpha.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpha_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->alpha_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phi.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phi_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->phi_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt11_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt12_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt13_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt22_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt23_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->gt33_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta1_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta2_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->beta3_p_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trK.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trK_p.mem);
-  set_arg(kernel, nargs++, &cctk_arguments->trK_p_p.mem);
+  set_arg(kernel2, nargs++, &mem_cctkGH);
+  set_arg(kernel2, nargs++, &mem_cctk_parameters);
+  set_arg(kernel2, nargs++, &cctk_arguments->At11.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At11_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At11_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At12.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At12_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At12_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At13.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At13_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At13_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At22.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At22_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At22_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At23.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At23_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At23_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At33.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At33_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At33_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At11rhs.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At12rhs.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At13rhs.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At22rhs.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At23rhs.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->At33rhs.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt1.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt1_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt1_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt2.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt2_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt2_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt3.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt3_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->Xt3_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->alpha.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->alpha_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->alpha_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->phi.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->phi_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->phi_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt11.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt11_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt11_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt12.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt12_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt12_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt13.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt13_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt13_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt22.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt22_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt22_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt23.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt23_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt23_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt33.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt33_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->gt33_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta1.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta1_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta1_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta2.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta2_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta2_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta3.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta3_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->beta3_p_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->trK.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->trK_p.mem);
+  set_arg(kernel2, nargs++, &cctk_arguments->trK_p_p.mem);
   
   size_t const global_work_size[3] =
     { cctkGH->cctk_ash[0], cctkGH->cctk_ash[1], cctkGH->cctk_ash[2] };
   size_t const local_work_size[3] =
     { GROUP_SIZE_I, GROUP_SIZE_J, GROUP_SIZE_K };
   
-  ierr = clEnqueueNDRangeKernel(cmd_queue, kernel, dim,
+  ierr = clEnqueueNDRangeKernel(cmd_queue, kernel2, dim,
                                 NULL, global_work_size, local_work_size,  
                                 0, NULL, NULL);
+
   assert(!ierr);
   
   ierr = clFinish(cmd_queue);
   assert(!ierr);
-  
+
   return 0;
 }
 
@@ -1298,9 +1409,11 @@ void check(cGH              * const cctkGH,
 int main(int argc, char** argv)
 {
   printf("EinsteinToolkit test\n");
-  
-  
-  
+
+  if (argc > 1)
+    if (argv[1][0] == 's')
+      use_subdev = 1;
+
   printf("Reading sources...\n");
   FILE *const source1_file = fopen(SRCDIR "/ML_BSSN_CL_RHS1.cl", "r");
   assert(source1_file != NULL && "ML_BSSN_CL_RHS1.cl not found!");
@@ -1322,32 +1435,27 @@ int main(int argc, char** argv)
   source2[source2_size] = '\0';
   fclose(source2_file);
   
-  
-  
   printf("Initialise...\n");
-  setup();
+  setup(source1, source2);
   cGH cctkGH;
   cctk_parameters_t cctk_parameters;
   cctk_arguments_t cctk_arguments;
   init(&cctkGH, &cctk_parameters, &cctk_arguments);
-  
+
   printf("RHS1...\n");
-  exec_ML_BSSN_CL_RHS1(source1, &cctkGH, &cctk_parameters, &cctk_arguments);
+  exec_ML_BSSN_CL_RHS1(&cctkGH, &cctk_arguments);
   printf("RHS2...\n");
-  exec_ML_BSSN_CL_RHS2(source2, &cctkGH, &cctk_parameters, &cctk_arguments);
-  
+  exec_ML_BSSN_CL_RHS2(&cctkGH, &cctk_arguments);
   check(&cctkGH, &cctk_parameters, &cctk_arguments);
-  
-  
-  
+
   printf("Begin timing %d iterations...\n", niters);
   double min_elapsed = HUGE_VAL;
   double avg_elapsed = 0.0;
   for (int n=0; n<niters; ++n) {
     struct timeval tv0;
     gettimeofday(&tv0, NULL);
-    exec_ML_BSSN_CL_RHS1(source1, &cctkGH, &cctk_parameters, &cctk_arguments);
-    exec_ML_BSSN_CL_RHS2(source2, &cctkGH, &cctk_parameters, &cctk_arguments);
+    exec_ML_BSSN_CL_RHS1(&cctkGH, &cctk_arguments);
+    exec_ML_BSSN_CL_RHS2(&cctkGH, &cctk_arguments);
     struct timeval tv1;
     gettimeofday(&tv1, NULL);
     double const elapsed =
@@ -1369,7 +1477,7 @@ int main(int argc, char** argv)
   double const flop_per_point = 3400.0;
   printf("        This corresponds to %g GFlop/s\n",
          1.0e-9 * flop_per_point / time_per_point);
-  
+
   printf("\n");
   // VECTOR_SIZE_I=1: 3388 FLop per gpu
   // VECTOR_SIZE_I=2: 3418 Flop per gpu
@@ -1378,7 +1486,8 @@ int main(int argc, char** argv)
   printf("      Smaller numbers are better.\n");
   printf("\n");
   
-  
+  deinit(&cctkGH, &cctk_parameters, &cctk_arguments);
+  cleanup();
   
   printf ("Done.\n");
   return 0;
diff --git a/examples/Halide/CMakeLists.txt b/examples/Halide/CMakeLists.txt
index 2cf8abc..617285c 100644
--- a/examples/Halide/CMakeLists.txt
+++ b/examples/Halide/CMakeLists.txt
@@ -35,11 +35,16 @@ message(STATUS "Enabling testsuite ${TS_NAME}")
 list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
 set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
 
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/Halide/Halide.git")
+else()
+  set(FETCH_SOURCE URL "https://github.com/halide/Halide/archive/release_2017_10_30.tar.gz")
+endif()
+
 ExternalProject_Add(
   ${TS_NAME}
   PREFIX "${TS_BASEDIR}"
-  #DOWNLOAD_COMMAND "/bin/true"
-  GIT_REPOSITORY "https://github.com/Halide/Halide.git"
+  ${FETCH_SOURCE}
   #PATCH_COMMAND /bin/sh "${AMD_APP_SDK_TGZ}" --noexec --keep --target AMD-APP-SDK-3.0 &&
   #     patch -p1 -i ${CMAKE_CURRENT_SOURCE_DIR}/amdsdk3_0.patch
   CMAKE_ARGS
diff --git a/examples/IntelSVM/CMakeLists.txt b/examples/IntelSVM/CMakeLists.txt
index 5c20b2f..3dfecf7 100644
--- a/examples/IntelSVM/CMakeLists.txt
+++ b/examples/IntelSVM/CMakeLists.txt
@@ -58,7 +58,7 @@ if (EXISTS "${INTEL_ZIP}")
   # This seems to occur only with some LLVM built setups, so let's just skip
   # the tests completely.
 
-  if(!LLVM_3_8)
+  if(NOT LLVM_3_8)
     add_test(NAME intel_svm_coarse
       COMMAND "${TS_BUILDDIR}/coarse" -p 0 -t default
       WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/SVMBasicCoarseGrained")
@@ -69,7 +69,8 @@ if (EXISTS "${INTEL_ZIP}")
     set_tests_properties(intel_svm_coarse intel_svm_fine
       PROPERTIES  LABELS "IntelSVM;hsa")
     set_tests_properties(intel_svm_coarse intel_svm_fine
-      PROPERTIES  WILL_FAIL 1)
+      PROPERTIES  PASS_REGULAR_EXPRESSION "PASSED")
+
   endif()
 
 else()
diff --git a/examples/OpenCV/opencv.patch b/examples/OpenCV/opencv.patch
index 78aea87..acb8516 100644
--- a/examples/OpenCV/opencv.patch
+++ b/examples/OpenCV/opencv.patch
@@ -11,3 +11,14 @@ diff --git a/opencv-3.0.0-beta/cmake/OpenCVDetectOpenCL.cmake b/opencv-3.0.0-bet
  endif(APPLE)
  
  if(OPENCL_FOUND)
+--- a/opencv-3.0.0-beta/cmake/OpenCVPCHSupport.cmake	2017-08-04 12:56:24.393300005 +0200
++++ b/opencv-3.0.0-beta/cmake/OpenCVPCHSupport.cmake	2017-08-04 12:56:08.969876062 +0200
+@@ -24,7 +24,7 @@
+     ENDIF()
+ 
+     SET(_PCH_include_prefix "-I")
+-    SET(_PCH_isystem_prefix "-isystem")
++    SET(_PCH_isystem_prefix "-I")
+     SET(_PCH_define_prefix "-D")
+ 
+ ELSEIF(CMAKE_GENERATOR MATCHES "^Visual.*$")
diff --git a/examples/PyOpenCL/CMakeLists.txt b/examples/PyOpenCL/CMakeLists.txt
index 840393a..f84f56c 100644
--- a/examples/PyOpenCL/CMakeLists.txt
+++ b/examples/PyOpenCL/CMakeLists.txt
@@ -25,27 +25,83 @@
 
 set(TS_NAME "PyOpenCL")
 set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
-set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
-set(TS_BUILDDIR "${TS_SRCDIR}/PyOpenCL-build")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TS_BASEDIR}/src/${TS_NAME}")
 
-# see README on how to install PyOpenCL to the example dir
+# TODO for whatever reason, compyte (required for running tests)
+# is not included in pyopencl release tars. For now force using git
+set(EXAMPLES_USE_GIT_MASTER 1)
 
-if (EXISTS "${TS_BUILDDIR}/mypy/bin/py.test" AND
-    IS_DIRECTORY "${TS_BUILDDIR}/pyopencl/test")
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE
+      GIT_REPOSITORY "https://github.com/inducer/pyopencl.git"
+      GIT_SUBMODULES "pyopencl/compyte" "src/c_wrapper")
+else()
+  set(FETCH_SOURCE URL "https://github.com/inducer/pyopencl/archive/v2017.2.tar.gz")
+endif()
+
+if(NOT TESTS_USE_ICD)
+  message(STATUS "Disabling testsuite ${TS_NAME}, requires ocl-icd")
+elseif(NOT BASH)
+  message(STATUS "Disabling testsuite ${TS_NAME}, can't find bash shell")
+else()
+
+find_program(VIRTUALENV NAMES "virtualenv")
+find_program(PYTHON_INTERP NAMES python3 python2 python)
+
+if(VIRTUALENV)
 
   message(STATUS "Enabling testsuite ${TS_NAME}")
   list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
   set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
 
+  ExternalProject_Add(
+    ${TS_NAME}
+    PREFIX "${TS_BASEDIR}"
+    ${FETCH_SOURCE}
+
+    # have to use external scripts, because CMake has /bin/sh hardcoded,
+    # and /bin/sh on Ubuntu is dash, which doesn't support "source"
+    # command.
+    CONFIGURE_COMMAND "${BASH}" "${CMAKE_CURRENT_SOURCE_DIR}/configure.sh"
+      "${TS_BASEDIR}"
+      "${TS_BUILDDIR}"
+      "${TS_SRCDIR}"
+      "${VIRTUALENV}"
+      "${PYTHON_INTERP}"
+
+    BUILD_COMMAND "${BASH}" "${CMAKE_CURRENT_SOURCE_DIR}/build.sh"
+      "${TS_BASEDIR}"
+      "${TS_BUILDDIR}"
+      "${TS_SRCDIR}"
+
+    INSTALL_COMMAND "${BASH}" "${CMAKE_CURRENT_SOURCE_DIR}/install.sh"
+      "${TS_BASEDIR}"
+      "${TS_BUILDDIR}"
+      "${TS_SRCDIR}"
+
+  )
+
+
+  set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+  add_dependencies(prepare_examples ${TS_NAME})
+
   add_test(NAME "PyOpenCL"
-           COMMAND "${TS_SRCDIR}/test_build_dir")
+           COMMAND "${BASH}" "${CMAKE_CURRENT_SOURCE_DIR}/runtest.sh"
+            "${TS_BASEDIR}"
+            "${TS_BUILDDIR}"
+            "${TS_SRCDIR}"
+           )
 
   set_tests_properties("PyOpenCL"
     PROPERTIES
+      ENVIRONMENT "PYOPENCL_NO_CACHE=1;PYOPENCL_TEST=portable"
       LABELS "PyOpenCL")
 
 else()
 
-  message(STATUS "Disabling testsuite ${TS_NAME}, required files not found in ${TS_BUILDDIR}." )
+  message(STATUS "Disabling testsuite ${TS_NAME}, requires python virtualenv")
+
+endif()
 
 endif()
diff --git a/examples/PyOpenCL/build.sh b/examples/PyOpenCL/build.sh
new file mode 100644
index 0000000..a864415
--- /dev/null
+++ b/examples/PyOpenCL/build.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+TS_BASEDIR="$1"
+TS_BUILDDIR="$2"
+TS_SRCDIR="$3"
+
+source "${TS_BUILDDIR}/bin/activate" && cd "${TS_SRCDIR}" && python setup.py build
diff --git a/examples/PyOpenCL/configure.sh b/examples/PyOpenCL/configure.sh
new file mode 100644
index 0000000..3aafc94
--- /dev/null
+++ b/examples/PyOpenCL/configure.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+TS_BASEDIR="$1"
+TS_BUILDDIR="$2"
+TS_SRCDIR="$3"
+VIRTUALENV="$4"
+PYTH="$5"
+
+cd "${TS_BASEDIR}/src" && "${VIRTUALENV}" --system-site-packages "--python=$PYTH" "PyOpenCL-build" && source "${TS_BUILDDIR}/bin/activate" && cd "${TS_SRCDIR}" && python configure.py
diff --git a/examples/PyOpenCL/install.sh b/examples/PyOpenCL/install.sh
new file mode 100644
index 0000000..e107b24
--- /dev/null
+++ b/examples/PyOpenCL/install.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+TS_BASEDIR="$1"
+TS_BUILDDIR="$2"
+TS_SRCDIR="$3"
+
+source "${TS_BUILDDIR}/bin/activate" &&  cd "${TS_SRCDIR}" &&  python setup.py install
diff --git a/examples/PyOpenCL/runtest.sh b/examples/PyOpenCL/runtest.sh
new file mode 100644
index 0000000..246ca2f
--- /dev/null
+++ b/examples/PyOpenCL/runtest.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+TS_BASEDIR="$1"
+TS_BUILDDIR="$2"
+TS_SRCDIR="$3"
+
+cd "${TS_SRCDIR}/test" && source "${TS_BUILDDIR}/bin/activate" && py.test -v --tb=native
diff --git a/examples/PyOpenCL/test_build_dir b/examples/PyOpenCL/test_build_dir
index 42e83e9..732ed95 100755
--- a/examples/PyOpenCL/test_build_dir
+++ b/examples/PyOpenCL/test_build_dir
@@ -3,7 +3,12 @@
 # Runs the PyOpenCL tests against a pocl build dir located in the
 # current working directory. Assumes an ICD build in PWD.
 
-pyopencl_root=`dirname $0`
+pyopencl_root=$1
+if [ -z "$pyopencl_root" ]; then
+  pyopencl_root=`dirname $0`
+fi
+
+if [ -z "$OCL_ICD_VENDORS" ]; then
 
 pushd . > /dev/null
 cd $pyopencl_root
@@ -25,6 +30,13 @@ fi
 
 export POCL_BUILDING=1
 export OCL_ICD_VENDORS=$PWD/ocl-vendors
+
+else
+
+echo "###### NOTE: using OCL_ICD_VENDORS already set to: $OCL_ICD_VENDORS, POCL_BUILDING set to: $POCL_BUILDING"
+
+fi
+
 export OPENCL_VENDOR_PATH=$OCL_ICD_VENDORS
 
 cd $pyopencl_root/PyOpenCL-build || exit -1
diff --git a/examples/Rodinia/CMakeLists.txt b/examples/Rodinia/CMakeLists.txt
index f87e9e3..9427861 100644
--- a/examples/Rodinia/CMakeLists.txt
+++ b/examples/Rodinia/CMakeLists.txt
@@ -30,7 +30,9 @@ set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
 set(RODINIA "rodinia_3.1")
 set(RODINIA_TGZ "${TS_SRCDIR}/${RODINIA}.tar.bz2")
 
-if (EXISTS "${RODINIA_TGZ}")
+if(NOT MAKE_PROGRAM)
+  message(STATUS "Disabling testsuite ${TS_NAME}, requires make to build")
+elseif (EXISTS "${RODINIA_TGZ}")
 
   message(STATUS "Enabling testsuite ${TS_NAME}")
   list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
@@ -42,7 +44,7 @@ if (EXISTS "${RODINIA_TGZ}")
     DOWNLOAD_COMMAND test -d ${RODINIA} || /bin/bash -c "pwd && echo 'extracting rodinia tgz' && tar xjf '${RODINIA_TGZ}' && patch -p1 -r- -N -i '${CMAKE_CURRENT_SOURCE_DIR}/Rodinia.patch' && rmdir Rodinia && mv ${RODINIA} Rodinia"
     CONFIGURE_COMMAND /bin/true
     BUILD_IN_SOURCE 1
-    BUILD_COMMAND pwd && make OPENCL
+    BUILD_COMMAND pwd && ${MAKE_PROGRAM} OPENCL
       "CFLAGS=-Wno-unused-result -DCL_USE_DEPRECATED_OPENCL_1_2_APIS -DCL_USE_DEPRECATED_OPENCL_1_1_APIS"
       "OPENCL_INC=${CMAKE_SOURCE_DIR}/include"
     INSTALL_COMMAND /bin/true
diff --git a/examples/VexCL/CMakeLists.txt b/examples/VexCL/CMakeLists.txt
index 9e2528f..2166936 100644
--- a/examples/VexCL/CMakeLists.txt
+++ b/examples/VexCL/CMakeLists.txt
@@ -28,24 +28,35 @@ set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
 set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
 set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
 
-if(NOT HAVE_GIT)
-  message(STATUS "Disabling testsuite ${TS_NAME}, requires git to checkout sources")
-elseif(NOT TESTS_USE_ICD)
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/ddemidov/vexcl.git")
+else()
+  set(FETCH_SOURCE URL "https://github.com/ddemidov/vexcl/archive/1.4.1.tar.gz")
+endif()
+
+if(NOT TESTS_USE_ICD)
+
   message(STATUS "Disabling testsuite ${TS_NAME}, requires ocl-icd")
 
 else()
+
 message(STATUS "Enabling testsuite ${TS_NAME}")
 list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
 set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
 
 ExternalProject_Add(
   ${TS_NAME}
+  ${FETCH_SOURCE}
   PREFIX "${TS_BASEDIR}"
-  GIT_REPOSITORY "https://github.com/ddemidov/vexcl.git"
+
   CMAKE_ARGS
     -DVEXCL_CACHE_KERNELS=OFF
     -DCMAKE_BUILD_TYPE=RelWithDebInfo
     -DVEXCL_BACKEND=OpenCL
+    -DVEXCL_BUILD_EXAMPLES=ON
+    -DVEXCL_BUILD_TESTS=ON
+    -DVEXCL_CACHE_KERNELS=OFF
+    -DVEXCL_HAVE_BOOST_COMPUTE=ON
   INSTALL_COMMAND /bin/true
 )
 
diff --git a/examples/ViennaCL/CMakeLists.txt b/examples/ViennaCL/CMakeLists.txt
index ceaebbf..b856364 100644
--- a/examples/ViennaCL/CMakeLists.txt
+++ b/examples/ViennaCL/CMakeLists.txt
@@ -30,6 +30,12 @@ set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
 set(VIENNA "ViennaCL-1.7.0")
 set(ViennaCL_TGZ "${TS_SRCDIR}/${VIENNA}.tar.gz")
 
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/viennacl/viennacl-dev.git")
+else()
+  set(FETCH_SOURCE URL "http://sourceforge.net/projects/viennacl/files/1.7.x/ViennaCL-1.7.1.tar.gz/download")
+endif()
+
 if(EXISTS "${ViennaCL_TGZ}")
 
   message(STATUS "Enabling testsuite ${TS_NAME}")
@@ -102,16 +108,16 @@ if(EXISTS "${ViennaCL_TGZ}")
           COMMAND "${TS_BUILDDIR}/examples/tutorial/libviennacl-tutorial"
           WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
 
-  add_test(NAME viennacl_benchmarks_opencl_bench_opencl
-          COMMAND "${TS_BUILDDIR}/examples/benchmarks/opencl-bench-opencl"
-          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
-  add_test(NAME viennacl_benchmarks_dense_blas_bench_opencl
-          COMMAND "${TS_BUILDDIR}/examples/benchmarks/dense_blas-bench-opencl"
-          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+#  add_test(NAME viennacl_benchmarks_opencl_bench_opencl
+#          COMMAND "${TS_BUILDDIR}/examples/benchmarks/opencl-bench-opencl"
+#          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+#  add_test(NAME viennacl_benchmarks_dense_blas_bench_opencl
+#          COMMAND "${TS_BUILDDIR}/examples/benchmarks/dense_blas-bench-opencl"
+#          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
 
-  add_test(NAME viennacl_tests_bisect_test_opencl
-          COMMAND "${TS_BUILDDIR}/tests/bisect-test-opencl"
-          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+#  add_test(NAME viennacl_tests_bisect_test_opencl
+#          COMMAND "${TS_BUILDDIR}/tests/bisect-test-opencl"
+#          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
   add_test(NAME viennacl_tests_fft_1d_test_opencl
           COMMAND "${TS_BUILDDIR}/tests/fft_1d-test-opencl"
           WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
@@ -202,9 +208,9 @@ if(EXISTS "${ViennaCL_TGZ}")
   add_test(NAME viennacl_tests_spmdm_test_opencl
           COMMAND "${TS_BUILDDIR}/tests/spmdm-test-opencl"
           WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
-  add_test(NAME viennacl_tests_svd_test_opencl
-          COMMAND "${TS_BUILDDIR}/tests/svd-test-opencl"
-          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
+#  add_test(NAME viennacl_tests_svd_test_opencl
+#          COMMAND "${TS_BUILDDIR}/tests/svd-test-opencl"
+#          WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
   add_test(NAME viennacl_tests_tql_test_opencl
           COMMAND "${TS_BUILDDIR}/tests/tql-test-opencl"
           WORKING_DIRECTORY "${TS_BUILDDIR}/tests")
@@ -242,14 +248,16 @@ if(EXISTS "${ViennaCL_TGZ}")
     PROPERTIES
       LABELS "ViennaCL ViennaCL_examples")
 
-  set_tests_properties(
-  viennacl_benchmarks_opencl_bench_opencl
-  viennacl_benchmarks_dense_blas_bench_opencl
-    PROPERTIES
-      LABELS "ViennaCL ViennaCL_benchmarks")
+  # too slow
+  # set_tests_properties(
+  # viennacl_benchmarks_opencl_bench_opencl
+  # viennacl_benchmarks_dense_blas_bench_opencl
+  #  PROPERTIES
+  #    LABELS "ViennaCL_benchmarks")
 
   set_tests_properties(
-  viennacl_tests_bisect_test_opencl
+  # slow
+  # viennacl_tests_bisect_test_opencl
   viennacl_tests_fft_1d_test_opencl
   viennacl_tests_external_linkage_opencl
   viennacl_tests_blas3_solve_test_opencl
@@ -280,7 +288,8 @@ if(EXISTS "${ViennaCL_TGZ}")
   viennacl_tests_sparse_test_opencl
   viennacl_tests_structured_matrices_test_opencl
   viennacl_tests_spmdm_test_opencl
-  viennacl_tests_svd_test_opencl
+  # slow
+  # viennacl_tests_svd_test_opencl
   viennacl_tests_tql_test_opencl
   viennacl_tests_vector_convert_test_opencl
   viennacl_tests_vector_float_double_test_opencl
diff --git a/examples/arrayfire/CMakeLists.txt b/examples/arrayfire/CMakeLists.txt
index 5fc46cf..902150c 100644
--- a/examples/arrayfire/CMakeLists.txt
+++ b/examples/arrayfire/CMakeLists.txt
@@ -31,6 +31,12 @@ set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
 find_package(LAPACK QUIET)
 find_package(Boost QUIET)
 
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/arrayfire/arrayfire.git")
+else()
+  set(FETCH_SOURCE URL "http://arrayfire.com/arrayfire_source/arrayfire-full-3.5.1.tar.bz2")
+endif()
+
 if(HAVE_GIT)
 
 if(LAPACK_FOUND AND
@@ -43,7 +49,7 @@ if(LAPACK_FOUND AND
   ExternalProject_Add(
     ${TS_NAME}
     PREFIX "${TS_BASEDIR}"
-    GIT_REPOSITORY "https://github.com/arrayfire/${TS_NAME}.git"
+    ${FETCH_SOURCE}
     #PATCH_COMMAND  pwd && sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g" *.cpp
     #UPDATE_COMMAND /bin/true
     CMAKE_ARGS
@@ -60,8 +66,6 @@ if(LAPACK_FOUND AND
   set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
   add_dependencies(prepare_examples ${TS_NAME})
 
-
-
   add_test(NAME arrayfire_tests_approx1_opencl
            COMMAND "${TS_BUILDDIR}/test/approx1_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
@@ -161,21 +165,12 @@ if(LAPACK_FOUND AND
   add_test(NAME arrayfire_tests_gfor_opencl
            COMMAND "${TS_BUILDDIR}/test/gfor_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
-  add_test(NAME arrayfire_tests_gloh_nonfree_opencl
-           COMMAND "${TS_BUILDDIR}/test/gloh_nonfree_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
   add_test(NAME arrayfire_tests_gradient_opencl
            COMMAND "${TS_BUILDDIR}/test/gradient_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
   add_test(NAME arrayfire_tests_hamming_opencl
            COMMAND "${TS_BUILDDIR}/test/hamming_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
-  add_test(NAME arrayfire_tests_harris_opencl
-           COMMAND "${TS_BUILDDIR}/test/harris_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
-  add_test(NAME arrayfire_tests_histogram_opencl
-           COMMAND "${TS_BUILDDIR}/test/histogram_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
   add_test(NAME arrayfire_tests_homography_opencl
            COMMAND "${TS_BUILDDIR}/test/homography_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
@@ -212,9 +207,6 @@ if(LAPACK_FOUND AND
   add_test(NAME arrayfire_tests_lu_dense_opencl
            COMMAND "${TS_BUILDDIR}/test/lu_dense_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
-  add_test(NAME arrayfire_tests_manual_memory_test_opencl
-           COMMAND "${TS_BUILDDIR}/test/manual_memory_test_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
   add_test(NAME arrayfire_tests_match_template_opencl
            COMMAND "${TS_BUILDDIR}/test/match_template_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
@@ -233,9 +225,10 @@ if(LAPACK_FOUND AND
   add_test(NAME arrayfire_tests_medfilt_opencl
            COMMAND "${TS_BUILDDIR}/test/medfilt_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
-  add_test(NAME arrayfire_tests_median_opencl
-           COMMAND "${TS_BUILDDIR}/test/median_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+# takes too long
+#  add_test(NAME arrayfire_tests_median_opencl
+#           COMMAND "${TS_BUILDDIR}/test/median_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
   add_test(NAME arrayfire_tests_memory_lock_opencl
            COMMAND "${TS_BUILDDIR}/test/memory_lock_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
@@ -302,9 +295,6 @@ if(LAPACK_FOUND AND
   add_test(NAME arrayfire_tests_set_opencl
            COMMAND "${TS_BUILDDIR}/test/set_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
-  add_test(NAME arrayfire_tests_sift_nonfree_opencl
-           COMMAND "${TS_BUILDDIR}/test/sift_nonfree_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/test")
   add_test(NAME arrayfire_tests_sobel_opencl
            COMMAND "${TS_BUILDDIR}/test/sobel_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
@@ -367,69 +357,31 @@ if(LAPACK_FOUND AND
            WORKING_DIRECTORY "${TS_BUILDDIR}/test")
 
 
-  add_test(NAME arrayfire_examples_adaptive_thresholding_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/adaptive_thresholding_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_binary_thresholding_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/binary_thresholding_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_brain_segmentation_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/brain_segmentation_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_edge_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/edge_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_filters_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/filters_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_image_demo_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/image_demo_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_image_editing_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/image_editing_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_morphing_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/morphing_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_optical_flow_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/optical_flow_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_pyramids_opencl
-           COMMAND "${TS_BUILDDIR}/examples/image_processing/pyramids_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
-  add_test(NAME arrayfire_examples_bagging_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/bagging_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_deep_belief_net_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/deep_belief_net_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_kmeans_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/kmeans_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_knn_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/knn_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_logistic_regression_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/logistic_regression_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+# these tests are commented out b/c they take too much time on CPU.
+#  add_test(NAME arrayfire_examples_deep_belief_net_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/deep_belief_net_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+#  add_test(NAME arrayfire_examples_kmeans_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/kmeans_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+#  add_test(NAME arrayfire_examples_logistic_regression_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/logistic_regression_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
   add_test(NAME arrayfire_examples_naive_bayes_opencl
            COMMAND "${TS_BUILDDIR}/examples/machine_learning/naive_bayes_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_neural_network_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/neural_network_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_perceptron_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/perceptron_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_rbm_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/rbm_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_softmax_regression_opencl
-           COMMAND "${TS_BUILDDIR}/examples/machine_learning/softmax_regression_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
-  add_test(NAME arrayfire_examples_basic_opencl
-           COMMAND "${TS_BUILDDIR}/examples/unified/basic_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/unified")
+#  add_test(NAME arrayfire_examples_neural_network_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/neural_network_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+#  add_test(NAME arrayfire_examples_perceptron_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/perceptron_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+#  add_test(NAME arrayfire_examples_rbm_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/rbm_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+#  add_test(NAME arrayfire_examples_softmax_regression_opencl
+#           COMMAND "${TS_BUILDDIR}/examples/machine_learning/softmax_regression_opencl"
+#           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
   add_test(NAME arrayfire_examples_black_scholes_options_opencl
            COMMAND "${TS_BUILDDIR}/examples/financial/black_scholes_options_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/examples/financial")
@@ -472,45 +424,111 @@ if(LAPACK_FOUND AND
   add_test(NAME arrayfire_examples_vectorize_opencl
            COMMAND "${TS_BUILDDIR}/examples/getting_started/vectorize_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/examples/getting_started")
-  add_test(NAME arrayfire_examples_conway_opencl
-           COMMAND "${TS_BUILDDIR}/examples/graphics/conway_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
-  add_test(NAME arrayfire_examples_conway_pretty_opencl
-           COMMAND "${TS_BUILDDIR}/examples/graphics/conway_pretty_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
-  add_test(NAME arrayfire_examples_fractal_opencl
-           COMMAND "${TS_BUILDDIR}/examples/graphics/fractal_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
   add_test(NAME arrayfire_examples_histogram_opencl
            COMMAND "${TS_BUILDDIR}/examples/graphics/histogram_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
-  add_test(NAME arrayfire_examples_plot2d_opencl
-           COMMAND "${TS_BUILDDIR}/examples/graphics/plot2d_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
-  add_test(NAME arrayfire_examples_plot3_opencl
-           COMMAND "${TS_BUILDDIR}/examples/graphics/plot3_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
-  add_test(NAME arrayfire_examples_surface_opencl
-           COMMAND "${TS_BUILDDIR}/examples/graphics/surface_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
-  add_test(NAME arrayfire_examples_fast_opencl
-           COMMAND "${TS_BUILDDIR}/examples/computer_vision/fast_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
-  add_test(NAME arrayfire_examples_harris_opencl
-           COMMAND "${TS_BUILDDIR}/examples/computer_vision/harris_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
-  add_test(NAME arrayfire_examples_matching_opencl
-           COMMAND "${TS_BUILDDIR}/examples/computer_vision/matching_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
-  add_test(NAME arrayfire_examples_susan_opencl
-           COMMAND "${TS_BUILDDIR}/examples/computer_vision/susan_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
   add_test(NAME arrayfire_examples_helloworld_opencl
            COMMAND "${TS_BUILDDIR}/examples/helloworld/helloworld_opencl"
            WORKING_DIRECTORY "${TS_BUILDDIR}/examples/helloworld")
-  add_test(NAME arrayfire_examples_swe_opencl
-           COMMAND "${TS_BUILDDIR}/examples/pde/swe_opencl"
-           WORKING_DIRECTORY "${TS_BUILDDIR}/examples/pde")
+
+  if (IMAGE_TESTS_ENABLED)
+    set (IMAGE_TESTS_LIST
+        arrayfire_examples_fast_opencl
+        arrayfire_examples_matching_opencl
+        arrayfire_examples_susan_opencl
+        arrayfire_examples_adaptive_thresholding_opencl
+        arrayfire_examples_binary_thresholding_opencl
+        arrayfire_examples_brain_segmentation_opencl
+        arrayfire_examples_filters_opencl
+        arrayfire_examples_edge_opencl
+        arrayfire_examples_plot2d_opencl
+        arrayfire_examples_plot3_opencl
+        arrayfire_examples_image_demo_opencl
+        arrayfire_examples_image_editing_opencl
+        arrayfire_examples_morphing_opencl
+        arrayfire_examples_optical_flow_opencl
+        arrayfire_examples_pyramids_opencl
+        arrayfire_examples_bagging_opencl
+        arrayfire_examples_conway_opencl
+        arrayfire_examples_conway_pretty_opencl
+        arrayfire_examples_fractal_opencl
+        arrayfire_examples_histogram_opencl
+        arrayfire_examples_surface_opencl
+        arrayfire_examples_harris_opencl
+        arrayfire_examples_swe_opencl
+    )
+    add_test(NAME arrayfire_examples_fast_opencl
+             COMMAND "${TS_BUILDDIR}/examples/computer_vision/fast_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+    add_test(NAME arrayfire_examples_matching_opencl
+             COMMAND "${TS_BUILDDIR}/examples/computer_vision/matching_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+    add_test(NAME arrayfire_examples_susan_opencl
+             COMMAND "${TS_BUILDDIR}/examples/computer_vision/susan_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/computer_vision")
+    add_test(NAME arrayfire_examples_adaptive_thresholding_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/adaptive_thresholding_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_binary_thresholding_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/binary_thresholding_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_brain_segmentation_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/brain_segmentation_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_filters_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/filters_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_edge_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/edge_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_plot2d_opencl
+             COMMAND "${TS_BUILDDIR}/examples/graphics/plot2d_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+    add_test(NAME arrayfire_examples_plot3_opencl
+             COMMAND "${TS_BUILDDIR}/examples/graphics/plot3_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+    add_test(NAME arrayfire_examples_image_demo_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/image_demo_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_image_editing_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/image_editing_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_morphing_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/morphing_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_optical_flow_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/optical_flow_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_pyramids_opencl
+             COMMAND "${TS_BUILDDIR}/examples/image_processing/pyramids_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/image_processing")
+    add_test(NAME arrayfire_examples_bagging_opencl
+             COMMAND "${TS_BUILDDIR}/examples/machine_learning/bagging_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/machine_learning")
+    add_test(NAME arrayfire_examples_conway_opencl
+             COMMAND "${TS_BUILDDIR}/examples/graphics/conway_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+    add_test(NAME arrayfire_examples_conway_pretty_opencl
+             COMMAND "${TS_BUILDDIR}/examples/graphics/conway_pretty_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+    add_test(NAME arrayfire_examples_fractal_opencl
+             COMMAND "${TS_BUILDDIR}/examples/graphics/fractal_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+    add_test(NAME arrayfire_tests_histogram_opencl
+             COMMAND "${TS_BUILDDIR}/test/histogram_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+    add_test(NAME arrayfire_tests_harris_opencl
+             COMMAND "${TS_BUILDDIR}/test/harris_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/test")
+    add_test(NAME arrayfire_examples_surface_opencl
+             COMMAND "${TS_BUILDDIR}/examples/graphics/surface_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/graphics")
+    add_test(NAME arrayfire_examples_swe_opencl
+             COMMAND "${TS_BUILDDIR}/examples/pde/swe_opencl"
+             WORKING_DIRECTORY "${TS_BUILDDIR}/examples/pde")
+  else()
+    unset (IMAGE_TESTS_LIST)
+  endif()
 
   set_tests_properties(
     arrayfire_tests_approx1_opencl
@@ -546,11 +564,8 @@ if(LAPACK_FOUND AND
     arrayfire_tests_gen_index_opencl
     arrayfire_tests_getting_started_opencl
     arrayfire_tests_gfor_opencl
-    arrayfire_tests_gloh_nonfree_opencl
     arrayfire_tests_gradient_opencl
     arrayfire_tests_hamming_opencl
-    arrayfire_tests_harris_opencl
-    arrayfire_tests_histogram_opencl
     arrayfire_tests_homography_opencl
     arrayfire_tests_hsv_rgb_opencl
     arrayfire_tests_iir_opencl
@@ -563,14 +578,13 @@ if(LAPACK_FOUND AND
     arrayfire_tests_jit_opencl
     arrayfire_tests_join_opencl
     arrayfire_tests_lu_dense_opencl
-    arrayfire_tests_manual_memory_test_opencl
     arrayfire_tests_match_template_opencl
     arrayfire_tests_math_opencl
     arrayfire_tests_matrix_manipulation_opencl
     arrayfire_tests_mean_opencl
     arrayfire_tests_meanshift_opencl
     arrayfire_tests_medfilt_opencl
-    arrayfire_tests_median_opencl
+#    arrayfire_tests_median_opencl
     arrayfire_tests_memory_lock_opencl
     arrayfire_tests_memory_opencl
     arrayfire_tests_missing_opencl
@@ -594,7 +608,6 @@ if(LAPACK_FOUND AND
     arrayfire_tests_rotate_opencl
     arrayfire_tests_select_opencl
     arrayfire_tests_set_opencl
-    arrayfire_tests_sift_nonfree_opencl
     arrayfire_tests_sobel_opencl
     arrayfire_tests_solve_dense_opencl
     arrayfire_tests_shift_opencl
@@ -620,28 +633,17 @@ if(LAPACK_FOUND AND
     PROPERTIES
       LABELS "${TS_NAME} ${TS_NAME}_tests")
 
+
   set_tests_properties(
-    arrayfire_examples_adaptive_thresholding_opencl
-    arrayfire_examples_binary_thresholding_opencl
-    arrayfire_examples_brain_segmentation_opencl
-    arrayfire_examples_edge_opencl
-    arrayfire_examples_filters_opencl
-    arrayfire_examples_image_demo_opencl
-    arrayfire_examples_image_editing_opencl
-    arrayfire_examples_morphing_opencl
-    arrayfire_examples_optical_flow_opencl
-    arrayfire_examples_pyramids_opencl
-    arrayfire_examples_bagging_opencl
-    arrayfire_examples_deep_belief_net_opencl
-    arrayfire_examples_kmeans_opencl
-    arrayfire_examples_knn_opencl
-    arrayfire_examples_logistic_regression_opencl
+    ${IMAGE_TESTS_LIST}
+#    arrayfire_examples_deep_belief_net_opencl
+#    arrayfire_examples_kmeans_opencl
+#    arrayfire_examples_logistic_regression_opencl
     arrayfire_examples_naive_bayes_opencl
-    arrayfire_examples_neural_network_opencl
-    arrayfire_examples_perceptron_opencl
-    arrayfire_examples_rbm_opencl
-    arrayfire_examples_softmax_regression_opencl
-    arrayfire_examples_basic_opencl
+#    arrayfire_examples_neural_network_opencl
+#    arrayfire_examples_perceptron_opencl
+#    arrayfire_examples_rbm_opencl
+#    arrayfire_examples_softmax_regression_opencl
     arrayfire_examples_black_scholes_options_opencl
     arrayfire_examples_heston_model_opencl
     arrayfire_examples_monte_carlo_options_opencl
@@ -656,20 +658,8 @@ if(LAPACK_FOUND AND
     arrayfire_examples_integer_opencl
     arrayfire_examples_rainfall_opencl
     arrayfire_examples_vectorize_opencl
-    arrayfire_examples_conway_opencl
-    arrayfire_examples_conway_pretty_opencl
-    arrayfire_examples_fractal_opencl
 #    arrayfire_examples_gravity_sim_opencl
-    arrayfire_examples_histogram_opencl
-    arrayfire_examples_plot2d_opencl
-    arrayfire_examples_plot3_opencl
-    arrayfire_examples_surface_opencl
-    arrayfire_examples_fast_opencl
-    arrayfire_examples_harris_opencl
-    arrayfire_examples_matching_opencl
-    arrayfire_examples_susan_opencl
     arrayfire_examples_helloworld_opencl
-    arrayfire_examples_swe_opencl
     PROPERTIES
       LABELS "${TS_NAME} ${TS_NAME}_examples")
 
diff --git a/examples/clBLAS/CMakeLists.txt b/examples/clBLAS/CMakeLists.txt
index 6f56293..7ca0642 100644
--- a/examples/clBLAS/CMakeLists.txt
+++ b/examples/clBLAS/CMakeLists.txt
@@ -30,9 +30,13 @@ set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
 
 find_package(Boost 1.44 QUIET)
 
-if(NOT HAVE_GIT)
-  message(STATUS "Disabling testsuite ${TS_NAME}, requires git to checkout sources")
-elseif(NOT Boost_FOUND)
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/clMathLibraries/clBLAS.git")
+else()
+  set(FETCH_SOURCE URL "https://github.com/clMathLibraries/clBLAS/archive/v2.12.tar.gz")
+endif()
+
+if(NOT Boost_FOUND)
   message(STATUS "Disabling testsuite ${TS_NAME}, required Boost version not found" )
 elseif(NOT TESTS_USE_ICD)
   message(STATUS "Disabling testsuite ${TS_NAME}, requires ocl-icd" )
@@ -45,9 +49,9 @@ else()
   ExternalProject_Add(
     ${TS_NAME}
     PREFIX "${TS_BASEDIR}"
-    GIT_REPOSITORY "https://github.com/clMathLibraries/${TS_NAME}.git"
-    PATCH_COMMAND cd src && pwd && find samples/ -type f -name *.c | xargs sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g"
-    UPDATE_COMMAND /bin/true
+    ${FETCH_SOURCE}
+    PATCH_COMMAND cd src && pwd && patch -p1 -i ${CMAKE_SOURCE_DIR}/examples/clBLAS/clBLAS_float_error.patch && patch -p1 -i ${CMAKE_SOURCE_DIR}/examples/clBLAS/clBLAS_link.patch
+    UPDATE_COMMAND pwd && find ./src/samples/ -type f -name *.c -o -name *.cpp  | xargs sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g"
     CONFIGURE_COMMAND ${CMAKE_COMMAND}
       -DBUILD_RUNTIME=ON
       -DBUILD_TEST=OFF
diff --git a/examples/clBLAS/clBLAS_float_error.patch b/examples/clBLAS/clBLAS_float_error.patch
new file mode 100644
index 0000000..4ccf8bb
--- /dev/null
+++ b/examples/clBLAS/clBLAS_float_error.patch
@@ -0,0 +1,11 @@
+--- a/samples/example_ctrsm.c	2017-04-10 13:40:17.793250767 +0200
++++ b/samples/example_ctrsm.c	2017-04-10 13:40:31.869925562 +0200
+@@ -71,7 +71,7 @@
+     nrows = (sizeof(result) / sizeof(FloatComplex)) / ldb;
+     for (i = 0; i < nrows; i++) {
+         for (j = 0; j < ldb; j++) {
+-            printf("%.5f ", result[i * ldb + j].x);
++            printf("%.5f ", result[i * ldb + j].s[0]);
+         }
+         printf("\n");
+     }
diff --git a/examples/clBLAS/clBLAS_link.patch b/examples/clBLAS/clBLAS_link.patch
new file mode 100644
index 0000000..47e681c
--- /dev/null
+++ b/examples/clBLAS/clBLAS_link.patch
@@ -0,0 +1,12 @@
+--- a/library/CMakeLists.txt	2017-04-07 16:05:59.000000000 +0200
++++ b/library/CMakeLists.txt	2017-04-07 16:06:19.000000000 +0200
+@@ -887,7 +887,7 @@
+ set_target_properties(clBLAS PROPERTIES VERSION ${clBLAS_VERSION})
+ set_target_properties(clBLAS PROPERTIES SOVERSION ${clBLAS_SOVERSION})
+ set_target_properties( clBLAS PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )
+-target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY} ${THREAD_LIBRARY})
++target_link_libraries(clBLAS ${OPENCL_LIBRARIES} ${MATH_LIBRARY} ${THREAD_LIBRARY} "cblas")
+ 
+ # CPack configuration; include the executable into the package
+ install( TARGETS clBLAS
+
diff --git a/examples/clFFT/CMakeLists.txt b/examples/clFFT/CMakeLists.txt
index 1864858..d8ec501 100644
--- a/examples/clFFT/CMakeLists.txt
+++ b/examples/clFFT/CMakeLists.txt
@@ -30,9 +30,13 @@ set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
 
 find_package(Boost 1.44 QUIET)
 
-if(NOT HAVE_GIT)
-  message(STATUS "Disabling testsuite ${TS_NAME}, requires git to checkout sources")
-elseif(NOT TESTS_USE_ICD)
+if(EXAMPLES_USE_GIT_MASTER)
+  set(FETCH_SOURCE GIT_REPOSITORY "https://github.com/clMathLibraries/clFFT.git")
+else()
+  set(FETCH_SOURCE URL "https://github.com/clMathLibraries/clFFT/archive/v2.12.2.tar.gz")
+endif()
+
+if(NOT TESTS_USE_ICD)
   message(STATUS "Disabling testsuite ${TS_NAME}, requires ocl-icd")
 elseif(NOT Boost_FOUND)
   message(STATUS "Disabling testsuite ${TS_NAME}, required Boost version not found" )
@@ -45,9 +49,9 @@ else()
   ExternalProject_Add(
     ${TS_NAME}
     PREFIX "${TS_BASEDIR}"
-    GIT_REPOSITORY "https://github.com/clMathLibraries/${TS_NAME}.git"
+    ${FETCH_SOURCE}
     #PATCH_COMMAND  pwd && sed -i "s/CL_DEVICE_TYPE_GPU/CL_DEVICE_TYPE_CPU/g" *.cpp
-    UPDATE_COMMAND /bin/true
+
     CONFIGURE_COMMAND ${CMAKE_COMMAND}
       -DBUILD_RUNTIME=ON
       -DBUILD_CLIENT=ON
diff --git a/examples/conformance/CMakeLists.txt b/examples/conformance/CMakeLists.txt
new file mode 100644
index 0000000..46ec2a4
--- /dev/null
+++ b/examples/conformance/CMakeLists.txt
@@ -0,0 +1,1978 @@
+#=============================================================================
+#   CMake build system files
+#
+#   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+#
+#   Permission is hereby granted, free of charge, to any person obtaining a copy
+#   of this software and associated documentation files (the "Software"), to deal
+#   in the Software without restriction, including without limitation the rights
+#   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+#   copies of the Software, and to permit persons to whom the Software is
+#   furnished to do so, subject to the following conditions:
+#
+#   The above copyright notice and this permission notice shall be included in
+#   all copies or substantial portions of the Software.
+#
+#   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+#   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+#   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+#   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+#   THE SOFTWARE.
+#
+#=============================================================================
+
+set(TS_NAME "conformance")
+set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
+set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
+set(TS_SRCDIR "${TESTSUITE_SOURCE_BASEDIR}/${TS_NAME}")
+
+if(NOT HAVE_GIT)
+  message(STATUS "Disabling testsuite ${TS_NAME}, requires git to checkout sources")
+elseif(NOT TESTS_USE_ICD)
+  message(STATUS "Disabling testsuite ${TS_NAME}, requires ocl-icd")
+
+elseif(NOT MAKE_PROGRAM)
+  message(STATUS "Disabling testsuite ${TS_NAME}, requires make to build")
+
+else()
+
+message(STATUS "Enabling testsuite ${TS_NAME}")
+list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
+
+ExternalProject_Add(
+  ${TS_NAME}
+  PREFIX "${TS_BASEDIR}"
+  GIT_REPOSITORY "https://github.com/franz/OpenCL-CTS"
+  GIT_TAG "cl12_trunk"
+  CONFIGURE_COMMAND ${CMAKE_COMMAND} -DCMAKE_BUILD_TYPE=Release
+      "${TS_BASEDIR}/src/${TS_NAME}/test_conformance"
+  INSTALL_COMMAND /bin/true
+)
+
+set_target_properties(${TS_NAME} PROPERTIES EXCLUDE_FROM_ALL TRUE)
+add_dependencies(prepare_examples ${TS_NAME})
+
+add_test(NAME "${TS_NAME}_allocations_buffer"
+         COMMAND "${TS_BUILDDIR}/allocations/test_allocations" buffer buffer_non_blocking)
+add_test(NAME "${TS_NAME}_allocations_image"
+         COMMAND "${TS_BUILDDIR}/allocations/test_allocations" image2d_read  image2d_write  image2d_read_non_blocking  image2d_write_non_blocking)
+
+# When running LLVM 5.0+, run all tests except the get_kernel_arg_info.
+# That test incorrectly assumes "const" type qualifiers are returned
+# for non-pointer arguments too; clang 4 does this, clang 5 does not.
+# The spec PDF says type qualifiers are only returned for pointer types
+# (so this test & clang 4 seems to be wrong, clang 5 is correct)
+if(LLVM_VERSION VERSION_LESS "5.0")
+  add_test(NAME "${TS_NAME}_api"
+           COMMAND "${TS_BUILDDIR}/api/test_api" )
+else()
+  add_test(NAME "${TS_NAME}_api"
+           COMMAND "${TS_BUILDDIR}/api/test_api" get_platform_info  get_sampler_info  get_command_queue_info  get_context_info  get_device_info  enqueue_task  binary_get  binary_create  kernel_required_group_size  release_kernel_order  release_during_execute  load_single_kernel  load_two_kernels  load_two_kernels_in_one  load_two_kernels_manually  get_program_info_kernel_names   create_kernels_in_program  get_kernel_info  execute_kernel_local_sizes  set_kernel_arg_by_index  set_kernel_ar [...]
+endif()
+
+add_test(NAME "${TS_NAME}_atomics"
+         COMMAND "${TS_BUILDDIR}/atomics/test_atomics" )
+
+add_test(NAME "${TS_NAME}_basic_math"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" fpmath_float  fpmath_float2  fpmath_float4  intmath_int  intmath_int2  intmath_int4  intmath_long  intmath_long2  intmath_long4  int2float  float2int)
+add_test(NAME "${TS_NAME}_basic_s2v"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" explicit_s2v_bool  explicit_s2v_char  explicit_s2v_uchar  explicit_s2v_short  explicit_s2v_ushort  explicit_s2v_int  explicit_s2v_uint  explicit_s2v_long  explicit_s2v_ulong  explicit_s2v_float  explicit_s2v_double)
+add_test(NAME "${TS_NAME}_basic_memory"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" vload_global  vload_local  vload_constant  vload_private  vstore_global  vstore_local  vstore_private  bufferreadwriterect  arrayreadwrite  arraycopy  enqueue_map_buffer  async_copy_global_to_local  async_copy_local_to_global  async_strided_copy_global_to_local  async_strided_copy_local_to_global  kernel_memory_alignment_local  kernel_memory_alignment_global  kernel_memory_alignment_constant  kernel_memory_alignment_private)
+add_test(NAME "${TS_NAME}_basic_image"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" readimage  readimage_int16  readimage_fp32  writeimage  writeimage_int16  writeimage_fp32  image_r8  imagereadwrite  imagereadwrite3d  readimage3d  readimage3d_int16  readimage3d_fp32  imagearraycopy  imagearraycopy3d  imagecopy  imagecopy3d  imagerandomcopy  arrayimagecopy  arrayimagecopy3d  imagenpot  imagedim_pow2  imagedim_non_pow2  image_param  image_multipass_integer_coord  image_multipass_float_coord  enqueue_map_image)
+add_test(NAME "${TS_NAME}_basic_other"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" mri_one  mri_multiple  barrier  createkernelsinprogram  work_item_functions  astype  prefetch  kernel_call_kernel_function  host_numeric_constants  kernel_numeric_constants  kernel_limit_constants  kernel_preprocessor_macros  parameter_types  vector_creation  vec_type_hint  global_work_offsets  get_global_offset  hostptr  hiloeo  if  sizeof  loop  pointer_cast  local_arg_def  local_kernel_def  local_kernel_scope  constant  constant_source)
+
+add_test(NAME "${TS_NAME}_buffers_read_async"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_read_async_int  buffer_read_async_uint  buffer_read_async_long  buffer_read_async_ulong  buffer_read_async_short  buffer_read_async_ushort  buffer_read_async_char  buffer_read_async_uchar  buffer_read_async_float)
+add_test(NAME "${TS_NAME}_buffers_write_async"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers"  buffer_write_async_int  buffer_write_async_uint  buffer_write_async_short  buffer_write_async_ushort  buffer_write_async_char  buffer_write_async_uchar  buffer_write_async_float  buffer_write_async_long  buffer_write_async_ulong)
+
+add_test(NAME "${TS_NAME}_buffers_read_array_barrier"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_read_array_barrier_int  buffer_read_array_barrier_uint  buffer_read_array_barrier_long  buffer_read_array_barrier_ulong  buffer_read_array_barrier_short  buffer_read_array_barrier_ushort  buffer_read_array_barrier_char  buffer_read_array_barrier_uchar  buffer_read_array_barrier_float)
+
+add_test(NAME "${TS_NAME}_buffers_read"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_read_int  buffer_read_uint  buffer_read_long  buffer_read_ulong  buffer_read_short  buffer_read_ushort  buffer_read_float  buffer_read_half  buffer_read_char  buffer_read_uchar  buffer_read_struct  buffer_read_random_size)
+add_test(NAME "${TS_NAME}_buffers_write"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_write_int  buffer_write_uint  buffer_write_short  buffer_write_ushort  buffer_write_char  buffer_write_uchar  buffer_write_float  buffer_write_half  buffer_write_long  buffer_write_ulong  buffer_write_struct)
+add_test(NAME "${TS_NAME}_buffers_fill"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers"   buffer_fill_int  buffer_fill_uint  buffer_fill_short  buffer_fill_ushort  buffer_fill_char  buffer_fill_uchar  buffer_fill_long  buffer_fill_ulong  buffer_fill_float  buffer_fill_struct)
+
+add_test(NAME "${TS_NAME}_buffers_var1"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_copy  buffer_partial_copy  mem_read_write_flags  mem_write_only_flags  mem_read_only_flags  mem_copy_host_flags  mem_alloc_ref_flags  array_info_size)
+add_test(NAME "${TS_NAME}_buffers_var2"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" sub_buffers_read_write  sub_buffers_read_write_dual_devices  sub_buffers_overlapping  buffer_migrate  image_migrate)
+
+add_test(NAME "${TS_NAME}_buffers_map_read"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_map_read_int  buffer_map_read_uint  buffer_map_read_long  buffer_map_read_ulong  buffer_map_read_short  buffer_map_read_ushort  buffer_map_read_char  buffer_map_read_uchar  buffer_map_read_float  buffer_map_read_struct)
+add_test(NAME "${TS_NAME}_buffers_map_write"
+         COMMAND "${TS_BUILDDIR}/buffers/test_buffers" buffer_map_write_int  buffer_map_write_uint  buffer_map_write_long  buffer_map_write_ulong  buffer_map_write_short  buffer_map_write_ushort  buffer_map_write_char  buffer_map_write_uchar  buffer_map_write_float  buffer_map_write_struct)
+
+add_test(NAME "${TS_NAME}_commonfns"
+         COMMAND "${TS_BUILDDIR}/commonfns/test_commonfns" )
+
+add_test(NAME "${TS_NAME}_compiler"
+         COMMAND "${TS_BUILDDIR}/compiler/test_compiler"
+         WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/test_conformance/compiler")
+
+add_test(NAME "${TS_NAME}_computeinfo"
+         COMMAND "${TS_BUILDDIR}/computeinfo/computeinfo" )
+
+add_test(NAME "${TS_NAME}_contractions"
+         COMMAND "${TS_BUILDDIR}/contractions/contractions" )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_uchar uchar_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_char_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_uchar char_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_uchar ushort_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_short_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_uchar short_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_uint_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_uchar uint_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_int_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_uchar int_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_float_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_uchar float_rte_uchar float_rtp_uchar float_rtn_uchar float_rtz_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_double_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_uchar double_rte_uchar double_rtp_uchar double_rtn_uchar double_rtz_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_uchar ulong_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_long_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_uchar long_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_char uchar_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_char_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_char char_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_char ushort_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_short_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_char short_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_uint_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_char uint_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_int_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_char int_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_float_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_char float_rte_char float_rtp_char float_rtn_char float_rtz_char )
+
+add_test(NAME "${TS_NAME}_conversion_double_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_char double_rte_char double_rtp_char double_rtn_char double_rtz_char )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_char ulong_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_long_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_char long_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_ushort uchar_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_char_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_ushort char_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_ushort ushort_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_short_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_ushort short_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_uint_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_ushort uint_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_int_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_ushort int_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_float_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_ushort float_rte_ushort float_rtp_ushort float_rtn_ushort float_rtz_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_double_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_ushort double_rte_ushort double_rtp_ushort double_rtn_ushort double_rtz_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_ushort ulong_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_long_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_ushort long_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_short uchar_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_char_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_short char_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_short ushort_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_short_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_short short_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_uint_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_short uint_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_int_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_short int_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_float_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_short float_rte_short float_rtp_short float_rtn_short float_rtz_short )
+
+add_test(NAME "${TS_NAME}_conversion_double_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_short double_rte_short double_rtp_short double_rtn_short double_rtz_short )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_short ulong_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_long_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_short long_sat_short )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_uint uchar_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_char_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_uint char_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_uint ushort_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_short_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_uint short_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_uint_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_uint uint_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_int_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_uint int_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_float_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_uint float_rte_uint float_rtp_uint float_rtn_uint float_rtz_uint )
+
+add_test(NAME "${TS_NAME}_conversion_double_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_uint double_rte_uint double_rtp_uint double_rtn_uint double_rtz_uint )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_uint ulong_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_long_uint"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_uint long_sat_uint )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_int uchar_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_char_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_int char_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_int ushort_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_short_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_int short_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_uint_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_int uint_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_int_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_int int_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_float_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_int float_rte_int float_rtp_int float_rtn_int float_rtz_int )
+
+add_test(NAME "${TS_NAME}_conversion_double_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_int double_rte_int double_rtp_int double_rtn_int double_rtz_int )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_int ulong_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_long_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_int long_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_float uchar_sat_float uchar_rte_float uchar_sat_rte_float uchar_rtp_float uchar_sat_rtp_float uchar_rtn_float uchar_sat_rtn_float uchar_rtz_float uchar_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_char_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_float char_sat_float char_rte_float char_sat_rte_float char_rtp_float char_sat_rtp_float char_rtn_float char_sat_rtn_float char_rtz_float char_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_float ushort_sat_float ushort_rte_float ushort_sat_rte_float ushort_rtp_float ushort_sat_rtp_float ushort_rtn_float ushort_sat_rtn_float ushort_rtz_float ushort_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_short_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_float short_sat_float short_rte_float short_sat_rte_float short_rtp_float short_sat_rtp_float short_rtn_float short_sat_rtn_float short_rtz_float short_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_uint_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_float uint_sat_float uint_rte_float uint_sat_rte_float uint_rtp_float uint_sat_rtp_float uint_rtn_float uint_sat_rtn_float uint_rtz_float uint_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_int_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_float int_sat_float int_rte_float int_sat_rte_float int_rtp_float int_sat_rtp_float int_rtn_float int_sat_rtn_float int_rtz_float int_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_float_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_float float_rte_float float_rtp_float float_rtn_float float_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_double_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_float double_rte_float double_rtp_float double_rtn_float double_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_float ulong_sat_float ulong_rte_float ulong_sat_rte_float ulong_rtp_float ulong_sat_rtp_float ulong_rtn_float ulong_sat_rtn_float ulong_rtz_float ulong_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_long_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_float long_sat_float long_rte_float long_sat_rte_float long_rtp_float long_sat_rtp_float long_rtn_float long_sat_rtn_float long_rtz_float long_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_double uchar_sat_double uchar_rte_double uchar_sat_rte_double uchar_rtp_double uchar_sat_rtp_double uchar_rtn_double uchar_sat_rtn_double uchar_rtz_double uchar_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_char_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_double char_sat_double char_rte_double char_sat_rte_double char_rtp_double char_sat_rtp_double char_rtn_double char_sat_rtn_double char_rtz_double char_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_double ushort_sat_double ushort_rte_double ushort_sat_rte_double ushort_rtp_double ushort_sat_rtp_double ushort_rtn_double ushort_sat_rtn_double ushort_rtz_double ushort_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_short_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_double short_sat_double short_rte_double short_sat_rte_double short_rtp_double short_sat_rtp_double short_rtn_double short_sat_rtn_double short_rtz_double short_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_uint_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_double uint_sat_double uint_rte_double uint_sat_rte_double uint_rtp_double uint_sat_rtp_double uint_rtn_double uint_sat_rtn_double uint_rtz_double uint_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_int_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_double int_sat_double int_rte_double int_sat_rte_double int_rtp_double int_sat_rtp_double int_rtn_double int_sat_rtn_double int_rtz_double int_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_float_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_double float_rte_double float_rtp_double float_rtn_double float_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_double_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_double double_rte_double double_rtp_double double_rtn_double double_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_double ulong_sat_double ulong_rte_double ulong_sat_rte_double ulong_rtp_double ulong_sat_rtp_double ulong_rtn_double ulong_sat_rtn_double ulong_rtz_double ulong_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_long_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_double long_sat_double long_rte_double long_sat_rte_double long_rtp_double long_sat_rtp_double long_rtn_double long_sat_rtn_double long_rtz_double long_sat_rtz_double )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_ulong uchar_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_char_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_ulong char_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_ulong ushort_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_short_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_ulong short_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_uint_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_ulong uint_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_int_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_ulong int_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_float_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_ulong float_rte_ulong float_rtp_ulong float_rtn_ulong float_rtz_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_double_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_ulong double_rte_ulong double_rtp_ulong double_rtn_ulong double_rtz_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_ulong ulong_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_long_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_ulong long_sat_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_uchar_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uchar_long uchar_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_char_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" char_long char_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_ushort_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ushort_long ushort_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_short_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" short_long short_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_uint_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" uint_long uint_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_int_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" int_long int_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_float_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" float_long float_rte_long float_rtp_long float_rtn_long float_rtz_long )
+
+add_test(NAME "${TS_NAME}_conversion_double_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" double_long double_rte_long double_rtp_long double_rtn_long double_rtz_long )
+
+add_test(NAME "${TS_NAME}_conversion_ulong_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" ulong_long ulong_sat_long )
+
+add_test(NAME "${TS_NAME}_conversion_long_long"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" long_long long_sat_long )
+
+add_test(NAME "${TS_NAME}_device_partition"
+         COMMAND "${TS_BUILDDIR}/device_partition/test_device_partition" )
+
+add_test(NAME "${TS_NAME}_events"
+         COMMAND "${TS_BUILDDIR}/events/test_events" event_get_execute_status  event_get_write_array_status  event_get_read_array_status  event_get_info  event_wait_for_execute  event_wait_for_array  event_flush  event_finish_execute  event_finish_array  event_release_before_done  event_enqueue_marker  event_enqueue_marker_with_event_list  event_enqueue_barrier_with_event_list)
+add_test(NAME "${TS_NAME}_events_ooo"
+         COMMAND "${TS_BUILDDIR}/events/test_events" out_of_order_event_waitlist_single_queue  out_of_order_event_waitlist_multi_queue  out_of_order_event_waitlist_multi_queue_multi_device  out_of_order_event_enqueue_wait_for_events_single_queue  out_of_order_event_enqueue_wait_for_events_multi_queue  out_of_order_event_enqueue_wait_for_events_multi_queue_multi_device  out_of_order_event_enqueue_marker_single_queue  out_of_order_event_enqueue_marker_multi_queue  out_of_order_event_enqueu [...]
+add_test(NAME "${TS_NAME}_events_other"
+         COMMAND "${TS_BUILDDIR}/events/test_events" waitlists  test_userevents  callbacks  callbacks_simultaneous  userevents_multithreaded)
+
+add_test(NAME "${TS_NAME}_geometrics"
+         COMMAND "${TS_BUILDDIR}/geometrics/test_geometrics" )
+
+add_test(NAME "${TS_NAME}_gl"
+         COMMAND "${TS_BUILDDIR}/gl/test_gl" )
+
+add_test(NAME "${TS_NAME}_half"
+         COMMAND "${TS_BUILDDIR}/half/Test_half" )
+
+add_test(NAME "${TS_NAME}_cl_gl_h"
+         COMMAND "${TS_BUILDDIR}/headers/test_cl_gl_h" )
+
+add_test(NAME "${TS_NAME}_cl_gl_h_c99"
+         COMMAND "${TS_BUILDDIR}/headers/test_cl_gl_h_c99" )
+
+add_test(NAME "${TS_NAME}_cl_h"
+         COMMAND "${TS_BUILDDIR}/headers/test_cl_h" )
+
+add_test(NAME "${TS_NAME}_cl_h_c99"
+         COMMAND "${TS_BUILDDIR}/headers/test_cl_h_c99" )
+
+add_test(NAME "${TS_NAME}_cl_platform_h"
+         COMMAND "${TS_BUILDDIR}/headers/test_cl_platform_h" )
+
+add_test(NAME "${TS_NAME}_cl_platform_h_c99"
+         COMMAND "${TS_BUILDDIR}/headers/test_cl_platform_h_c99" )
+
+add_test(NAME "${TS_NAME}_headers"
+         COMMAND "${TS_BUILDDIR}/headers/test_headers" )
+
+add_test(NAME "${TS_NAME}_opencl_h"
+         COMMAND "${TS_BUILDDIR}/headers/test_opencl_h" )
+
+add_test(NAME "${TS_NAME}_opencl_h_c99"
+         COMMAND "${TS_BUILDDIR}/headers/test_opencl_h_c99" )
+
+#***************************************************************************
+
+add_test(NAME "${TS_NAME}_images_cl_copy_images"
+         COMMAND "${TS_BUILDDIR}/images/clCopyImage/test_cl_copy_images" )
+add_test(NAME "${TS_NAME}_images_cl_copy_images_max"
+         COMMAND "${TS_BUILDDIR}/images/clCopyImage/test_cl_copy_images" max_images)
+add_test(NAME "${TS_NAME}_images_cl_copy_images_min"
+         COMMAND "${TS_BUILDDIR}/images/clCopyImage/test_cl_copy_images" small_images)
+add_test(NAME "${TS_NAME}_images_cl_copy_images_rounding"
+         COMMAND "${TS_BUILDDIR}/images/clCopyImage/test_cl_copy_images" rounding)
+
+add_test(NAME "${TS_NAME}_images_cl_fill_images"
+         COMMAND "${TS_BUILDDIR}/images/clFillImage/test_cl_fill_images" )
+add_test(NAME "${TS_NAME}_images_cl_fill_images_min"
+         COMMAND "${TS_BUILDDIR}/images/clFillImage/test_cl_fill_images" small_images)
+add_test(NAME "${TS_NAME}_images_cl_fill_images_max"
+         COMMAND "${TS_BUILDDIR}/images/clFillImage/test_cl_fill_images" max_images)
+add_test(NAME "${TS_NAME}_images_cl_fill_images_pitch"
+         COMMAND "${TS_BUILDDIR}/images/clFillImage/test_cl_fill_images" use_pitches)
+
+add_test(NAME "${TS_NAME}_images_cl_get_info"
+         COMMAND "${TS_BUILDDIR}/images/clGetInfo/test_cl_get_info" )
+add_test(NAME "${TS_NAME}_images_cl_get_info_min"
+         COMMAND "${TS_BUILDDIR}/images/clGetInfo/test_cl_get_info" small_images)
+add_test(NAME "${TS_NAME}_images_cl_get_info_max"
+         COMMAND "${TS_BUILDDIR}/images/clGetInfo/test_cl_get_info" max_images)
+add_test(NAME "${TS_NAME}_images_cl_get_info_random"
+         COMMAND "${TS_BUILDDIR}/images/clGetInfo/test_cl_get_info" randomize)
+
+add_test(NAME "${TS_NAME}_images_cl_read_write_images"
+         COMMAND "${TS_BUILDDIR}/images/clReadWriteImage/test_cl_read_write_images" )
+add_test(NAME "${TS_NAME}_images_cl_read_write_images_min"
+         COMMAND "${TS_BUILDDIR}/images/clReadWriteImage/test_cl_read_write_images" small_images)
+add_test(NAME "${TS_NAME}_images_cl_read_write_images_max"
+         COMMAND "${TS_BUILDDIR}/images/clReadWriteImage/test_cl_read_write_images" max_images)
+add_test(NAME "${TS_NAME}_images_cl_read_write_images_round"
+         COMMAND "${TS_BUILDDIR}/images/clReadWriteImage/test_cl_read_write_images" rounding)
+add_test(NAME "${TS_NAME}_images_cl_read_write_images_pitch"
+         COMMAND "${TS_BUILDDIR}/images/clReadWriteImage/test_cl_read_write_images" use_pitches)
+
+add_test(NAME "${TS_NAME}_images_kernel_image_methods_1D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 1D)
+add_test(NAME "${TS_NAME}_images_kernel_image_methods_2D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 2D)
+add_test(NAME "${TS_NAME}_images_kernel_image_methods_3D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 3D)
+add_test(NAME "${TS_NAME}_images_kernel_image_methods_1Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 1Darray)
+add_test(NAME "${TS_NAME}_images_kernel_image_methods_2Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 2Darray)
+
+add_test(NAME "${TS_NAME}_images_image_streams_1D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "1D" )
+add_test(NAME "${TS_NAME}_images_image_streams_2D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "2D" )
+add_test(NAME "${TS_NAME}_images_image_streams_3D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "3D")
+add_test(NAME "${TS_NAME}_images_image_streams_1Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "1Darray")
+add_test(NAME "${TS_NAME}_images_image_streams_2Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "2Darray")
+
+add_test(NAME "${TS_NAME}_images_image_streams_1D_pitch"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "1D"  use_pitches)
+add_test(NAME "${TS_NAME}_images_image_streams_2D_pitch"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "2D"  use_pitches)
+add_test(NAME "${TS_NAME}_images_image_streams_3D_pitch"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "3D" use_pitches)
+add_test(NAME "${TS_NAME}_images_image_streams_1Darray_pitch"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "1Darray" use_pitches)
+add_test(NAME "${TS_NAME}_images_image_streams_2Darray_pitch"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" "2Darray" use_pitches)
+
+add_test(NAME "${TS_NAME}_images_samplerless_reads_1D"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 1D)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_2D"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 2D)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_3D"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 3D)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_1Darray"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 1Darray)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_2Darray"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 2Darray)
+
+add_test(NAME "${TS_NAME}_images_samplerless_reads_1D_pitch"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 1D use_pitches)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_2D_pitch"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 2D use_pitches)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_3D_pitch"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 3D use_pitches)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_1Darray_pitch"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 1Darray use_pitches)
+add_test(NAME "${TS_NAME}_images_samplerless_reads_2Darray_pitch"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 2Darray use_pitches)
+
+#***************************************************************************
+
+add_test(NAME "${TS_NAME}_integer_ops_1"
+         COMMAND "${TS_BUILDDIR}/integer_ops/test_integer_ops" integer_clz  integer_hadd  integer_rhadd  integer_mul_hi  integer_rotate  integer_clamp  integer_mad_sat  integer_mad_hi  integer_min  integer_max  integer_upsample  integer_abs  integer_abs_diff  integer_add_sat  integer_sub_sat  integer_addAssign  integer_subtractAssign  integer_multiplyAssign  integer_divideAssign  integer_moduloAssign  integer_andAssign  integer_orAssign  integer_exclusiveOrAssign)
+
+add_test(NAME "${TS_NAME}_integer_ops_2"
+         COMMAND "${TS_BUILDDIR}/integer_ops/test_integer_ops" unary_ops_increment  unary_ops_decrement  unary_ops_full  integer_mul24  integer_mad24  long_math  long_logic  long_shift  long_compare  ulong_math  ulong_logic  ulong_shift  ulong_compare  int_math  int_logic  int_shift  int_compare  uint_math  uint_logic  uint_shift  uint_compare  short_math  short_logic  short_shift  short_compare  ushort_math  ushort_logic  ushort_shift  ushort_compare  char_math  char_logic  char_shift   [...]
+
+add_test(NAME "${TS_NAME}_integer_ops_3"
+         COMMAND "${TS_BUILDDIR}/integer_ops/test_integer_ops" quick_long_math  quick_long_logic  quick_long_shift  quick_long_compare  quick_ulong_math  quick_ulong_logic  quick_ulong_shift  quick_ulong_compare  quick_int_math  quick_int_logic  quick_int_shift  quick_int_compare  quick_uint_math  quick_uint_logic  quick_uint_shift  quick_uint_compare  quick_short_math  quick_short_logic  quick_short_shift  quick_short_compare  quick_ushort_math  quick_ushort_logic  quick_ushort_shift  q [...]
+
+#***************************************************************************
+
+add_test(NAME "${TS_NAME}_math_add"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" add)
+
+add_test(NAME "${TS_NAME}_math_assignment"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" assignment)
+
+add_test(NAME "${TS_NAME}_math_cbrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" cbrt)
+
+add_test(NAME "${TS_NAME}_math_ceil"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" ceil)
+
+add_test(NAME "${TS_NAME}_math_copysign"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" copysign)
+
+add_test(NAME "${TS_NAME}_math_cos"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" cos)
+
+add_test(NAME "${TS_NAME}_math_cosh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" cosh)
+
+add_test(NAME "${TS_NAME}_math_cospi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" cospi)
+
+add_test(NAME "${TS_NAME}_math_divide"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" divide)
+
+add_test(NAME "${TS_NAME}_math_exp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" exp)
+
+add_test(NAME "${TS_NAME}_math_exp10"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" exp10)
+
+add_test(NAME "${TS_NAME}_math_exp2"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" exp2)
+
+add_test(NAME "${TS_NAME}_math_expm1"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" expm1)
+
+add_test(NAME "${TS_NAME}_math_fabs"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fabs)
+
+add_test(NAME "${TS_NAME}_math_fdim"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fdim)
+
+add_test(NAME "${TS_NAME}_math_floor"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" floor)
+
+add_test(NAME "${TS_NAME}_math_fma"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fma)
+
+add_test(NAME "${TS_NAME}_math_fmax"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fmax)
+
+add_test(NAME "${TS_NAME}_math_fmin"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fmin)
+
+add_test(NAME "${TS_NAME}_math_fmod"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fmod)
+
+add_test(NAME "${TS_NAME}_math_fract"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" fract)
+
+add_test(NAME "${TS_NAME}_math_frexp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" frexp)
+
+add_test(NAME "${TS_NAME}_math_hypot"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" hypot)
+
+add_test(NAME "${TS_NAME}_math_ilogb"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" ilogb)
+
+add_test(NAME "${TS_NAME}_math_isequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isequal)
+
+add_test(NAME "${TS_NAME}_math_isfinite"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isfinite)
+
+add_test(NAME "${TS_NAME}_math_isgreater"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isgreater)
+
+add_test(NAME "${TS_NAME}_math_isgreaterequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isgreaterequal)
+
+add_test(NAME "${TS_NAME}_math_isinf"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isinf)
+
+add_test(NAME "${TS_NAME}_math_isless"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isless)
+
+add_test(NAME "${TS_NAME}_math_islessequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" islessequal)
+
+add_test(NAME "${TS_NAME}_math_islessgreater"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" islessgreater)
+
+add_test(NAME "${TS_NAME}_math_isnan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isnan)
+
+add_test(NAME "${TS_NAME}_math_isnormal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isnormal)
+
+add_test(NAME "${TS_NAME}_math_isnotequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isnotequal)
+
+add_test(NAME "${TS_NAME}_math_isordered"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isordered)
+
+add_test(NAME "${TS_NAME}_math_isunordered"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" isunordered)
+
+add_test(NAME "${TS_NAME}_math_ldexp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" ldexp)
+
+add_test(NAME "${TS_NAME}_math_log"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" log)
+
+add_test(NAME "${TS_NAME}_math_log10"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" log10)
+
+add_test(NAME "${TS_NAME}_math_log1p"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" log1p)
+
+add_test(NAME "${TS_NAME}_math_log2"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" log2)
+
+add_test(NAME "${TS_NAME}_math_logb"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" logb)
+
+add_test(NAME "${TS_NAME}_math_mad"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" mad)
+
+add_test(NAME "${TS_NAME}_math_maxmag"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" maxmag)
+
+add_test(NAME "${TS_NAME}_math_minmag"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" minmag)
+
+add_test(NAME "${TS_NAME}_math_modf"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" modf)
+
+add_test(NAME "${TS_NAME}_math_multiply"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" multiply)
+
+add_test(NAME "${TS_NAME}_math_nan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" nan)
+
+add_test(NAME "${TS_NAME}_math_nextafter"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" nextafter)
+
+add_test(NAME "${TS_NAME}_math_not"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" not)
+
+add_test(NAME "${TS_NAME}_math_pow"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" pow)
+
+add_test(NAME "${TS_NAME}_math_pown"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" pown)
+
+add_test(NAME "${TS_NAME}_math_powr"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" powr)
+
+add_test(NAME "${TS_NAME}_math_remainder"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" remainder)
+
+add_test(NAME "${TS_NAME}_math_remquo"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" remquo)
+
+add_test(NAME "${TS_NAME}_math_rint"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" rint)
+
+add_test(NAME "${TS_NAME}_math_rootn"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" rootn)
+
+add_test(NAME "${TS_NAME}_math_round"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" round)
+
+add_test(NAME "${TS_NAME}_math_rsqrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" rsqrt)
+
+add_test(NAME "${TS_NAME}_math_signbit"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" signbit)
+
+add_test(NAME "${TS_NAME}_math_sin"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" sin)
+
+add_test(NAME "${TS_NAME}_math_sincos"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" sincos)
+
+add_test(NAME "${TS_NAME}_math_sinh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" sinh)
+
+add_test(NAME "${TS_NAME}_math_sinpi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" sinpi)
+
+add_test(NAME "${TS_NAME}_math_sqrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" sqrt)
+
+add_test(NAME "${TS_NAME}_math_subtract"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" subtract)
+
+add_test(NAME "${TS_NAME}_math_tan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" tan)
+
+add_test(NAME "${TS_NAME}_math_tanh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" tanh)
+
+add_test(NAME "${TS_NAME}_math_tanpi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" tanpi)
+
+add_test(NAME "${TS_NAME}_math_trunc"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce" trunc)
+
+#***************************************************************************
+
+add_test(NAME "${TS_NAME}_mem_host_flags"
+         COMMAND "${TS_BUILDDIR}/mem_host_flags/test_mem_host_flags" )
+
+add_test(NAME "${TS_NAME}_multiples"
+         COMMAND "${TS_BUILDDIR}/multiple_device_context/test_multiples" )
+
+add_test(NAME "${TS_NAME}_printf"
+         COMMAND "${TS_BUILDDIR}/printf/test_printf"
+   WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/test_conformance/printf" )
+
+add_test(NAME "${TS_NAME}_profiling"
+         COMMAND "${TS_BUILDDIR}/profiling/test_profiling" )
+
+add_test(NAME "${TS_NAME}_relationals"
+         COMMAND "${TS_BUILDDIR}/relationals/test_relationals" relational_any  relational_all  relational_bitselect  relational_select_signed  relational_select_unsigned  relational_isequal  relational_isnotequal  relational_isgreater  relational_isgreaterequal  relational_isless  relational_islessequal  relational_islessgreater)
+add_test(NAME "${TS_NAME}_relationals_shuffle"
+         COMMAND "${TS_BUILDDIR}/relationals/test_relationals" shuffle_copy  shuffle_function_call  shuffle_array_cast  shuffle_built_in  shuffle_built_in_dual_input)
+
+add_test(NAME "${TS_NAME}_select_uchar_uchar"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_uchar_uchar)
+add_test(NAME "${TS_NAME}_select_uchar_char"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_uchar_char)
+add_test(NAME "${TS_NAME}_select_char_uchar"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_char_uchar)
+add_test(NAME "${TS_NAME}_select_char_char"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_char_char)
+
+add_test(NAME "${TS_NAME}_select_ushort_ushort"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_ushort_ushort)
+add_test(NAME "${TS_NAME}_select_ushort_short"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_ushort_short)
+add_test(NAME "${TS_NAME}_select_short_ushort"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_short_ushort)
+add_test(NAME "${TS_NAME}_select_short_short"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_short_short)
+
+add_test(NAME "${TS_NAME}_select_uint_uint"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_uint_uint)
+add_test(NAME "${TS_NAME}_select_uint_int"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_uint_int)
+add_test(NAME "${TS_NAME}_select_int_uint"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_int_uint)
+add_test(NAME "${TS_NAME}_select_int_int"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_int_int)
+
+add_test(NAME "${TS_NAME}_select_ulong_ulong"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_ulong_ulong)
+add_test(NAME "${TS_NAME}_select_ulong_long"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_ulong_long)
+add_test(NAME "${TS_NAME}_select_long_ulong"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_long_ulong)
+add_test(NAME "${TS_NAME}_select_long_long"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_long_long)
+
+add_test(NAME "${TS_NAME}_thread_dimensions_quick"
+         COMMAND "${TS_BUILDDIR}/thread_dimensions/test_thread_dimensions" quick_1d_explicit_local  quick_2d_explicit_local  quick_3d_explicit_local  quick_1d_implicit_local  quick_2d_implicit_local  quick_3d_implicit_local)
+add_test(NAME "${TS_NAME}_thread_dimensions_full"
+         COMMAND "${TS_BUILDDIR}/thread_dimensions/test_thread_dimensions" full_1d_explicit_local  full_2d_explicit_local  full_3d_explicit_local  full_1d_implicit_local  full_2d_implicit_local  full_3d_implicit_local)
+
+add_test(NAME "${TS_NAME}_vecalign"
+         COMMAND "${TS_BUILDDIR}/vec_align/test_vecalign" )
+
+add_test(NAME "${TS_NAME}_vecstep"
+         COMMAND "${TS_BUILDDIR}/vec_step/test_vecstep" )
+
+set_tests_properties(
+    "${TS_NAME}_allocations_buffer"
+    "${TS_NAME}_allocations_image"
+    "${TS_NAME}_api"
+    "${TS_NAME}_atomics"
+    "${TS_NAME}_basic_math"
+    "${TS_NAME}_basic_s2v"
+    "${TS_NAME}_basic_memory"
+    "${TS_NAME}_basic_image"
+    "${TS_NAME}_basic_other"
+    "${TS_NAME}_buffers_read_async"
+    "${TS_NAME}_buffers_write_async"
+    "${TS_NAME}_buffers_read_array_barrier"
+    "${TS_NAME}_buffers_read"
+    "${TS_NAME}_buffers_write"
+    "${TS_NAME}_buffers_fill"
+    "${TS_NAME}_buffers_var1"
+    "${TS_NAME}_buffers_var2"
+    "${TS_NAME}_buffers_map_read"
+    "${TS_NAME}_buffers_map_write"
+    "${TS_NAME}_commonfns"
+    "${TS_NAME}_compiler"
+    "${TS_NAME}_computeinfo"
+    "${TS_NAME}_contractions"
+    "${TS_NAME}_conversion_uchar_uchar"
+    "${TS_NAME}_conversion_char_uchar"
+    "${TS_NAME}_conversion_ushort_uchar"
+    "${TS_NAME}_conversion_short_uchar"
+    "${TS_NAME}_conversion_uint_uchar"
+    "${TS_NAME}_conversion_int_uchar"
+    "${TS_NAME}_conversion_float_uchar"
+    "${TS_NAME}_conversion_double_uchar"
+    "${TS_NAME}_conversion_ulong_uchar"
+    "${TS_NAME}_conversion_long_uchar"
+    "${TS_NAME}_conversion_uchar_char"
+    "${TS_NAME}_conversion_char_char"
+    "${TS_NAME}_conversion_ushort_char"
+    "${TS_NAME}_conversion_short_char"
+    "${TS_NAME}_conversion_uint_char"
+    "${TS_NAME}_conversion_int_char"
+    "${TS_NAME}_conversion_float_char"
+    "${TS_NAME}_conversion_double_char"
+    "${TS_NAME}_conversion_ulong_char"
+    "${TS_NAME}_conversion_long_char"
+    "${TS_NAME}_conversion_uchar_ushort"
+    "${TS_NAME}_conversion_char_ushort"
+    "${TS_NAME}_conversion_ushort_ushort"
+    "${TS_NAME}_conversion_short_ushort"
+    "${TS_NAME}_conversion_uint_ushort"
+    "${TS_NAME}_conversion_int_ushort"
+    "${TS_NAME}_conversion_float_ushort"
+    "${TS_NAME}_conversion_double_ushort"
+    "${TS_NAME}_conversion_ulong_ushort"
+    "${TS_NAME}_conversion_long_ushort"
+    "${TS_NAME}_conversion_uchar_short"
+    "${TS_NAME}_conversion_char_short"
+    "${TS_NAME}_conversion_ushort_short"
+    "${TS_NAME}_conversion_short_short"
+    "${TS_NAME}_conversion_uint_short"
+    "${TS_NAME}_conversion_int_short"
+    "${TS_NAME}_conversion_float_short"
+    "${TS_NAME}_conversion_double_short"
+    "${TS_NAME}_conversion_ulong_short"
+    "${TS_NAME}_conversion_long_short"
+    "${TS_NAME}_conversion_uchar_uint"
+    "${TS_NAME}_conversion_char_uint"
+    "${TS_NAME}_conversion_ushort_uint"
+    "${TS_NAME}_conversion_short_uint"
+    "${TS_NAME}_conversion_uint_uint"
+    "${TS_NAME}_conversion_int_uint"
+    "${TS_NAME}_conversion_float_uint"
+    "${TS_NAME}_conversion_double_uint"
+    "${TS_NAME}_conversion_ulong_uint"
+    "${TS_NAME}_conversion_long_uint"
+    "${TS_NAME}_conversion_uchar_int"
+    "${TS_NAME}_conversion_char_int"
+    "${TS_NAME}_conversion_ushort_int"
+    "${TS_NAME}_conversion_short_int"
+    "${TS_NAME}_conversion_uint_int"
+    "${TS_NAME}_conversion_int_int"
+    "${TS_NAME}_conversion_float_int"
+    "${TS_NAME}_conversion_double_int"
+    "${TS_NAME}_conversion_ulong_int"
+    "${TS_NAME}_conversion_long_int"
+    "${TS_NAME}_conversion_uchar_float"
+    "${TS_NAME}_conversion_char_float"
+    "${TS_NAME}_conversion_ushort_float"
+    "${TS_NAME}_conversion_short_float"
+    "${TS_NAME}_conversion_uint_float"
+    "${TS_NAME}_conversion_int_float"
+    "${TS_NAME}_conversion_float_float"
+    "${TS_NAME}_conversion_double_float"
+    "${TS_NAME}_conversion_ulong_float"
+    "${TS_NAME}_conversion_long_float"
+    "${TS_NAME}_conversion_uchar_double"
+    "${TS_NAME}_conversion_char_double"
+    "${TS_NAME}_conversion_ushort_double"
+    "${TS_NAME}_conversion_short_double"
+    "${TS_NAME}_conversion_uint_double"
+    "${TS_NAME}_conversion_int_double"
+    "${TS_NAME}_conversion_float_double"
+    "${TS_NAME}_conversion_double_double"
+    "${TS_NAME}_conversion_ulong_double"
+    "${TS_NAME}_conversion_long_double"
+    "${TS_NAME}_conversion_uchar_ulong"
+    "${TS_NAME}_conversion_char_ulong"
+    "${TS_NAME}_conversion_ushort_ulong"
+    "${TS_NAME}_conversion_short_ulong"
+    "${TS_NAME}_conversion_uint_ulong"
+    "${TS_NAME}_conversion_int_ulong"
+    "${TS_NAME}_conversion_float_ulong"
+    "${TS_NAME}_conversion_double_ulong"
+    "${TS_NAME}_conversion_ulong_ulong"
+    "${TS_NAME}_conversion_long_ulong"
+    "${TS_NAME}_conversion_uchar_long"
+    "${TS_NAME}_conversion_char_long"
+    "${TS_NAME}_conversion_ushort_long"
+    "${TS_NAME}_conversion_short_long"
+    "${TS_NAME}_conversion_uint_long"
+    "${TS_NAME}_conversion_int_long"
+    "${TS_NAME}_conversion_float_long"
+    "${TS_NAME}_conversion_double_long"
+    "${TS_NAME}_conversion_ulong_long"
+    "${TS_NAME}_conversion_long_long"
+    "${TS_NAME}_device_partition"
+    "${TS_NAME}_events"
+    "${TS_NAME}_events_ooo"
+    "${TS_NAME}_events_other"
+    "${TS_NAME}_geometrics"
+    "${TS_NAME}_gl"
+    "${TS_NAME}_half"
+    "${TS_NAME}_cl_gl_h"
+    "${TS_NAME}_cl_gl_h_c99"
+    "${TS_NAME}_cl_h"
+    "${TS_NAME}_cl_h_c99"
+    "${TS_NAME}_cl_platform_h"
+    "${TS_NAME}_cl_platform_h_c99"
+    "${TS_NAME}_half"
+    "${TS_NAME}_headers"
+    "${TS_NAME}_opencl_h"
+    "${TS_NAME}_opencl_h_c99"
+    "${TS_NAME}_images_cl_copy_images"
+    "${TS_NAME}_images_cl_copy_images_max"
+    "${TS_NAME}_images_cl_copy_images_min"
+    "${TS_NAME}_images_cl_copy_images_rounding"
+    "${TS_NAME}_images_cl_fill_images"
+    "${TS_NAME}_images_cl_fill_images_min"
+    "${TS_NAME}_images_cl_fill_images_max"
+    "${TS_NAME}_images_cl_fill_images_pitch"
+    "${TS_NAME}_images_cl_get_info"
+    "${TS_NAME}_images_cl_get_info_min"
+    "${TS_NAME}_images_cl_get_info_max"
+    "${TS_NAME}_images_cl_get_info_random"
+    "${TS_NAME}_images_cl_read_write_images"
+    "${TS_NAME}_images_cl_read_write_images_min"
+    "${TS_NAME}_images_cl_read_write_images_max"
+    "${TS_NAME}_images_cl_read_write_images_round"
+    "${TS_NAME}_images_cl_read_write_images_pitch"
+    "${TS_NAME}_images_kernel_image_methods_1D"
+    "${TS_NAME}_images_kernel_image_methods_2D"
+    "${TS_NAME}_images_kernel_image_methods_3D"
+    "${TS_NAME}_images_kernel_image_methods_1Darray"
+    "${TS_NAME}_images_kernel_image_methods_2Darray"
+    "${TS_NAME}_images_image_streams_1D"
+    "${TS_NAME}_images_image_streams_2D"
+    "${TS_NAME}_images_image_streams_3D"
+    "${TS_NAME}_images_image_streams_1Darray"
+    "${TS_NAME}_images_image_streams_2Darray"
+    "${TS_NAME}_images_image_streams_1D_pitch"
+    "${TS_NAME}_images_image_streams_2D_pitch"
+    "${TS_NAME}_images_image_streams_3D_pitch"
+    "${TS_NAME}_images_image_streams_1Darray_pitch"
+    "${TS_NAME}_images_image_streams_2Darray_pitch"
+    "${TS_NAME}_images_samplerless_reads_1D"
+    "${TS_NAME}_images_samplerless_reads_2D"
+    "${TS_NAME}_images_samplerless_reads_3D"
+    "${TS_NAME}_images_samplerless_reads_1Darray"
+    "${TS_NAME}_images_samplerless_reads_2Darray"
+    "${TS_NAME}_images_samplerless_reads_1D_pitch"
+    "${TS_NAME}_images_samplerless_reads_2D_pitch"
+    "${TS_NAME}_images_samplerless_reads_3D_pitch"
+    "${TS_NAME}_images_samplerless_reads_1Darray_pitch"
+    "${TS_NAME}_images_samplerless_reads_2Darray_pitch"
+    "${TS_NAME}_integer_ops_1"
+    "${TS_NAME}_integer_ops_2"
+    "${TS_NAME}_integer_ops_3"
+
+    "${TS_NAME}_math_add"
+    "${TS_NAME}_math_assignment"
+    "${TS_NAME}_math_cbrt"
+    "${TS_NAME}_math_ceil"
+    "${TS_NAME}_math_copysign"
+    "${TS_NAME}_math_cos"
+    "${TS_NAME}_math_cosh"
+    "${TS_NAME}_math_cospi"
+    "${TS_NAME}_math_divide"
+    "${TS_NAME}_math_exp"
+    "${TS_NAME}_math_exp10"
+    "${TS_NAME}_math_exp2"
+    "${TS_NAME}_math_expm1"
+    "${TS_NAME}_math_fabs"
+    "${TS_NAME}_math_fdim"
+    "${TS_NAME}_math_floor"
+    "${TS_NAME}_math_fma"
+    "${TS_NAME}_math_fmax"
+    "${TS_NAME}_math_fmin"
+    "${TS_NAME}_math_fmod"
+    "${TS_NAME}_math_fract"
+    "${TS_NAME}_math_frexp"
+    "${TS_NAME}_math_hypot"
+    "${TS_NAME}_math_ilogb"
+    "${TS_NAME}_math_isequal"
+    "${TS_NAME}_math_isfinite"
+    "${TS_NAME}_math_isgreater"
+    "${TS_NAME}_math_isgreaterequal"
+    "${TS_NAME}_math_isinf"
+    "${TS_NAME}_math_isless"
+    "${TS_NAME}_math_islessequal"
+    "${TS_NAME}_math_islessgreater"
+    "${TS_NAME}_math_isnan"
+    "${TS_NAME}_math_isnormal"
+    "${TS_NAME}_math_isnotequal"
+    "${TS_NAME}_math_isordered"
+    "${TS_NAME}_math_isunordered"
+    "${TS_NAME}_math_ldexp"
+    "${TS_NAME}_math_log"
+    "${TS_NAME}_math_log10"
+    "${TS_NAME}_math_log1p"
+    "${TS_NAME}_math_log2"
+    "${TS_NAME}_math_logb"
+    "${TS_NAME}_math_mad"
+    "${TS_NAME}_math_maxmag"
+    "${TS_NAME}_math_minmag"
+    "${TS_NAME}_math_modf"
+    "${TS_NAME}_math_multiply"
+    "${TS_NAME}_math_nan"
+    "${TS_NAME}_math_nextafter"
+    "${TS_NAME}_math_not"
+    "${TS_NAME}_math_pow"
+    "${TS_NAME}_math_pown"
+    "${TS_NAME}_math_pow"
+    "${TS_NAME}_math_remquo"
+    "${TS_NAME}_math_remainder"
+    "${TS_NAME}_math_rint"
+    "${TS_NAME}_math_rootn"
+    "${TS_NAME}_math_round"
+    "${TS_NAME}_math_rsqrt"
+    "${TS_NAME}_math_signbit"
+    "${TS_NAME}_math_sin"
+    "${TS_NAME}_math_sincos"
+    "${TS_NAME}_math_sinh"
+    "${TS_NAME}_math_sinpi"
+    "${TS_NAME}_math_sqrt"
+    "${TS_NAME}_math_subtract"
+    "${TS_NAME}_math_tan"
+    "${TS_NAME}_math_tanh"
+    "${TS_NAME}_math_tanpi"
+    "${TS_NAME}_math_trunc"
+
+    "${TS_NAME}_mem_host_flags"
+    "${TS_NAME}_multiples"
+    "${TS_NAME}_printf"
+    "${TS_NAME}_profiling"
+    "${TS_NAME}_relationals"
+    "${TS_NAME}_relationals_shuffle"
+    "${TS_NAME}_select_uchar_uchar"
+    "${TS_NAME}_select_uchar_char"
+    "${TS_NAME}_select_char_uchar"
+    "${TS_NAME}_select_char_char"
+    "${TS_NAME}_select_ushort_ushort"
+    "${TS_NAME}_select_ushort_short"
+    "${TS_NAME}_select_short_ushort"
+    "${TS_NAME}_select_short_short"
+    "${TS_NAME}_select_uint_uint"
+    "${TS_NAME}_select_uint_int"
+    "${TS_NAME}_select_int_uint"
+    "${TS_NAME}_select_int_int"
+    "${TS_NAME}_select_ulong_ulong"
+    "${TS_NAME}_select_ulong_long"
+    "${TS_NAME}_select_long_ulong"
+    "${TS_NAME}_select_long_long"
+    "${TS_NAME}_thread_dimensions_quick"
+    "${TS_NAME}_thread_dimensions_full"
+    "${TS_NAME}_vecalign"
+    "${TS_NAME}_vecstep"
+  PROPERTIES
+    LABELS "conformance_suite_full"
+    ENVIRONMENT "POCL_MEMORY_LIMIT=1")
+
+#***************************************************************************
+
+# conversions, for short version of conformance suite.
+# disables multithreading and only tests vectors of size 4
+
+add_test(NAME "${TS_NAME}_conversion_mini_uint_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 uint_uchar uint_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_mini_short_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 short_char short_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_mini_uchar_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 uchar_ushort uchar_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_mini_char_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 char_int char_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_mini_float_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 float_char float_rte_char float_rtp_char float_rtn_char float_rtz_char )
+
+add_test(NAME "${TS_NAME}_conversion_mini_float_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 float_ushort float_rte_ushort float_rtp_ushort float_rtn_ushort float_rtz_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_mini_int_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 int_float int_sat_float int_rte_float int_sat_rte_float int_rtp_float int_sat_rtp_float int_rtn_float int_sat_rtn_float int_rtz_float int_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_mini_double_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 double_short double_rte_short double_rtp_short double_rtn_short double_rtz_short )
+
+add_test(NAME "${TS_NAME}_conversion_mini_double_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 double_ulong double_rte_ulong double_rtp_ulong double_rtn_ulong double_rtz_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_mini_long_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -4 long_double long_sat_double long_rte_double long_sat_rte_double long_rtp_double long_sat_rtp_double long_rtn_double long_sat_rtn_double long_rtz_double long_sat_rtz_double )
+
+#***************************************************************************
+
+# tests math on vectors of 4 floats.
+# This is usually the fastest version on machines with AVX(2)
+
+add_test(NAME "${TS_NAME}_math_mini_add"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d add)
+
+add_test(NAME "${TS_NAME}_math_mini_assignment"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d assignment)
+
+add_test(NAME "${TS_NAME}_math_mini_cbrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d cbrt)
+
+add_test(NAME "${TS_NAME}_math_mini_ceil"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d ceil)
+
+add_test(NAME "${TS_NAME}_math_mini_copysign"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d copysign)
+
+add_test(NAME "${TS_NAME}_math_mini_cos"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d cos)
+
+add_test(NAME "${TS_NAME}_math_mini_cosh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d cosh)
+
+add_test(NAME "${TS_NAME}_math_mini_cospi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d cospi)
+
+add_test(NAME "${TS_NAME}_math_mini_divide"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d divide)
+
+add_test(NAME "${TS_NAME}_math_mini_exp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d exp)
+
+add_test(NAME "${TS_NAME}_math_mini_expm1"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d expm1)
+
+add_test(NAME "${TS_NAME}_math_mini_fabs"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d fabs)
+
+add_test(NAME "${TS_NAME}_math_mini_fdim"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d fdim)
+
+add_test(NAME "${TS_NAME}_math_mini_floor"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d floor)
+
+add_test(NAME "${TS_NAME}_math_mini_fma"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d fma)
+
+add_test(NAME "${TS_NAME}_math_mini_fmax"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d fmax)
+
+add_test(NAME "${TS_NAME}_math_mini_fmin"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d fmin)
+
+add_test(NAME "${TS_NAME}_math_mini_fract"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d fract)
+
+add_test(NAME "${TS_NAME}_math_mini_frexp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d frexp)
+
+add_test(NAME "${TS_NAME}_math_mini_hypot"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d hypot)
+
+add_test(NAME "${TS_NAME}_math_mini_ilogb"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d ilogb)
+
+add_test(NAME "${TS_NAME}_math_mini_isequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isequal)
+
+add_test(NAME "${TS_NAME}_math_mini_isfinite"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isfinite)
+
+add_test(NAME "${TS_NAME}_math_mini_isgreater"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isgreater)
+
+add_test(NAME "${TS_NAME}_math_mini_isgreaterequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isgreaterequal)
+
+add_test(NAME "${TS_NAME}_math_mini_isinf"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isinf)
+
+add_test(NAME "${TS_NAME}_math_mini_isless"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isless)
+
+add_test(NAME "${TS_NAME}_math_mini_islessequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d islessequal)
+
+add_test(NAME "${TS_NAME}_math_mini_islessgreater"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d islessgreater)
+
+add_test(NAME "${TS_NAME}_math_mini_isnan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isnan)
+
+add_test(NAME "${TS_NAME}_math_mini_isnormal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isnormal)
+
+add_test(NAME "${TS_NAME}_math_mini_isnotequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isnotequal)
+
+add_test(NAME "${TS_NAME}_math_mini_isordered"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isordered)
+
+add_test(NAME "${TS_NAME}_math_mini_isunordered"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d isunordered)
+
+add_test(NAME "${TS_NAME}_math_mini_ldexp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d ldexp)
+
+add_test(NAME "${TS_NAME}_math_mini_log"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d log)
+
+add_test(NAME "${TS_NAME}_math_mini_log10"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d log10)
+
+add_test(NAME "${TS_NAME}_math_mini_log1p"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d log1p)
+
+add_test(NAME "${TS_NAME}_math_mini_log2"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d log2)
+
+add_test(NAME "${TS_NAME}_math_mini_logb"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d logb)
+
+add_test(NAME "${TS_NAME}_math_mini_mad"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d mad)
+
+add_test(NAME "${TS_NAME}_math_mini_maxmag"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d maxmag)
+
+add_test(NAME "${TS_NAME}_math_mini_minmag"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d minmag)
+
+add_test(NAME "${TS_NAME}_math_mini_modf"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d modf)
+
+add_test(NAME "${TS_NAME}_math_mini_multiply"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d multiply)
+
+add_test(NAME "${TS_NAME}_math_mini_nan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d nan)
+
+add_test(NAME "${TS_NAME}_math_mini_nextafter"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d nextafter)
+
+add_test(NAME "${TS_NAME}_math_mini_not"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d not)
+
+add_test(NAME "${TS_NAME}_math_mini_pown"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d pown)
+
+add_test(NAME "${TS_NAME}_math_mini_rint"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d rint)
+
+add_test(NAME "${TS_NAME}_math_mini_rootn"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d rootn)
+
+add_test(NAME "${TS_NAME}_math_mini_round"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d round)
+
+add_test(NAME "${TS_NAME}_math_mini_rsqrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d rsqrt)
+
+add_test(NAME "${TS_NAME}_math_mini_signbit"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d signbit)
+
+add_test(NAME "${TS_NAME}_math_mini_sin"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d sin)
+
+add_test(NAME "${TS_NAME}_math_mini_sinh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d sinh)
+
+add_test(NAME "${TS_NAME}_math_mini_sinpi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d sinpi)
+
+add_test(NAME "${TS_NAME}_math_mini_sqrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d sqrt)
+
+add_test(NAME "${TS_NAME}_math_mini_subtract"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d subtract)
+
+add_test(NAME "${TS_NAME}_math_mini_tan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d tan)
+
+add_test(NAME "${TS_NAME}_math_mini_tanh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d tanh)
+
+add_test(NAME "${TS_NAME}_math_mini_tanpi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d tanpi)
+
+add_test(NAME "${TS_NAME}_math_mini_trunc"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -d trunc)
+
+
+add_test(NAME "${TS_NAME}_mini_half"
+         COMMAND "${TS_BUILDDIR}/half/Test_half" -w)
+
+set_tests_properties(
+    "${TS_NAME}_mini_half"
+    "${TS_NAME}_math_mini_add"
+    "${TS_NAME}_math_mini_assignment"
+    "${TS_NAME}_math_mini_cbrt"
+    "${TS_NAME}_math_mini_ceil"
+    "${TS_NAME}_math_mini_copysign"
+    "${TS_NAME}_math_mini_cos"
+    "${TS_NAME}_math_mini_cosh"
+    "${TS_NAME}_math_mini_cospi"
+    "${TS_NAME}_math_mini_divide"
+    "${TS_NAME}_math_mini_exp"
+    "${TS_NAME}_math_mini_expm1"
+    "${TS_NAME}_math_mini_fabs"
+    "${TS_NAME}_math_mini_fdim"
+    "${TS_NAME}_math_mini_floor"
+    "${TS_NAME}_math_mini_fma"
+    "${TS_NAME}_math_mini_fmax"
+    "${TS_NAME}_math_mini_fmin"
+    "${TS_NAME}_math_mini_fract"
+    "${TS_NAME}_math_mini_frexp"
+    "${TS_NAME}_math_mini_hypot"
+    "${TS_NAME}_math_mini_ilogb"
+    "${TS_NAME}_math_mini_isequal"
+    "${TS_NAME}_math_mini_isfinite"
+    "${TS_NAME}_math_mini_isgreater"
+    "${TS_NAME}_math_mini_isgreaterequal"
+    "${TS_NAME}_math_mini_isinf"
+    "${TS_NAME}_math_mini_isless"
+    "${TS_NAME}_math_mini_islessequal"
+    "${TS_NAME}_math_mini_islessgreater"
+    "${TS_NAME}_math_mini_isnan"
+    "${TS_NAME}_math_mini_isnormal"
+    "${TS_NAME}_math_mini_isnotequal"
+    "${TS_NAME}_math_mini_isordered"
+    "${TS_NAME}_math_mini_isunordered"
+    "${TS_NAME}_math_mini_ldexp"
+    "${TS_NAME}_math_mini_log"
+    "${TS_NAME}_math_mini_log10"
+    "${TS_NAME}_math_mini_log1p"
+    "${TS_NAME}_math_mini_log2"
+    "${TS_NAME}_math_mini_logb"
+    "${TS_NAME}_math_mini_mad"
+    "${TS_NAME}_math_mini_maxmag"
+    "${TS_NAME}_math_mini_minmag"
+    "${TS_NAME}_math_mini_modf"
+    "${TS_NAME}_math_mini_multiply"
+    "${TS_NAME}_math_mini_nan"
+    "${TS_NAME}_math_mini_nextafter"
+    "${TS_NAME}_math_mini_not"
+    "${TS_NAME}_math_mini_pown"
+    "${TS_NAME}_math_mini_rint"
+    "${TS_NAME}_math_mini_rootn"
+    "${TS_NAME}_math_mini_round"
+    "${TS_NAME}_math_mini_rsqrt"
+    "${TS_NAME}_math_mini_signbit"
+    "${TS_NAME}_math_mini_sin"
+    "${TS_NAME}_math_mini_sinh"
+    "${TS_NAME}_math_mini_sinpi"
+    "${TS_NAME}_math_mini_sqrt"
+    "${TS_NAME}_math_mini_subtract"
+    "${TS_NAME}_math_mini_tan"
+    "${TS_NAME}_math_mini_tanh"
+    "${TS_NAME}_math_mini_tanpi"
+    "${TS_NAME}_math_mini_trunc"
+
+    "${TS_NAME}_conversion_mini_uint_uchar"
+    "${TS_NAME}_conversion_mini_short_char"
+    "${TS_NAME}_conversion_mini_uchar_ushort"
+    "${TS_NAME}_conversion_mini_char_int"
+    "${TS_NAME}_conversion_mini_float_char"
+    "${TS_NAME}_conversion_mini_float_ushort"
+    "${TS_NAME}_conversion_mini_int_float"
+    "${TS_NAME}_conversion_mini_double_short"
+    "${TS_NAME}_conversion_mini_double_ulong"
+    "${TS_NAME}_conversion_mini_long_double"
+  PROPERTIES
+    LABELS "conformance_suite_mini"
+    ENVIRONMENT "POCL_MEMORY_LIMIT=1")
+
+
+set_tests_properties(
+    "${TS_NAME}_allocations_buffer"
+    "${TS_NAME}_allocations_image"
+    "${TS_NAME}_api"
+    "${TS_NAME}_atomics"
+    "${TS_NAME}_basic_math"
+    "${TS_NAME}_basic_s2v"
+    "${TS_NAME}_basic_memory"
+    "${TS_NAME}_basic_image"
+    "${TS_NAME}_basic_other"
+    "${TS_NAME}_buffers_read"
+    "${TS_NAME}_buffers_write"
+    "${TS_NAME}_buffers_fill"
+    "${TS_NAME}_buffers_var1"
+    "${TS_NAME}_buffers_var2"
+    "${TS_NAME}_buffers_map_read"
+    "${TS_NAME}_buffers_map_write"
+    "${TS_NAME}_commonfns"
+    "${TS_NAME}_compiler"
+    "${TS_NAME}_computeinfo"
+    "${TS_NAME}_contractions"
+    "${TS_NAME}_device_partition"
+    "${TS_NAME}_events"
+    "${TS_NAME}_events_ooo"
+    "${TS_NAME}_events_other"
+    "${TS_NAME}_geometrics"
+    "${TS_NAME}_gl"
+    "${TS_NAME}_headers"
+    "${TS_NAME}_opencl_h"
+    "${TS_NAME}_opencl_h_c99"
+    "${TS_NAME}_images_cl_copy_images"
+    "${TS_NAME}_images_cl_fill_images"
+    "${TS_NAME}_images_cl_get_info"
+    "${TS_NAME}_images_cl_read_write_images"
+    "${TS_NAME}_images_kernel_image_methods_1D"
+    "${TS_NAME}_images_kernel_image_methods_2D"
+    "${TS_NAME}_images_kernel_image_methods_1Darray"
+    "${TS_NAME}_images_image_streams_1D"
+    "${TS_NAME}_images_image_streams_2D"
+    "${TS_NAME}_images_image_streams_1Darray"
+    "${TS_NAME}_images_samplerless_reads_1D"
+    "${TS_NAME}_images_samplerless_reads_2D"
+    "${TS_NAME}_images_samplerless_reads_1Darray"
+    "${TS_NAME}_integer_ops_1"
+    "${TS_NAME}_integer_ops_2"
+    "${TS_NAME}_integer_ops_3"
+    "${TS_NAME}_mem_host_flags"
+    "${TS_NAME}_multiples"
+    "${TS_NAME}_printf"
+    "${TS_NAME}_profiling"
+    "${TS_NAME}_relationals"
+    "${TS_NAME}_relationals_shuffle"
+    "${TS_NAME}_select_char_char"
+    "${TS_NAME}_select_short_short"
+    "${TS_NAME}_select_int_int"
+    "${TS_NAME}_select_long_long"
+    "${TS_NAME}_thread_dimensions_quick"
+    "${TS_NAME}_thread_dimensions_full"
+    "${TS_NAME}_vecstep"
+  PROPERTIES
+    LABELS "conformance_suite_full;conformance_suite_mini"
+    ENVIRONMENT "POCL_MEMORY_LIMIT=1")
+
+######################################################################
+
+add_test(NAME "${TS_NAME}_images_micro_kernel_image_methods_1D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 1D)
+add_test(NAME "${TS_NAME}_images_micro_kernel_image_methods_2D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 2D)
+add_test(NAME "${TS_NAME}_images_micro_kernel_image_methods_1Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 1Darray)
+add_test(NAME "${TS_NAME}_images_micro_kernel_image_methods_2Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_image_methods/test_kernel_image_methods" 2Darray)
+
+
+add_test(NAME "${TS_NAME}_images_micro_image_streams_1D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" 1D CL_ARGB CL_SIGNED_INT8)
+add_test(NAME "${TS_NAME}_images_micro_image_streams_2D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" 2D CL_A CL_UNSIGNED_INT8)
+add_test(NAME "${TS_NAME}_images_micro_image_streams_3D"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" 3D CL_RGBA CL_FLOAT)
+add_test(NAME "${TS_NAME}_images_micro_image_streams_1Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" 1Darray CL_A CL_UNORM_INT8)
+add_test(NAME "${TS_NAME}_images_micro_image_streams_2Darray"
+         COMMAND "${TS_BUILDDIR}/images/kernel_read_write/test_image_streams" 2Darray CL_RGBA CL_SNORM_INT16)
+
+
+add_test(NAME "${TS_NAME}_images_micro_samplerless_reads_1D"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 1D CL_A CL_UNSIGNED_INT8)
+add_test(NAME "${TS_NAME}_images_micro_samplerless_reads_2D"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 2D CL_RGBA CL_FLOAT)
+add_test(NAME "${TS_NAME}_images_micro_samplerless_reads_3D"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 3D CL_A CL_UNORM_INT8)
+add_test(NAME "${TS_NAME}_images_micro_samplerless_reads_1Darray"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 1Darray CL_RGBA CL_SNORM_INT16)
+add_test(NAME "${TS_NAME}_images_micro_samplerless_reads_2Darray"
+         COMMAND "${TS_BUILDDIR}/images/samplerlessReads/test_samplerless_reads" 2Darray CL_ARGB CL_SIGNED_INT8)
+
+add_test(NAME "${TS_NAME}_integer_ops_micro_1"
+         COMMAND "${TS_BUILDDIR}/integer_ops/test_integer_ops" integer_clz  integer_hadd  integer_rhadd  integer_mul_hi  integer_rotate  integer_clamp  integer_mad_sat  integer_mad_hi  integer_min  integer_max  integer_addAssign  integer_subtractAssign  integer_multiplyAssign  integer_divideAssign  integer_moduloAssign)
+
+add_test(NAME "${TS_NAME}_integer_ops_micro_3"
+         COMMAND "${TS_BUILDDIR}/integer_ops/test_integer_ops"  quick_long_math  quick_long_logic  quick_ulong_shift  quick_ulong_compare  quick_uint_math  quick_uint_logic  quick_int_shift  quick_int_compare  quick_short_math  quick_short_logic  quick_ushort_shift  quick_ushort_compare  quick_char_math  quick_char_logic  quick_uchar_shift  quick_uchar_compare)
+
+add_test(NAME "${TS_NAME}_basic_micro_other"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" mri_one  mri_multiple  barrier  createkernelsinprogram  work_item_functions    kernel_call_kernel_function  host_numeric_constants  kernel_numeric_constants  kernel_limit_constants  kernel_preprocessor_macros  parameter_types  vector_creation  vec_type_hint  global_work_offsets  get_global_offset  hostptr    if  sizeof  loop  pointer_cast  local_arg_def  local_kernel_def  local_kernel_scope  constant  constant_source)
+
+add_test(NAME "${TS_NAME}_basic_micro_memory"
+         COMMAND "${TS_BUILDDIR}/basic/test_basic" vload_global  vload_local  vstore_global  vstore_local   bufferreadwriterect  arrayreadwrite  arraycopy  enqueue_map_buffer   kernel_memory_alignment_local  kernel_memory_alignment_global)
+
+add_test(NAME "${TS_NAME}_compiler_micro"
+         COMMAND "${TS_BUILDDIR}/compiler/test_compiler" load_program_source  load_multistring_source  load_two_kernel_source  load_null_terminated_source  load_null_terminated_multi_line_source  load_null_terminated_partial_multi_line_source  load_discreet_length_source  get_program_source  get_program_build_info  get_program_info   async_build  options_build_optimizations  options_build_macro  options_build_macro_existence  options_include_directory  options_denorm_cache  preprocessor_ [...]
+         WORKING_DIRECTORY "${TS_BASEDIR}/src/${TS_NAME}/test_conformance/compiler")
+
+# with wimpy mode
+add_test(NAME "${TS_NAME}_select_micro_char_char"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_char_char -w)
+
+add_test(NAME "${TS_NAME}_select_micro_ushort_ushort"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_ushort_ushort -w)
+
+add_test(NAME "${TS_NAME}_select_micro_int_int"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_int_int -w)
+
+add_test(NAME "${TS_NAME}_select_micro_ulong_ulong"
+         COMMAND "${TS_BUILDDIR}/select/test_select" select_ulong_ulong -w)
+
+# conversions, for micro version of conformance suite.
+# disables multithreading, enables Wimpy mode
+
+add_test(NAME "${TS_NAME}_conversion_micro_uint_uchar"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w uint_uchar uint_sat_uchar )
+
+add_test(NAME "${TS_NAME}_conversion_micro_short_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w short_char short_sat_char )
+
+add_test(NAME "${TS_NAME}_conversion_micro_uchar_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w uchar_ushort uchar_sat_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_micro_char_int"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w char_int char_sat_int )
+
+add_test(NAME "${TS_NAME}_conversion_micro_float_char"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w float_char float_rte_char float_rtp_char float_rtn_char float_rtz_char )
+
+add_test(NAME "${TS_NAME}_conversion_micro_float_ushort"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w float_ushort float_rte_ushort float_rtp_ushort float_rtn_ushort float_rtz_ushort )
+
+add_test(NAME "${TS_NAME}_conversion_micro_int_float"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w int_float int_sat_float int_rte_float int_sat_rte_float int_rtp_float int_sat_rtp_float int_rtn_float int_sat_rtn_float int_rtz_float int_sat_rtz_float )
+
+add_test(NAME "${TS_NAME}_conversion_micro_double_short"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w double_short double_rte_short double_rtp_short double_rtn_short double_rtz_short )
+
+add_test(NAME "${TS_NAME}_conversion_micro_double_ulong"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w double_ulong double_rte_ulong double_rtp_ulong double_rtn_ulong double_rtz_ulong )
+
+add_test(NAME "${TS_NAME}_conversion_micro_long_double"
+         COMMAND "${TS_BUILDDIR}/conversions/test_conversions" -m -w long_double long_sat_double long_rte_double long_sat_rte_double long_rtp_double long_sat_rtp_double long_rtn_double long_sat_rtn_double long_rtz_double long_sat_rtz_double )
+
+# tests math on vectors of 4 floats for micro testsuite
+# vectors of 4, wimpy mode
+
+add_test(NAME "${TS_NAME}_math_micro_add"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w add)
+
+add_test(NAME "${TS_NAME}_math_micro_assignment"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w assignment)
+
+add_test(NAME "${TS_NAME}_math_micro_cbrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w cbrt)
+
+add_test(NAME "${TS_NAME}_math_micro_ceil"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w ceil)
+
+add_test(NAME "${TS_NAME}_math_micro_copysign"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w copysign)
+
+add_test(NAME "${TS_NAME}_math_micro_cos"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w cos)
+
+add_test(NAME "${TS_NAME}_math_micro_cosh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w cosh)
+
+add_test(NAME "${TS_NAME}_math_micro_cospi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w cospi)
+
+add_test(NAME "${TS_NAME}_math_micro_divide"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w divide)
+
+add_test(NAME "${TS_NAME}_math_micro_exp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w exp)
+
+add_test(NAME "${TS_NAME}_math_micro_expm1"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w expm1)
+
+add_test(NAME "${TS_NAME}_math_micro_fabs"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w fabs)
+
+add_test(NAME "${TS_NAME}_math_micro_fdim"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w fdim)
+
+add_test(NAME "${TS_NAME}_math_micro_floor"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w floor)
+
+add_test(NAME "${TS_NAME}_math_micro_fma"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w fma)
+
+add_test(NAME "${TS_NAME}_math_micro_fmax"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w fmax)
+
+add_test(NAME "${TS_NAME}_math_micro_fmin"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w fmin)
+
+add_test(NAME "${TS_NAME}_math_micro_fract"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w fract)
+
+add_test(NAME "${TS_NAME}_math_micro_frexp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w frexp)
+
+add_test(NAME "${TS_NAME}_math_micro_hypot"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w hypot)
+
+add_test(NAME "${TS_NAME}_math_micro_ilogb"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w ilogb)
+
+add_test(NAME "${TS_NAME}_math_micro_isequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isequal)
+
+add_test(NAME "${TS_NAME}_math_micro_isfinite"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isfinite)
+
+add_test(NAME "${TS_NAME}_math_micro_isgreater"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isgreater)
+
+add_test(NAME "${TS_NAME}_math_micro_isgreaterequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isgreaterequal)
+
+add_test(NAME "${TS_NAME}_math_micro_isinf"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isinf)
+
+add_test(NAME "${TS_NAME}_math_micro_isless"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isless)
+
+add_test(NAME "${TS_NAME}_math_micro_islessequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w islessequal)
+
+add_test(NAME "${TS_NAME}_math_micro_islessgreater"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w islessgreater)
+
+add_test(NAME "${TS_NAME}_math_micro_isnan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isnan)
+
+add_test(NAME "${TS_NAME}_math_micro_isnormal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isnormal)
+
+add_test(NAME "${TS_NAME}_math_micro_isnotequal"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isnotequal)
+
+add_test(NAME "${TS_NAME}_math_micro_isordered"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isordered)
+
+add_test(NAME "${TS_NAME}_math_micro_isunordered"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w isunordered)
+
+add_test(NAME "${TS_NAME}_math_micro_ldexp"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w ldexp)
+
+add_test(NAME "${TS_NAME}_math_micro_log"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w log)
+
+add_test(NAME "${TS_NAME}_math_micro_log10"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w log10)
+
+add_test(NAME "${TS_NAME}_math_micro_log1p"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w log1p)
+
+add_test(NAME "${TS_NAME}_math_micro_log2"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w log2)
+
+add_test(NAME "${TS_NAME}_math_micro_logb"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w logb)
+
+add_test(NAME "${TS_NAME}_math_micro_mad"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w mad)
+
+add_test(NAME "${TS_NAME}_math_micro_maxmag"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w maxmag)
+
+add_test(NAME "${TS_NAME}_math_micro_minmag"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w minmag)
+
+add_test(NAME "${TS_NAME}_math_micro_modf"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w modf)
+
+add_test(NAME "${TS_NAME}_math_micro_multiply"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w multiply)
+
+add_test(NAME "${TS_NAME}_math_micro_nan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w nan)
+
+add_test(NAME "${TS_NAME}_math_micro_nextafter"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w nextafter)
+
+add_test(NAME "${TS_NAME}_math_micro_not"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w not)
+
+add_test(NAME "${TS_NAME}_math_micro_pown"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w pown)
+
+add_test(NAME "${TS_NAME}_math_micro_rint"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w rint)
+
+add_test(NAME "${TS_NAME}_math_micro_rootn"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w rootn)
+
+add_test(NAME "${TS_NAME}_math_micro_round"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w round)
+
+add_test(NAME "${TS_NAME}_math_micro_rsqrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w rsqrt)
+
+add_test(NAME "${TS_NAME}_math_micro_signbit"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w signbit)
+
+add_test(NAME "${TS_NAME}_math_micro_sin"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w sin)
+
+add_test(NAME "${TS_NAME}_math_micro_sinh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w sinh)
+
+add_test(NAME "${TS_NAME}_math_micro_sinpi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w sinpi)
+
+add_test(NAME "${TS_NAME}_math_micro_sqrt"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w sqrt)
+
+add_test(NAME "${TS_NAME}_math_micro_subtract"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w subtract)
+
+add_test(NAME "${TS_NAME}_math_micro_tan"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w tan)
+
+add_test(NAME "${TS_NAME}_math_micro_tanh"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w tanh)
+
+add_test(NAME "${TS_NAME}_math_micro_tanpi"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w tanpi)
+
+add_test(NAME "${TS_NAME}_math_micro_trunc"
+         COMMAND "${TS_BUILDDIR}/math_brute_force/bruteforce"  -4 -w trunc)
+
+set_tests_properties(
+    "${TS_NAME}_basic_micro_memory"
+    "${TS_NAME}_basic_micro_other"
+
+    "${TS_NAME}_math_micro_add"
+    "${TS_NAME}_math_micro_assignment"
+    "${TS_NAME}_math_micro_cbrt"
+    "${TS_NAME}_math_micro_ceil"
+    "${TS_NAME}_math_micro_copysign"
+    "${TS_NAME}_math_micro_cos"
+    "${TS_NAME}_math_micro_cosh"
+    "${TS_NAME}_math_micro_cospi"
+    "${TS_NAME}_math_micro_divide"
+    "${TS_NAME}_math_micro_exp"
+    "${TS_NAME}_math_micro_expm1"
+    "${TS_NAME}_math_micro_fabs"
+    "${TS_NAME}_math_micro_fdim"
+    "${TS_NAME}_math_micro_floor"
+    "${TS_NAME}_math_micro_fma"
+    "${TS_NAME}_math_micro_fmax"
+    "${TS_NAME}_math_micro_fmin"
+    "${TS_NAME}_math_micro_fract"
+    "${TS_NAME}_math_micro_frexp"
+    "${TS_NAME}_math_micro_hypot"
+    "${TS_NAME}_math_micro_ilogb"
+    "${TS_NAME}_math_micro_isequal"
+    "${TS_NAME}_math_micro_isfinite"
+    "${TS_NAME}_math_micro_isgreater"
+    "${TS_NAME}_math_micro_isgreaterequal"
+    "${TS_NAME}_math_micro_isinf"
+    "${TS_NAME}_math_micro_isless"
+    "${TS_NAME}_math_micro_islessequal"
+    "${TS_NAME}_math_micro_islessgreater"
+    "${TS_NAME}_math_micro_isnan"
+    "${TS_NAME}_math_micro_isnormal"
+    "${TS_NAME}_math_micro_isnotequal"
+    "${TS_NAME}_math_micro_isordered"
+    "${TS_NAME}_math_micro_isunordered"
+    "${TS_NAME}_math_micro_ldexp"
+    "${TS_NAME}_math_micro_log"
+    "${TS_NAME}_math_micro_log10"
+    "${TS_NAME}_math_micro_log1p"
+    "${TS_NAME}_math_micro_log2"
+    "${TS_NAME}_math_micro_logb"
+    "${TS_NAME}_math_micro_mad"
+    "${TS_NAME}_math_micro_maxmag"
+    "${TS_NAME}_math_micro_minmag"
+    "${TS_NAME}_math_micro_modf"
+    "${TS_NAME}_math_micro_multiply"
+    "${TS_NAME}_math_micro_nan"
+    "${TS_NAME}_math_micro_nextafter"
+    "${TS_NAME}_math_micro_not"
+    "${TS_NAME}_math_micro_pown"
+    "${TS_NAME}_math_micro_rint"
+    "${TS_NAME}_math_micro_rootn"
+    "${TS_NAME}_math_micro_round"
+    "${TS_NAME}_math_micro_rsqrt"
+    "${TS_NAME}_math_micro_signbit"
+    "${TS_NAME}_math_micro_sin"
+    "${TS_NAME}_math_micro_sinh"
+    "${TS_NAME}_math_micro_sinpi"
+    "${TS_NAME}_math_micro_sqrt"
+    "${TS_NAME}_math_micro_subtract"
+    "${TS_NAME}_math_micro_tan"
+    "${TS_NAME}_math_micro_tanh"
+    "${TS_NAME}_math_micro_tanpi"
+    "${TS_NAME}_math_micro_trunc"
+
+    "${TS_NAME}_conversion_micro_uint_uchar"
+    "${TS_NAME}_conversion_micro_short_char"
+    "${TS_NAME}_conversion_micro_uchar_ushort"
+    "${TS_NAME}_conversion_micro_char_int"
+    "${TS_NAME}_conversion_micro_float_char"
+    "${TS_NAME}_conversion_micro_float_ushort"
+    "${TS_NAME}_conversion_micro_int_float"
+    "${TS_NAME}_conversion_micro_double_short"
+    "${TS_NAME}_conversion_micro_double_ulong"
+    "${TS_NAME}_conversion_micro_long_double"
+
+    "${TS_NAME}_images_micro_kernel_image_methods_1D"
+    "${TS_NAME}_images_micro_kernel_image_methods_2D"
+    "${TS_NAME}_images_micro_kernel_image_methods_2D"
+    "${TS_NAME}_images_micro_kernel_image_methods_1Darray"
+    "${TS_NAME}_images_micro_kernel_image_methods_2Darray"
+
+    "${TS_NAME}_images_micro_image_streams_1D"
+    "${TS_NAME}_images_micro_image_streams_2D"
+    "${TS_NAME}_images_micro_image_streams_3D"
+    "${TS_NAME}_images_micro_image_streams_1Darray"
+    "${TS_NAME}_images_micro_image_streams_2Darray"
+
+    "${TS_NAME}_images_micro_samplerless_reads_1D"
+    "${TS_NAME}_images_micro_samplerless_reads_2D"
+    "${TS_NAME}_images_micro_samplerless_reads_3D"
+    "${TS_NAME}_images_micro_samplerless_reads_1Darray"
+    "${TS_NAME}_images_micro_samplerless_reads_2Darray"
+
+    "${TS_NAME}_integer_ops_micro_1"
+    "${TS_NAME}_integer_ops_micro_3"
+
+    "${TS_NAME}_select_micro_char_char"
+    "${TS_NAME}_select_micro_ushort_ushort"
+    "${TS_NAME}_select_micro_int_int"
+    "${TS_NAME}_select_micro_ulong_ulong"
+
+  PROPERTIES
+    LABELS "conformance_suite_micro"
+    ENVIRONMENT "POCL_MEMORY_LIMIT=1")
+
+
+set_tests_properties(
+    "${TS_NAME}_mini_half"
+  PROPERTIES
+    LABELS "conformance_suite_micro;conformance_suite_mini"
+    ENVIRONMENT "POCL_MEMORY_LIMIT=1")
+
+set_tests_properties(
+    "${TS_NAME}_allocations_buffer"
+    "${TS_NAME}_allocations_image"
+    "${TS_NAME}_api"
+    "${TS_NAME}_atomics"
+    "${TS_NAME}_basic_math"
+    "${TS_NAME}_basic_s2v"
+    "${TS_NAME}_basic_image"
+    "${TS_NAME}_buffers_read"
+    "${TS_NAME}_buffers_write"
+    "${TS_NAME}_buffers_fill"
+    "${TS_NAME}_buffers_var1"
+    "${TS_NAME}_buffers_var2"
+    "${TS_NAME}_buffers_map_read"
+    "${TS_NAME}_buffers_map_write"
+    "${TS_NAME}_commonfns"
+    "${TS_NAME}_compiler_micro"
+    "${TS_NAME}_computeinfo"
+    "${TS_NAME}_contractions"
+    "${TS_NAME}_device_partition"
+    "${TS_NAME}_events"
+    "${TS_NAME}_events_ooo"
+    "${TS_NAME}_events_other"
+    "${TS_NAME}_geometrics"
+    "${TS_NAME}_gl"
+    "${TS_NAME}_headers"
+    "${TS_NAME}_opencl_h"
+    "${TS_NAME}_opencl_h_c99"
+    "${TS_NAME}_images_cl_copy_images"
+    "${TS_NAME}_images_cl_fill_images"
+    "${TS_NAME}_images_cl_get_info"
+    "${TS_NAME}_images_cl_read_write_images"
+
+    "${TS_NAME}_mem_host_flags"
+    "${TS_NAME}_multiples"
+    "${TS_NAME}_printf"
+    "${TS_NAME}_profiling"
+    "${TS_NAME}_relationals"
+    "${TS_NAME}_thread_dimensions_quick"
+    "${TS_NAME}_vecstep"
+  PROPERTIES
+    LABELS "conformance_suite_full;conformance_suite_mini;conformance_suite_micro"
+    ENVIRONMENT "POCL_MEMORY_LIMIT=1")
+
+
+endif()
diff --git a/examples/example1-spir32/CMakeLists.txt b/examples/example1-spir32/CMakeLists.txt
index 60684d7..7dab157 100644
--- a/examples/example1-spir32/CMakeLists.txt
+++ b/examples/example1-spir32/CMakeLists.txt
@@ -25,11 +25,8 @@
 
 if(POCL_DEVICE_ADDRESS_BITS EQUAL "32")
 
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -DSRCDIR='"$(abs_srcdir)"'
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-# example1_CFLAGS = @OPENCL_CFLAGS@
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
 add_compile_options(${OPENCL_CFLAGS})
 
 if (MSVC)
@@ -37,22 +34,24 @@ if (MSVC)
 endif(MSVC)
 add_executable("example1-spir32" example1.c example1_exec.c example1.cl example1.spir)
 
-# example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
 target_link_libraries("example1-spir32" ${POCLU_LINK_OPTIONS})
 
-
-add_test("spec_tests/example1_dot_product_spir32" "example1-spir32")
+add_test_pocl(NAME "spec_tests/example1_dot_product_spir32"
+              EXPECTED_OUTPUT "example1-spir.stdout"
+              COMMAND "example1-spir32")
 
 set_tests_properties( "spec_tests/example1_dot_product_spir32"
   PROPERTIES
     COST 3.0
-    PASS_REGULAR_EXPRESSION "[(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] [.] [(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] = 0[.]000000
-[(]1[.]000000, 1[.]000000, 1[.]000000, 1[.]000000[)] [.] [(]1[.]000000, 1[.]000000, 1[.]000000, 1[.]000000[)] = 4[.]000000
-[(]2[.]000000, 2[.]000000, 2[.]000000, 2[.]000000[)] [.] [(]2[.]000000, 2[.]000000, 2[.]000000, 2[.]000000[)] = 16[.]000000
-[(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
-OK"
     PROCESSORS 1
     LABELS "internal;spir"
     DEPENDS "pocl_version_check")
 
+# fails on older LLVMs because SPIR bitcode was generated with LLVM 4.0
+if(LLVM_OLDER_THAN_3_9)
+  set_tests_properties( "spec_tests/example1_dot_product_spir32"
+  PROPERTIES
+    WILL_FAIL 1)
+endif()
+
 endif()
diff --git a/examples/example1-spir32/example1-spir.stdout b/examples/example1-spir32/example1-spir.stdout
new file mode 100644
index 0000000..ef35a12
--- /dev/null
+++ b/examples/example1-spir32/example1-spir.stdout
@@ -0,0 +1,5 @@
+(0.000000, 0.000000, 0.000000, 0.000000) . (0.000000, 0.000000, 0.000000, 0.000000) = 0.000000
+(1.000000, 1.000000, 1.000000, 1.000000) . (1.000000, 1.000000, 1.000000, 1.000000) = 4.000000
+(2.000000, 2.000000, 2.000000, 2.000000) . (2.000000, 2.000000, 2.000000, 2.000000) = 16.000000
+(3.000000, 3.000000, 3.000000, 3.000000) . (3.000000, 3.000000, 3.000000, 3.000000) = 36.000000
+OK
diff --git a/examples/example1-spir32/example1.spir b/examples/example1-spir32/example1.spir
index 7b266d1..f7b70a2 100644
Binary files a/examples/example1-spir32/example1.spir and b/examples/example1-spir32/example1.spir differ
diff --git a/examples/example1-spir32/example1_exec.c b/examples/example1-spir32/example1_exec.c
index 7ab7c7a..1447c46 100644
--- a/examples/example1-spir32/example1_exec.c
+++ b/examples/example1-spir32/example1_exec.c
@@ -36,8 +36,18 @@ exec_dot_product_kernel(const char *program_source, size_t source_size,
   // get the list of GPU devices associated with context 
   clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
   devices = (cl_device_id *) malloc(cb);
-  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
- 
+  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL);
+
+  char extensions[1024];
+  err = clGetDeviceInfo (devices[0], CL_DEVICE_EXTENSIONS, 1024, extensions,
+                         NULL);
+  CHECK_OPENCL_ERROR_IN ("clGetDeviceInfo");
+  if (strstr (extensions, "cl_khr_spir") == NULL)
+    {
+      printf ("SPIR not supported, cannot run the test\n");
+      return -1;
+    }
+
   // create a command-queue 
   cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
   if (cmd_queue == (cl_command_queue)0) 
diff --git a/examples/example1-spir64/CMakeLists.txt b/examples/example1-spir64/CMakeLists.txt
index 27ddc5c..17f3783 100644
--- a/examples/example1-spir64/CMakeLists.txt
+++ b/examples/example1-spir64/CMakeLists.txt
@@ -25,11 +25,8 @@
 
 if(POCL_DEVICE_ADDRESS_BITS EQUAL "64")
 
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -DSRCDIR='"$(abs_srcdir)"'
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-# example1_CFLAGS = @OPENCL_CFLAGS@
-#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
 add_compile_options(${OPENCL_CFLAGS})
 
 if (MSVC)
@@ -37,21 +34,25 @@ if (MSVC)
 endif(MSVC)
 add_executable("example1-spir" example1.c example1_exec.c example1.cl example1.spir)
 
-# example1_LDADD = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
 target_link_libraries("example1-spir" ${POCLU_LINK_OPTIONS})
 
-add_test("spec_tests/example1_dot_product_spir64" "example1-spir")
+add_test_pocl(NAME "spec_tests/example1_dot_product_spir64"
+              EXPECTED_OUTPUT "example1-spir.stdout"
+              COMMAND "example1-spir")
 
 set_tests_properties( "spec_tests/example1_dot_product_spir64"
   PROPERTIES
     COST 3.0
-    PASS_REGULAR_EXPRESSION "[(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] [.] [(]0[.]000000, 0[.]000000, 0[.]000000, 0[.]000000[)] = 0[.]000000
-[(]1[.]000000, 1[.]000000, 1[.]000000, 1[.]000000[)] [.] [(]1[.]000000, 1[.]000000, 1[.]000000, 1[.]000000[)] = 4[.]000000
-[(]2[.]000000, 2[.]000000, 2[.]000000, 2[.]000000[)] [.] [(]2[.]000000, 2[.]000000, 2[.]000000, 2[.]000000[)] = 16[.]000000
-[(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
-OK"
     PROCESSORS 1
-    LABELS "internal;spir"
+    LABELS "internal;spir;cuda"
     DEPENDS "pocl_version_check")
 
+# fails on older LLVMs because SPIR bitcode was generated with LLVM 4.0
+if(LLVM_OLDER_THAN_3_9)
+  set_tests_properties( "spec_tests/example1_dot_product_spir64"
+  PROPERTIES
+    WILL_FAIL 1)
+endif()
+
+
 endif()
diff --git a/examples/example1-spir64/example1-spir.stdout b/examples/example1-spir64/example1-spir.stdout
new file mode 100644
index 0000000..ef35a12
--- /dev/null
+++ b/examples/example1-spir64/example1-spir.stdout
@@ -0,0 +1,5 @@
+(0.000000, 0.000000, 0.000000, 0.000000) . (0.000000, 0.000000, 0.000000, 0.000000) = 0.000000
+(1.000000, 1.000000, 1.000000, 1.000000) . (1.000000, 1.000000, 1.000000, 1.000000) = 4.000000
+(2.000000, 2.000000, 2.000000, 2.000000) . (2.000000, 2.000000, 2.000000, 2.000000) = 16.000000
+(3.000000, 3.000000, 3.000000, 3.000000) . (3.000000, 3.000000, 3.000000, 3.000000) = 36.000000
+OK
diff --git a/examples/example1-spir64/example1.spir b/examples/example1-spir64/example1.spir
index 9362b7a..a352670 100644
Binary files a/examples/example1-spir64/example1.spir and b/examples/example1-spir64/example1.spir differ
diff --git a/examples/example1-spir64/example1_exec.c b/examples/example1-spir64/example1_exec.c
index 7ab7c7a..03f936d 100644
--- a/examples/example1-spir64/example1_exec.c
+++ b/examples/example1-spir64/example1_exec.c
@@ -21,7 +21,8 @@ exec_dot_product_kernel(const char *program_source, size_t source_size,
   cl_context  context; 
   cl_command_queue cmd_queue; 
   cl_device_id  *devices; 
-  cl_program  program; 
+  cl_platform_id platform;
+  cl_program  program;
   cl_kernel  kernel; 
   cl_mem       memobjs[3]; 
   size_t       global_work_size[1]; 
@@ -29,6 +30,9 @@ exec_dot_product_kernel(const char *program_source, size_t source_size,
   size_t       cb; 
   cl_int       err; 
   int          i;
+
+  clGetPlatformIDs (1, &platform, NULL);
+
   context = poclu_create_any_context();
   if (context == (cl_context)0) 
     return -1; 
@@ -36,8 +40,18 @@ exec_dot_product_kernel(const char *program_source, size_t source_size,
   // get the list of GPU devices associated with context 
   clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); 
   devices = (cl_device_id *) malloc(cb);
-  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); 
- 
+  clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL);
+
+  char extensions[1024];
+  err = clGetDeviceInfo (devices[0], CL_DEVICE_EXTENSIONS, 1024, extensions,
+                         NULL);
+  CHECK_OPENCL_ERROR_IN ("clGetDeviceInfo");
+  if (strstr (extensions, "cl_khr_spir") == NULL)
+    {
+      printf ("SPIR not supported, cannot run the test\n");
+      return -1;
+    }
+
   // create a command-queue 
   cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); 
   if (cmd_queue == (cl_command_queue)0) 
@@ -184,7 +198,8 @@ exec_dot_product_kernel(const char *program_source, size_t source_size,
   delete_memobjs(memobjs, 3); 
   clReleaseKernel(kernel); 
   clReleaseProgram(program); 
-  clReleaseCommandQueue(cmd_queue); 
+  clUnloadPlatformCompiler (platform);
+  clReleaseCommandQueue(cmd_queue);
   clReleaseContext(context); 
   return 0; // success... 
 }
diff --git a/examples/example1/CMakeLists.txt b/examples/example1/CMakeLists.txt
index 6898126..d836011 100644
--- a/examples/example1/CMakeLists.txt
+++ b/examples/example1/CMakeLists.txt
@@ -49,5 +49,5 @@ set_tests_properties( "examples/example1_dot_product"
 [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] [.] [(]3[.]000000, 3[.]000000, 3[.]000000, 3[.]000000[)] = 36[.]000000
 OK"
     PROCESSORS 1
-    LABELS "internal;hsa;tce"
+    LABELS "internal;hsa;tce;cuda"
     DEPENDS "pocl_version_check")
diff --git a/examples/example1/example1.c b/examples/example1/example1.c
index c6b1a22..d7afacc 100644
--- a/examples/example1/example1.c
+++ b/examples/example1/example1.c
@@ -103,6 +103,12 @@ main (void)
         }
     }
 
+  free (source);
+  free (srcA);
+  free (srcB);
+  free (dst);
+
+
   printf ("OK\n");
   return 0;
 }
diff --git a/examples/example1/example1_exec.c b/examples/example1/example1_exec.c
index 27d3838..c4911ab 100644
--- a/examples/example1/example1_exec.c
+++ b/examples/example1/example1_exec.c
@@ -1,6 +1,7 @@
 #include <stdlib.h>
 #include <CL/opencl.h>
 #include <poclu.h>
+#include <assert.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -19,7 +20,8 @@ exec_dot_product_kernel(const char *program_source,
                         int n, cl_float4 *srcA, cl_float4 *srcB, cl_float *dst) 
 { 
   cl_context  context; 
-  cl_command_queue cmd_queue; 
+  cl_command_queue cmd_queue;
+  cl_platform_id platform;
   cl_device_id  *devices; 
   cl_program  program; 
   cl_kernel  kernel; 
@@ -29,6 +31,10 @@ exec_dot_product_kernel(const char *program_source,
   size_t       cb; 
   cl_int       err; 
   int          i;
+
+  clGetPlatformIDs (1, &platform, NULL);
+  assert (platform);
+
   context = poclu_create_any_context();
   if (context == (cl_context)0) 
     return -1; 
@@ -107,8 +113,8 @@ exec_dot_product_kernel(const char *program_source,
       clReleaseCommandQueue(cmd_queue); 
       clReleaseContext(context); 
       return -1; 
-    } 
- 
+    }
+
   // create the kernel 
   kernel = clCreateKernel(program, "dot_product", NULL); 
   if (kernel == (cl_kernel)0) 
@@ -175,16 +181,18 @@ exec_dot_product_kernel(const char *program_source,
       poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcA[i], 4);
       poclu_bswap_cl_float_array(devices[0], (cl_float*)&srcB[i], 4);
     }
-  free(devices); 
 
+  CHECK_CL_ERROR (clFinish (cmd_queue));
+  free(devices);
 
   // release kernel, program, and memory objects 
-  delete_memobjs(memobjs, 3); 
-  clReleaseKernel(kernel); 
-  clReleaseProgram(program); 
-  clReleaseCommandQueue(cmd_queue); 
-  clReleaseContext(context); 
-  return 0; // success... 
+  delete_memobjs (memobjs, 3);
+  CHECK_CL_ERROR (clReleaseKernel (kernel));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseCommandQueue (cmd_queue));
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+  return 0; // success...
 }
 
 #ifdef __cplusplus
diff --git a/examples/example2/CMakeLists.txt b/examples/example2/CMakeLists.txt
index dbc65aa..47d6dcc 100644
--- a/examples/example2/CMakeLists.txt
+++ b/examples/example2/CMakeLists.txt
@@ -46,5 +46,5 @@ set_tests_properties( "examples/example2_matrix_transpose"
     COST 3.0
     PASS_REGULAR_EXPRESSION "OK\n"
     PROCESSORS 1
-    LABELS "internal;hsa;tce"
+    LABELS "internal;hsa;tce;cuda"
     DEPENDS "pocl_version_check")
diff --git a/examples/example2/example2.c b/examples/example2/example2.c
index a43655e..d2b86b6 100644
--- a/examples/example2/example2.c
+++ b/examples/example2/example2.c
@@ -48,14 +48,14 @@ main (void)
   int j;
   cl_context  context; 
   size_t cb;
-  cl_device_id *devices;
-  cl_command_queue cmd_queue;
-  cl_program program;
-  cl_int err;
-  cl_kernel kernel;
-  cl_mem memobjs[2];
-  size_t global_work_size[2];
-  size_t local_work_size[2];
+  cl_device_id *devices = NULL;
+  cl_command_queue cmd_queue = NULL;
+  cl_program program = NULL;
+  cl_int err = 0;
+  cl_kernel kernel = NULL;
+  cl_mem memobjs[2] = { 0 };
+  size_t global_work_size[2] = { 0 };
+  size_t local_work_size[2] = { 0 };
 
   source_file = fopen("example2.cl", "r");
   if (source_file == NULL) 
@@ -131,8 +131,10 @@ main (void)
       clReleaseCommandQueue(cmd_queue); 
       clReleaseContext(context); 
       return -1; 
-    } 
- 
+    }
+
+  free (source);
+
   err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 
   if (err != CL_SUCCESS) 
     { 
@@ -197,14 +199,17 @@ main (void)
       clReleaseProgram(program); 
       clReleaseCommandQueue(cmd_queue); 
       clReleaseContext(context); 
-      return -1; 
-    } 
- 
-  delete_memobjs(memobjs, 2); 
-  clReleaseKernel(kernel); 
-  clReleaseProgram(program); 
-  clReleaseCommandQueue(cmd_queue); 
-  clReleaseContext(context); 
+      return -1;
+    }
+
+  CHECK_CL_ERROR (clFinish (cmd_queue));
+
+  delete_memobjs (memobjs, 2);
+  CHECK_CL_ERROR (clReleaseKernel (kernel));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseCommandQueue (cmd_queue));
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
 
   for (i = 0; i < HEIGHT; ++i)
     {
@@ -215,7 +220,10 @@ main (void)
 	}
       }
     }
-  
+
+  free (input);
+  free (output);
+
   printf ("OK\n");
   return 0;
 }
@@ -225,5 +233,5 @@ delete_memobjs(cl_mem *memobjs, int n)
 { 
   int i; 
   for (i=0; i<n; i++) 
-    clReleaseMemObject(memobjs[i]); 
+    CHECK_CL_ERROR (clReleaseMemObject(memobjs[i]));
 } 
diff --git a/examples/example2a/CMakeLists.txt b/examples/example2a/CMakeLists.txt
index 3b0b7b4..32463e3 100644
--- a/examples/example2a/CMakeLists.txt
+++ b/examples/example2a/CMakeLists.txt
@@ -46,5 +46,5 @@ set_tests_properties( "examples/example2_matrix_transpose_alocals"
     COST 3.0
     PASS_REGULAR_EXPRESSION "OK\n"
     PROCESSORS 1
-    LABELS "internal;hsa;tce"
+    LABELS "internal;hsa;tce;cuda"
     DEPENDS "pocl_version_check")
diff --git a/examples/example2a/example2a.c b/examples/example2a/example2a.c
index 663f56d..1f65c1a 100644
--- a/examples/example2a/example2a.c
+++ b/examples/example2a/example2a.c
@@ -200,11 +200,14 @@ main (void)
       return -1; 
     } 
  
-  delete_memobjs(memobjs, 2); 
-  clReleaseKernel(kernel); 
-  clReleaseProgram(program); 
-  clReleaseCommandQueue(cmd_queue); 
-  clReleaseContext(context); 
+  CHECK_CL_ERROR (clFinish (cmd_queue));
+
+  delete_memobjs (memobjs, 2);
+  CHECK_CL_ERROR (clReleaseKernel (kernel));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseCommandQueue (cmd_queue));
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
 
   for (i = 0; i < HEIGHT; ++i)
     {
@@ -215,7 +218,10 @@ main (void)
 	}
       }
     }
-  
+
+  free (input);
+  free (output);
+
   printf ("OK\n");
   return 0;
 }
@@ -225,5 +231,5 @@ delete_memobjs(cl_mem *memobjs, int n)
 { 
   int i; 
   for (i=0; i<n; i++) 
-    clReleaseMemObject(memobjs[i]); 
+    CHECK_CL_ERROR (clReleaseMemObject(memobjs[i]));
 } 
diff --git a/examples/opencl-book-samples/CMakeLists.txt b/examples/opencl-book-samples/CMakeLists.txt
index 74c1303..ef1d999 100644
--- a/examples/opencl-book-samples/CMakeLists.txt
+++ b/examples/opencl-book-samples/CMakeLists.txt
@@ -28,17 +28,18 @@ set(TS_BASEDIR "${TESTSUITE_BASEDIR}/${TS_NAME}")
 set(TS_BUILDDIR "${TS_BASEDIR}/src/${TS_NAME}-build")
 set(TS_SRCDIR "${TS_BASEDIR}/src/${TS_NAME}")
 
-if(HAVE_SVN)
+if(HAVE_GIT)
 
-message(STATUS "Enabling testsuite ${TS_NAME}")
-set(ENABLED_TESTSUITES "${ENABLED_TESTSUITES};${TS_NAME}" PARENT_SCOPE)
+  message(STATUS "Enabling testsuite ${TS_NAME}")
+  list(APPEND ACTUALLY_ENABLED_TESTSUITES "${TS_NAME}")
+  set(ACTUALLY_ENABLED_TESTSUITES ${ACTUALLY_ENABLED_TESTSUITES} PARENT_SCOPE)
 
 ExternalProject_Add(
   ${TS_NAME}
   PREFIX "${TS_BASEDIR}"
   #DOWNLOAD_COMMAND "/bin/true"
-  SVN_REPOSITORY "http://opencl-book-samples.googlecode.com/svn/trunk"
-  PATCH_COMMAND pwd && echo Patching &&	sed -i "s/bool doCPU = false/bool doCPU = true/g" src/Chapter_16/Dijkstra/oclDijkstra.cpp &&
+  GIT_REPOSITORY "https://github.com/bgaster/opencl-book-samples.git"
+  PATCH_COMMAND pwd && echo Patching && sed -i "s/bool doCPU = false/bool doCPU = true/g" src/Chapter_16/Dijkstra/oclDijkstra.cpp &&
   sed -i "s/size_t localWorkSize = maxWorkGroupSize/size_t localWorkSize = 2/g" src/Chapter_16/Dijkstra/oclDijkstraKernel.cpp &&
   sed -i "s/device.j..type == CL_DEVICE_TYPE_GPU/device[j].type \\& CL_DEVICE_TYPE_GPU/g" src/Chapter_22/spmv.c &&
   sed -i "s/context.CL_DEVICE_TYPE_GPU/context\(CL_DEVICE_TYPE_CPU/g" src/Chapter_12/VectorAdd/vecadd.cpp &&
@@ -79,6 +80,7 @@ add_test(NAME "opencl_book_samples_SimpleBufferSubBuffer"
 add_test(NAME "opencl_book_samples_vecadd"
          COMMAND "${TS_BUILDDIR}/src/Chapter_12/VectorAdd/vecadd"
          WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_12/VectorAdd")
+# requires a GPU
 #add_test(NAME "opencl_book_samples_histogram"
 #         COMMAND "${TS_BUILDDIR}/src/Chapter_14/histogram/histogram"
 #         WORKING_DIRECTORY "${TS_SRCDIR}/src/Chapter_14/histogram")
@@ -92,6 +94,7 @@ add_test(NAME "opencl_book_samples_Dijkstra"
 
 
 
+
 set_tests_properties(
   opencl_book_samples_HelloWorld
   opencl_book_samples_OpenCLInfo
diff --git a/examples/piglit/CMakeLists.txt b/examples/piglit/CMakeLists.txt
index 5c5253f..ef67ca0 100644
--- a/examples/piglit/CMakeLists.txt
+++ b/examples/piglit/CMakeLists.txt
@@ -145,8 +145,16 @@ add_test(NAME piglit_cl_api_retain_release_command_queue
          COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-command-queue")
 add_test(NAME piglit_cl_api_retain_release_context
          COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-context")
-add_test(NAME piglit_cl_api_retain_release_event
-         COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-event")
+
+# disable retain-release event test.
+# has two issues:
+#   1) has a race condition (events sometime finish before
+#      it asks for status, sometimes after)
+#   2) it thinks CL_EVENT_REFERENCE_COUNT should be 1 after
+#      an API call clEnqueueXYZ(... , &event) - but pocl inits it to 2
+#      (one for pocl, one for user)
+#add_test(NAME piglit_cl_api_retain_release_event
+         #COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-event")
 add_test(NAME piglit_cl_api_retain_release_kernel
          COMMAND "${TS_BUILDDIR}/bin/cl-api-retain_release-kernel")
 add_test(NAME piglit_cl_api_retain_release_mem_object
@@ -171,10 +179,6 @@ add_test(NAME piglit_cl_program_bitcoin_phatk
          COMMAND "${TS_BUILDDIR}/bin/cl-program-bitcoin-phatk")
 add_test(NAME piglit_cl_program_max_work_item_sizes
          COMMAND "${TS_BUILDDIR}/bin/cl-program-max-work-item-sizes")
-add_test(NAME piglit_cl_program_tester
-         COMMAND "${TS_BUILDDIR}/bin/cl-program-tester")
-
-
 
 
 set_tests_properties(
@@ -214,7 +218,7 @@ set_tests_properties(
   piglit_cl_api_link_program
   piglit_cl_api_retain_release_command_queue
   piglit_cl_api_retain_release_context
-  piglit_cl_api_retain_release_event
+#  piglit_cl_api_retain_release_event
   piglit_cl_api_retain_release_kernel
   piglit_cl_api_retain_release_mem_object
   piglit_cl_api_retain_release_program
@@ -227,9 +231,16 @@ set_tests_properties(
   piglit_cl_custom_use_sub_buffer_in_kernel
   piglit_cl_program_bitcoin_phatk
   piglit_cl_program_max_work_item_sizes
-  piglit_cl_program_tester
 
   PROPERTIES
+    PASS_REGULAR_EXPRESSION "pass"
+    FAIL_REGULAR_EXPRESSION "fail"
     LABELS "piglit")
 
+set_tests_properties(
+  piglit_cl_api_create_program_with_binary
+
+  PROPERTIES
+    WILL_FAIL 1)
+
 endif()
diff --git a/examples/scalarwave/CMakeLists.txt b/examples/scalarwave/CMakeLists.txt
index 171d29e..5a7360e 100644
--- a/examples/scalarwave/CMakeLists.txt
+++ b/examples/scalarwave/CMakeLists.txt
@@ -61,7 +61,7 @@
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
 #scalarwave_CFLAGS = -std=c99 @OPENCL_CFLAGS@
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
+add_compile_options(${OPENCL_CFLAGS} -std=c99)
 
 if (MSVC)
   set_source_files_properties( scalarwave.c PROPERTIES LANGUAGE CXX )
@@ -78,5 +78,5 @@ set_tests_properties( "examples/scalarwave"
     COST 3.0
     PASS_REGULAR_EXPRESSION "Done.\n"
     PROCESSORS 1
-    LABELS "internal"
+    LABELS "internal;cuda"
     DEPENDS "pocl_version_check")
diff --git a/examples/scalarwave/scalarwave.c b/examples/scalarwave/scalarwave.c
index 361b1ac..c1bb758 100644
--- a/examples/scalarwave/scalarwave.c
+++ b/examples/scalarwave/scalarwave.c
@@ -26,8 +26,12 @@ typedef struct grid_t {
   cl_int ni, nj, nk;            // used size
 } grid_t;
 
+static int initialised = 0;
+static cl_context context;
+static cl_command_queue cmd_queue;
+static cl_program program;
+static cl_kernel kernel;
 
- 
 int 
 exec_scalarwave_kernel(char      const *const program_source, 
                        cl_double       *const phi,
@@ -35,11 +39,6 @@ exec_scalarwave_kernel(char      const *const program_source,
                        cl_double const *const phi_p_p,
                        grid_t    const *const grid)
 { 
-  static int initialised = 0;
-  static cl_context context;
-  static cl_command_queue cmd_queue;
-  static cl_program program;
-  static cl_kernel kernel;
   
   if (!initialised) {
     initialised = 1;
@@ -123,10 +122,6 @@ exec_scalarwave_kernel(char      const *const program_source,
   clReleaseMemObject(mem_phi_p);
   clReleaseMemObject(mem_phi_p_p);
   clReleaseMemObject(mem_grid);
-  /* clReleaseKernel(kernel); */
-  /* clReleaseProgram(program); */
-  /* clReleaseCommandQueue(cmd_queue); */
-  /* clReleaseContext(context); */
  
   return 0;
 }
@@ -165,8 +160,6 @@ main(void)
   
   fclose(source_file);
 
-
-
   grid_t grid;
   grid.dt = ALPHA/(NX-1);
   grid.dx = grid.dy = grid.dz = 1.0/(NX-1);
@@ -220,7 +213,15 @@ main(void)
     assert(!ierr);
     
   } // for n
-  
+
+  clReleaseKernel (kernel);
+  clReleaseProgram (program);
+  clReleaseCommandQueue (cmd_queue);
+  cl_platform_id pocl;
+  clGetPlatformIDs (1, &pocl, NULL);
+  clUnloadPlatformCompiler (pocl);
+  clReleaseContext (context);
+
   for (int i=0; i<NX; ++i) {
     int const j = i;
     int const k = i;
@@ -231,9 +232,13 @@ main(void)
     
     printf ("phi(%-8g,%-8g,%-8g) = %g\n", x,y,z, phi[ind3d]);
   }
-  
+
   printf ("Done.\n");
 
+  free (phi);
+  free (phi_p);
+  free (phi_p_p);
   free(source);
+
   return 0;
 }
diff --git a/examples/trig/CMakeLists.txt b/examples/trig/CMakeLists.txt
index 32d443d..413d642 100644
--- a/examples/trig/CMakeLists.txt
+++ b/examples/trig/CMakeLists.txt
@@ -40,9 +40,10 @@ target_link_libraries("trig" ${POCLU_LINK_OPTIONS})
 add_test("examples/trig" "trig")
 
 set_tests_properties(
+  "examples/trig"
   PROPERTIES
     COST 3.0
     PASS_REGULAR_EXPRESSION "OK\n"
     PROCESSORS 1
-    LABELS "OpenCL_Spec"
+    LABELS "internal;cuda"
     DEPENDS "pocl_version_check")
diff --git a/examples/trig/trig.c b/examples/trig/trig.c
index 5e4ca0c..8100661 100644
--- a/examples/trig/trig.c
+++ b/examples/trig/trig.c
@@ -105,5 +105,10 @@ main (void)
     }
 
   printf ("OK\n");
+
+  free (srcA);
+  free (dst);
+  free (dstS);
+
   return 0;
 }
diff --git a/include/CL/cl2.hpp b/include/CL/cl2.hpp
index ad0c7c4..92a1cc1 100644
--- a/include/CL/cl2.hpp
+++ b/include/CL/cl2.hpp
@@ -468,7 +468,7 @@
 #if defined(_MSC_VER)
 # define CL_HPP_DEFINE_STATIC_MEMBER_ __declspec(selectany)
 #else
-# define CL_HPP_DEFINE_STATIC_MEMBER_ __attribute__((weak))
+# define CL_HPP_DEFINE_STATIC_MEMBER_
 #endif // !_MSC_VER
 
 // Define deprecated prefixes and suffixes to ensure compilation
@@ -1536,6 +1536,8 @@ struct ReferenceHandler<cl_event>
 
 
 // Extracts version number with major in the upper 16 bits, minor in the lower 16
+
+#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
 static cl_uint getVersion(const vector<char> &versionInfo)
 {
     int highVersion = 0;
@@ -1555,7 +1557,6 @@ static cl_uint getVersion(const vector<char> &versionInfo)
     return (highVersion << 16) | lowVersion;
 }
 
-#if CL_HPP_TARGET_OPENCL_VERSION >= 120 && CL_HPP_MINIMUM_OPENCL_VERSION < 120
 static cl_uint getPlatformVersion(cl_platform_id platform)
 {
     size_type size = 0;
@@ -1783,7 +1784,7 @@ public:
 
     cl_type& operator ()() { return object_; }
 
-    const cl_type get() const { return object_; }
+    cl_type get() const { return object_; }
 
     cl_type get() { return object_; }
 
@@ -5808,6 +5809,10 @@ public:
 
     /*! \brief setArg overload taking a POD type
      */
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
     template <typename T>
     typename std::enable_if<!std::is_pointer<T>::value, cl_int>::type
         setArg(cl_uint index, const T &value)
@@ -5820,6 +5825,9 @@ public:
                 detail::KernelArgumentHandler<T>::ptr(value)),
             __SET_KERNEL_ARGS_ERR);
     }
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic pop
+#endif
 
     cl_int setArg(cl_uint index, size_type size, const void* argPtr)
     {
@@ -6105,6 +6113,11 @@ public:
      *   CL_INVALID_BINARY if an invalid program binary was encountered for any device. binaryStatus will return specific status for each device.
      *   CL_OUT_OF_HOST_MEMORY if there is a failure to allocate resources required by the OpenCL implementation on the host.
      */
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
     Program(
         const Context& context,
         const vector<Device>& devices,
@@ -6163,7 +6176,10 @@ public:
         }
     }
 
-    
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic pop
+#endif
+
 #if CL_HPP_TARGET_OPENCL_VERSION >= 120
     /**
      * Create program using builtin kernels.
@@ -6520,7 +6536,7 @@ inline cl_int cl::Program::getInfo(cl_program_info name, vector<vector<unsigned
 
         // Resize the parameter array and constituent arrays
         param->resize(numBinaries);
-        for (int i = 0; i < numBinaries; ++i) {
+        for (unsigned i = 0; i < numBinaries; ++i) {
             (*param)[i].resize(sizes[i]);
         }
 
diff --git a/include/CMakeLists.txt b/include/CMakeLists.txt
index e8462c5..ca77efe 100644
--- a/include/CMakeLists.txt
+++ b/include/CMakeLists.txt
@@ -25,7 +25,7 @@
 
 add_subdirectory("CL")
 
-set(PRIVATE_HEADERS _enable_all_exts.h _kernel.h _kernel_c.h _kernel_constants.h pocl_types.h pocl_device.h pocl.h)
+set(PRIVATE_HEADERS _enable_all_exts.h _builtin_renames.h _kernel.h _kernel_c.h _kernel_constants.h pocl_types.h pocl_device.h pocl.h)
 
 install(FILES ${PRIVATE_HEADERS}
         DESTINATION ${POCL_INSTALL_PRIVATE_HEADER_DIR})
diff --git a/include/_builtin_renames.h b/include/_builtin_renames.h
new file mode 100644
index 0000000..ec6c5c8
--- /dev/null
+++ b/include/_builtin_renames.h
@@ -0,0 +1,202 @@
+/* pocl/_kernel_renames.h - Rename OpenCL builtin functions to avoid name
+   clashes with libm functions which are called in implementation.
+
+   Copyright (c) 2011-2013 Erik Schnetter <eschnetter at perimeterinstitute.ca>
+                           Perimeter Institute for Theoretical Physics
+   Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifndef _KERNEL_RENAMES_H
+#define _KERNEL_RENAMES_H
+
+/* Move built-in declarations and libm functions out of the way.
+  (There should be a better way of doing so. These functions are
+  built-in math functions for OpenCL (see Clang's "Builtins.def").
+  Functions defined in libc or libm may also
+  interfere with OpenCL's functions, since their prototypes will be
+  wrong. */
+#define abs            _cl_abs
+#define abs_diff       _cl_abs_diff
+#define acos           _cl_acos
+#define acosh          _cl_acosh
+#define acospi         _cl_acospi
+#define add_sat        _cl_add_sat
+#define all            _cl_all
+#define any            _cl_any
+#define asin           _cl_asin
+#define asinh          _cl_asinh
+#define asinpi         _cl_asinpi
+#define atan           _cl_atan
+#define atan2          _cl_atan2
+#define atan2pi        _cl_atan2pi
+#define atanh          _cl_atanh
+#define atanpi         _cl_atanpi
+#define bitselect      _cl_bitselect
+#define cbrt           _cl_cbrt
+#define ceil           _cl_ceil
+#define clamp          _cl_clamp
+#define clz            _cl_clz
+#define copysign       _cl_copysign
+#define cos            _cl_cos
+#define cosh           _cl_cosh
+#define cospi          _cl_cospi
+#define cross          _cl_cross
+#define degrees        _cl_degrees
+#define distance       _cl_distance
+#define dot            _cl_dot
+#define erf            _cl_erf
+#define erfc           _cl_erfc
+#define exp            _cl_exp
+#define exp10          _cl_exp10
+#define exp2           _cl_exp2
+#define expm1          _cl_expm1
+#define fabs           _cl_fabs
+#define fast_distance  _cl_fast_distance
+#define fast_length    _cl_fast_length
+#define fast_normalize _cl_fast_normalize
+#define fdim           _cl_fdim
+#define floor          _cl_floor
+#define fma            _cl_fma
+#define fmax           _cl_fmax
+#define fmin           _cl_fmin
+#define fmod           _cl_fmod
+#define fract          _cl_fract
+#define frexp          _cl_frexp
+#define hadd           _cl_hadd
+#define half_cos       _cl_half_cos
+#define half_divide    _cl_half_divide
+#define half_exp       _cl_half_exp
+#define half_exp10     _cl_half_exp10
+#define half_exp2      _cl_half_exp2
+#define half_log       _cl_half_log
+#define half_log10     _cl_half_log10
+#define half_log2      _cl_half_log2
+#define half_powr      _cl_half_powr
+#define half_recip     _cl_half_recip
+#define half_rsqrt     _cl_half_rsqrt
+#define half_sin       _cl_half_sin
+#define half_sqrt      _cl_half_sqrt
+#define half_tan       _cl_half_tan
+#define hypot          _cl_hypot
+#define ilogb          _cl_ilogb
+#define isequal        _cl_isequal
+#define isfinite       _cl_isfinite
+#define isgreater      _cl_isgreater
+#define isgreaterequal _cl_isgreaterequal
+#define isinf          _cl_isinf
+#define isless         _cl_isless
+#define islessequal    _cl_islessequal
+#define islessgreater  _cl_islessgreater
+#define isnan          _cl_isnan
+#define isnormal       _cl_isnormal
+#define isnotequal     _cl_isnotequal
+#define isordered      _cl_isordered
+#define isunordered    _cl_isunordered
+#define ldexp          _cl_ldexp
+#define length         _cl_length
+#define lgamma         _cl_lgamma
+#define lgamma_r       _cl_lgamma_r
+#define log            _cl_log
+#define log10          _cl_log10
+#define log1p          _cl_log1p
+#define log2           _cl_log2
+#define logb           _cl_logb
+#define mad            _cl_mad
+#define mad24          _cl_mad24
+#define mad_hi         _cl_mad_hi
+#define mad_sat        _cl_mad_sat
+#define max            _cl_max
+#define maxmag         _cl_maxmag
+#define min            _cl_min
+#define minmag         _cl_minmag
+#define mix            _cl_mix
+#define modf           _cl_modf
+#define mul24          _cl_mul24
+#define mul_hi         _cl_mul_hi
+#define nan            _cl_nan
+#define native_cos     _cl_native_cos
+#define native_divide  _cl_native_divide
+#define native_exp     _cl_native_exp
+#define native_exp10   _cl_native_exp10
+#define native_exp2    _cl_native_exp2
+#define native_log     _cl_native_log
+#define native_log10   _cl_native_log10
+#define native_log2    _cl_native_log2
+#define native_powr    _cl_native_powr
+#define native_recip   _cl_native_recip
+#define native_rsqrt   _cl_native_rsqrt
+#define native_sin     _cl_native_sin
+#define native_sqrt    _cl_native_sqrt
+#define native_tan     _cl_native_tan
+#define nextafter      _cl_nextafter
+#define normalize      _cl_normalize
+#define popcount       _cl_popcount
+#define pow            _cl_pow
+#define pown           _cl_pown
+#define powr           _cl_powr
+#define radians        _cl_radians
+#define remainder      _cl_remainder
+#define remquo         _cl_remquo
+#define rhadd          _cl_rhadd
+#define rint           _cl_rint
+#define rootn          _cl_rootn
+#define rotate         _cl_rotate
+#define round          _cl_round
+#define rsqrt          _cl_rsqrt
+#define select         _cl_select
+#define sign           _cl_sign
+#define signbit        _cl_signbit
+#define sin            _cl_sin
+#define sincos         _cl_sincos
+#define sinh           _cl_sinh
+#define sinpi          _cl_sinpi
+#define smoothstep     _cl_smoothstep
+#define sqrt           _cl_sqrt
+#define step           _cl_step
+#define sub_sat        _cl_sub_sat
+#define tan            _cl_tan
+#define tanh           _cl_tanh
+#define tanpi          _cl_tanpi
+#define tgamma         _cl_tgamma
+#define trunc          _cl_trunc
+#define upsample       _cl_upsample
+
+// We provide our own printf
+// Note: We declare our printf as taking a constant format string, but
+// we implement it in C using a const format string (i.e. a format
+// string living in a different address space). This works only if all
+// address spaces are actually the same, e.g. on CPUs.
+int __cl_printf(__constant const char* format, ...);
+#define printf __cl_printf
+
+#define atom_add     atomic_add
+#define atom_sub     atomic_sub
+#define atom_xchg    atomic_xchg
+#define atom_inc     atomic_inc
+#define atom_dec     atomic_dec
+#define atom_cmpxchg atomic_cmpxchg
+#define atom_min     atomic_min
+#define atom_max     atomic_max
+#define atom_and     atomic_and
+#define atom_or      atomic_or
+#define atom_xor     atomic_xor
+
+#endif
diff --git a/include/_enable_all_exts.h b/include/_enable_all_exts.h
index 26d4b9a..2a0d7a4 100644
--- a/include/_enable_all_exts.h
+++ b/include/_enable_all_exts.h
@@ -48,3 +48,12 @@
 #ifdef cl_khr_int64_extended_atomics
 #  pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
 #endif
+
+#if (__clang_major__ > 4)
+
+#ifdef cl_khr_3d_image_writes
+#  pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
+#endif
+
+#endif
+
diff --git a/include/_kernel.h b/include/_kernel.h
index 940850a..a73c3b5 100644
--- a/include/_kernel.h
+++ b/include/_kernel.h
@@ -51,10 +51,6 @@
 
 #include "_enable_all_exts.h"
 
-/* Language feature detection */
-/* must come after _enable_all_exts.h b/c of pocl_types.h*/
-#include "_kernel_c.h"
-
 /* Enable double precision. This should really only be done when
    building the run-time library; when building application code, we
    should instead check a macro to see whether the application has
@@ -95,6 +91,56 @@
 #define __IF_EA64(x)
 #endif
 
+#include "_builtin_renames.h"
+
+/* 3.9 needs access qualifier */
+#if ((__clang_major__ < 4) && (__clang_minor__ < 9))
+
+#undef CLANG_HAS_IMAGE_AS
+#define IMG_WO_AQ
+#define IMG_RO_AQ
+#define IMG_RW_AQ
+
+#else
+
+#define CLANG_HAS_IMAGE_AS
+#define IMG_RO_AQ __read_only
+#define IMG_WO_AQ __write_only
+
+#if (__OPENCL_C_VERSION__ > 199)
+#define CLANG_HAS_RW_IMAGES
+#define IMG_RW_AQ __read_write
+#else
+#undef CLANG_HAS_RW_IMAGES
+#define IMG_RW_AQ __RW_IMAGES_UNSUPPORTED_BEFORE_CL_20
+#endif
+
+#endif
+
+/* Image types (implementation).
+ * Note: there is a duplicate definition in
+ * lib/CL/devices/dev_image.h - keep in sync?
+ */
+typedef int dev_sampler_t;
+
+typedef struct dev_image_t {
+  void* _data;
+  int _width;
+  int _height;
+  int _depth;
+  int _image_array_size;
+  int _row_pitch;
+  int _slice_pitch;
+  int _num_mip_levels; /* maybe not needed */
+  int _num_samples; /* maybe not needed */
+  int _order;
+  int _data_type;
+  int _num_channels;
+  int _elem_size;
+} dev_image_t;
+
+#include "_builtin_renames.h"
+
 /* A static assert statement to catch inconsistencies at build time */
 #if __has_extension(__c_static_assert__)
 #  define _CL_STATIC_ASSERT(_t, _x) _Static_assert(_x, #_t)
@@ -102,7 +148,133 @@
 #  define _CL_STATIC_ASSERT(_t, _x) typedef int __cl_ai##_t[(x) ? 1 : -1];
 #endif
 
-typedef uint cl_mem_fence_flags;
+#if (__clang_major__ >= 4)
+
+#ifndef _OPENCL_H_
+/* Use the declarations shipped with Clang. */
+/* Check for _OPENCL_H already here because the kernel compiler loads the
+   header beforehand, but cannot find the file due to include paths not
+   set up. */
+#include <opencl-c.h>
+
+#define __ovld __attribute__((overloadable))
+#define __conv __attribute__((convergent))
+
+// Optimizations
+#define __purefn __attribute__((pure))
+#define __cnfn __attribute__((const))
+
+/* Missing declarations from opencl-c.h. Some of the geometric builtins are
+   defined only up to 4 vectors, but we implement them all: */
+#ifdef cl_khr_fp16
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+half __ovld __cnfn length(half8 p);
+half __ovld __cnfn length(half16 p);
+
+half __ovld __cnfn fast_length(half8 p);
+half __ovld __cnfn fast_length(half16 p);
+
+half8 __ovld __cnfn normalize(half8 p);
+half16 __ovld __cnfn normalize(half16 p);
+
+half8 __ovld __cnfn fast_normalize(half8 p);
+half16 __ovld __cnfn fast_normalize(half16 p);
+
+half __ovld __cnfn dot(half8 p0, half8 p1);
+half __ovld __cnfn dot(half16 p0, half16 p1);
+#endif
+
+float __ovld __cnfn length(float8 p);
+float __ovld __cnfn length(float16 p);
+
+float __ovld __cnfn fast_length(float8 p);
+float __ovld __cnfn fast_length(float16 p);
+
+float8 __ovld __cnfn normalize(float8 p);
+float16 __ovld __cnfn normalize(float16 p);
+
+float8 __ovld __cnfn fast_normalize(float8 p);
+float16 __ovld __cnfn fast_normalize(float16 p);
+
+float __ovld __cnfn dot(float8 p0, float8 p1);
+float __ovld __cnfn dot(float16 p0, float16 p1);
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+double __ovld __cnfn length(double8 p);
+double __ovld __cnfn length(double16 p);
+
+double __ovld __cnfn fast_length(double p);
+double __ovld __cnfn fast_length(double2 p);
+double __ovld __cnfn fast_length(double3 p);
+double __ovld __cnfn fast_length(double4 p);
+double __ovld __cnfn fast_length(double8 p);
+double __ovld __cnfn fast_length(double16 p);
+
+double8 __ovld __cnfn normalize(double8 p);
+double16 __ovld __cnfn normalize(double16 p);
+
+double8 __ovld __cnfn fast_normalize(double8 p);
+double16 __ovld __cnfn fast_normalize(double16 p);
+
+double __ovld __cnfn dot(double8 p0, double8 p1);
+double __ovld __cnfn dot(double16 p0, double16 p1);
+
+#endif
+
+#undef __ovld
+#undef __conv
+#undef __purefn
+#undef __cnfn
+
+#include "_enable_all_exts.h"
+
+#endif
+
+/* GNU's libm seems to use INT_MIN here while the Clang's header uses
+   INT_MAX. Both are allowed by the OpenCL specs. */
+#undef FP_ILOGBNAN
+#define FP_ILOGBNAN INT_MIN
+
+/* Function/type attributes supported by Clang/SPIR */
+#if __has_attribute(__always_inline__)
+#  define _CL_ALWAYSINLINE __attribute__((__always_inline__))
+#else
+#  define _CL_ALWAYSINLINE
+#endif
+#if __has_attribute(__noinline__)
+#  define _CL_NOINLINE __attribute__((__noinline__))
+#else
+#  define _CL_NOINLINE
+#endif
+#if __has_attribute(__overloadable__)
+#  define _CL_OVERLOADABLE __attribute__((__overloadable__))
+#else
+#  define _CL_OVERLOADABLE
+#endif
+#if __has_attribute(__pure__)
+#  define _CL_READONLY __attribute__((__pure__))
+#else
+#  define _CL_READONLY
+#endif
+#if __has_attribute(__const__)
+#  define _CL_READNONE __attribute__((__const__))
+#else
+#  define _CL_READNONE
+#endif
+
+#else
+
+/* Language feature detection */
+/* must come after _enable_all_exts.h b/c of pocl_types.h*/
+#include "_kernel_c.h"
+
+/* required to be defined by the OpenCL standard, see:
+   https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/preprocessorDirectives.html */
+#define __kernel_exec(X, typen)                                               \
+  __kernel __attribute__ ((work_group_size_hint (X, 1, 1)))                   \
+      __attribute__ ((vec_type_hint (typen)))
 
 /* Ensure the data types have the right sizes */
 _CL_STATIC_ASSERT(char  , sizeof(char  ) == 1);
@@ -440,6 +612,12 @@ size_t _CL_OVERLOADABLE get_num_groups(uint);
 size_t _CL_OVERLOADABLE get_group_id(uint);
 size_t _CL_OVERLOADABLE get_global_offset(uint);
 
+#if (__OPENCL_C_VERSION__ < 121)
+void _CL_OVERLOADABLE read_mem_fence (cl_mem_fence_flags flags);
+void _CL_OVERLOADABLE write_mem_fence (cl_mem_fence_flags flags);
+void _CL_OVERLOADABLE mem_fence (cl_mem_fence_flags flags);
+#endif
+
 #if __has_attribute(__noduplicate__)
 void _CL_OVERLOADABLE __attribute__ ((__noduplicate__))
 barrier (cl_mem_fence_flags flags);
@@ -447,6 +625,8 @@ barrier (cl_mem_fence_flags flags);
 void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
 #endif
 
+/* clang's header defines these */
+#ifndef _OPENCL_H_
 
 /* Math Constants */
 
@@ -513,6 +693,7 @@ void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
 #define FLT_MAX        0x1.fffffep127f
 #define FLT_MIN        0x1.0p-126f
 #define FLT_EPSILON    0x1.0p-23f
+#define FLT_RADIX 2
 
 #define FP_ILOGB0   INT_MIN
 #define FP_ILOGBNAN INT_MIN
@@ -545,6 +726,7 @@ void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
 #define DBL_MAX        0x1.fffffffffffffp1023
 #define DBL_MIN        0x1.0p-1022
 #define DBL_EPSILON    0x1.0p-52
+#define DBL_RADIX 2
 
 #define M_E        2.71828182845904523536028747135
 #define M_LOG2E    1.44269504088896340735992468100
@@ -561,6 +743,7 @@ void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
 #define M_SQRT1_2  0.707106781186547524400844362105
 #endif
 
+#endif /* _OPENCL_H_ */
 

 /* Math Functions */
 
@@ -897,6 +1080,69 @@ void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
   double4  _CL_OVERLOADABLE NAME(double4 , __private int4    *);        \
   double8  _CL_OVERLOADABLE NAME(double8 , __private int8    *);        \
   double16 _CL_OVERLOADABLE NAME(double16, __private int16   *);)
+#define _CL_DECLARE_FUNC_V_VVPK(NAME)                                   \
+  __IF_FP16(                                                            \
+  half     _CL_OVERLOADABLE NAME(half    , half    , __global  int     *);        \
+  half2    _CL_OVERLOADABLE NAME(half2   , half2   , __global  int2    *);        \
+  half3    _CL_OVERLOADABLE NAME(half3   , half3   , __global  int3    *);        \
+  half4    _CL_OVERLOADABLE NAME(half4   , half4   , __global  int4    *);        \
+  half8    _CL_OVERLOADABLE NAME(half8   , half8   , __global  int8    *);        \
+  half16   _CL_OVERLOADABLE NAME(half16  , half16  , __global  int16   *);)       \
+  float    _CL_OVERLOADABLE NAME(float   , float   , __global  int     *);        \
+  float2   _CL_OVERLOADABLE NAME(float2  , float2  , __global  int2    *);        \
+  float3   _CL_OVERLOADABLE NAME(float3  , float3  , __global  int3    *);        \
+  float4   _CL_OVERLOADABLE NAME(float4  , float4  , __global  int4    *);        \
+  float8   _CL_OVERLOADABLE NAME(float8  , float8  , __global  int8    *);        \
+  float16  _CL_OVERLOADABLE NAME(float16 , float16 , __global  int16   *);        \
+  __IF_FP64(                                                            \
+  double   _CL_OVERLOADABLE NAME(double  , double  , __global  int     *);        \
+  double2  _CL_OVERLOADABLE NAME(double2 , double2 , __global  int2    *);        \
+  double3  _CL_OVERLOADABLE NAME(double3 , double3 , __global  int3    *);        \
+  double4  _CL_OVERLOADABLE NAME(double4 , double4 , __global  int4    *);        \
+  double8  _CL_OVERLOADABLE NAME(double8 , double8 , __global  int8    *);        \
+  double16 _CL_OVERLOADABLE NAME(double16, double16, __global  int16   *);)       \
+  __IF_FP16(                                                            \
+  half     _CL_OVERLOADABLE NAME(half    , half    , __local   int     *);        \
+  half2    _CL_OVERLOADABLE NAME(half2   , half2   , __local   int2    *);        \
+  half3    _CL_OVERLOADABLE NAME(half3   , half3   , __local   int3    *);        \
+  half4    _CL_OVERLOADABLE NAME(half4   , half4   , __local   int4    *);        \
+  half8    _CL_OVERLOADABLE NAME(half8   , half8   , __local   int8    *);        \
+  half16   _CL_OVERLOADABLE NAME(half16  , half16  , __local   int16   *);)       \
+  float    _CL_OVERLOADABLE NAME(float   , float   , __local   int     *);        \
+  float2   _CL_OVERLOADABLE NAME(float2  , float2  , __local   int2    *);        \
+  float3   _CL_OVERLOADABLE NAME(float3  , float3  , __local   int3    *);        \
+  float4   _CL_OVERLOADABLE NAME(float4  , float4  , __local   int4    *);        \
+  float8   _CL_OVERLOADABLE NAME(float8  , float8  , __local   int8    *);        \
+  float16  _CL_OVERLOADABLE NAME(float16 , float16 , __local   int16   *);        \
+  __IF_FP64(                                                            \
+  double   _CL_OVERLOADABLE NAME(double  , double  , __local   int     *);        \
+  double2  _CL_OVERLOADABLE NAME(double2 , double2 , __local   int2    *);        \
+  double3  _CL_OVERLOADABLE NAME(double3 , double3 , __local   int3    *);        \
+  double4  _CL_OVERLOADABLE NAME(double4 , double4 , __local   int4    *);        \
+  double8  _CL_OVERLOADABLE NAME(double8 , double8 , __local   int8    *);        \
+  double16 _CL_OVERLOADABLE NAME(double16, double16, __local   int16   *);)       \
+  __IF_FP16(                                                            \
+  half     _CL_OVERLOADABLE NAME(half    , half    , __private int     *);        \
+  half2    _CL_OVERLOADABLE NAME(half2   , half2   , __private int2    *);        \
+  half3    _CL_OVERLOADABLE NAME(half3   , half3   , __private int3    *);        \
+  half4    _CL_OVERLOADABLE NAME(half4   , half4   , __private int4    *);        \
+  half8    _CL_OVERLOADABLE NAME(half8   , half8   , __private int8    *);        \
+  half16   _CL_OVERLOADABLE NAME(half16  , half16  , __private int16   *);)       \
+  float    _CL_OVERLOADABLE NAME(float   , float   , __private int     *);        \
+  float2   _CL_OVERLOADABLE NAME(float2  , float2  , __private int2    *);        \
+  float3   _CL_OVERLOADABLE NAME(float3  , float3  , __private int3    *);        \
+  float4   _CL_OVERLOADABLE NAME(float4  , float4  , __private int4    *);        \
+  float8   _CL_OVERLOADABLE NAME(float8  , float8  , __private int8    *);        \
+  float16  _CL_OVERLOADABLE NAME(float16 , float16 , __private int16   *);        \
+  __IF_FP64(                                                            \
+  double   _CL_OVERLOADABLE NAME(double  , double  , __private int     *);        \
+  double2  _CL_OVERLOADABLE NAME(double2 , double2 , __private int2    *);        \
+  double3  _CL_OVERLOADABLE NAME(double3 , double3 , __private int3    *);        \
+  double4  _CL_OVERLOADABLE NAME(double4 , double4 , __private int4    *);        \
+  double8  _CL_OVERLOADABLE NAME(double8 , double8 , __private int8    *);        \
+  double16 _CL_OVERLOADABLE NAME(double16, double16, __private int16   *);)
+
+
 #define _CL_DECLARE_FUNC_V_VPV(NAME)                                    \
   __IF_FP16(                                                            \
   half     _CL_OVERLOADABLE NAME(half    , __global  half    *);        \
@@ -1075,159 +1321,6 @@ void _CL_OVERLOADABLE barrier (cl_mem_fence_flags flags);
   float8   _CL_OVERLOADABLE NAME(float8  , float8  );   \
   float16  _CL_OVERLOADABLE NAME(float16 , float16 );
 
-/* Move built-in declarations and libm functions out of the way.
-  (There should be a better way of doing so. These functions are
-  either built-in math functions for OpenCL (see Clang's
-  "Builtins.def"), although the either should not be, or should have
-  the correct prototype. Functions defined in libc or libm may also
-  interfere with OpenCL's functions, since their prototypes will be
-  wrong. */
-#define abs            _cl_abs
-#define abs_diff       _cl_abs_diff
-#define acos           _cl_acos
-#define acosh          _cl_acosh
-#define acospi         _cl_acospi
-#define add_sat        _cl_add_sat
-#define all            _cl_all
-#define any            _cl_any
-#define asin           _cl_asin
-#define asinh          _cl_asinh
-#define asinpi         _cl_asinpi
-#define atan           _cl_atan
-#define atan2          _cl_atan2
-#define atan2pi        _cl_atan2pi
-#define atanh          _cl_atanh
-#define atanpi         _cl_atanpi
-#define bitselect      _cl_bitselect
-#define cbrt           _cl_cbrt
-#define ceil           _cl_ceil
-#define clamp          _cl_clamp
-#define clz            _cl_clz
-#define copysign       _cl_copysign
-#define cos            _cl_cos
-#define cosh           _cl_cosh
-#define cospi          _cl_cospi
-#define cross          _cl_cross
-#define degrees        _cl_degrees
-#define distance       _cl_distance
-#define dot            _cl_dot
-#define erf            _cl_erf
-#define erfc           _cl_erfc
-#define exp            _cl_exp
-#define exp10          _cl_exp10
-#define exp2           _cl_exp2
-#define expm1          _cl_expm1
-#define fabs           _cl_fabs
-#define fast_distance  _cl_fast_distance
-#define fast_length    _cl_fast_length
-#define fast_normalize _cl_fast_normalize
-#define fdim           _cl_fdim
-#define floor          _cl_floor
-#define fma            _cl_fma
-#define fmax           _cl_fmax
-#define fmin           _cl_fmin
-#define fmod           _cl_fmod
-#define fract          _cl_fract
-#define frexp          _cl_frexp
-#define hadd           _cl_hadd
-#define half_cos       _cl_half_cos
-#define half_divide    _cl_half_divide
-#define half_exp       _cl_half_exp
-#define half_exp10     _cl_half_exp10
-#define half_exp2      _cl_half_exp2
-#define half_log       _cl_half_log
-#define half_log10     _cl_half_log10
-#define half_log2      _cl_half_log2
-#define half_powr      _cl_half_powr
-#define half_recip     _cl_half_recip
-#define half_rsqrt     _cl_half_rsqrt
-#define half_sin       _cl_half_sin
-#define half_sqrt      _cl_half_sqrt
-#define half_tan       _cl_half_tan
-#define hypot          _cl_hypot
-#define ilogb          _cl_ilogb
-#define isequal        _cl_isequal
-#define isfinite       _cl_isfinite
-#define isgreater      _cl_isgreater
-#define isgreaterequal _cl_isgreaterequal
-#define isinf          _cl_isinf
-#define isless         _cl_isless
-#define islessequal    _cl_islessequal
-#define islessgreater  _cl_islessgreater
-#define isnan          _cl_isnan
-#define isnormal       _cl_isnormal
-#define isnotequal     _cl_isnotequal
-#define isordered      _cl_isordered
-#define isunordered    _cl_isunordered
-#define ldexp          _cl_ldexp
-#define length         _cl_length
-#define lgamma         _cl_lgamma
-#define lgamma_r       _cl_lgamma_r
-#define log            _cl_log
-#define log10          _cl_log10
-#define log1p          _cl_log1p
-#define log2           _cl_log2
-#define logb           _cl_logb
-#define mad            _cl_mad
-#define mad24          _cl_mad24
-#define mad_hi         _cl_mad_hi
-#define mad_sat        _cl_mad_sat
-#define max            _cl_max
-#define maxmag         _cl_maxmag
-#define min            _cl_min
-#define minmag         _cl_minmag
-#define mix            _cl_mix
-#define modf           _cl_modf
-#define mul24          _cl_mul24
-#define mul_hi         _cl_mul_hi
-#define nan            _cl_nan
-#define native_cos     _cl_native_cos
-#define native_divide  _cl_native_divide
-#define native_exp     _cl_native_exp
-#define native_exp10   _cl_native_exp10
-#define native_exp2    _cl_native_exp2
-#define native_log     _cl_native_log
-#define native_log10   _cl_native_log10
-#define native_log2    _cl_native_log2
-#define native_powr    _cl_native_powr
-#define native_recip   _cl_native_recip
-#define native_rsqrt   _cl_native_rsqrt
-#define native_sin     _cl_native_sin
-#define native_sqrt    _cl_native_sqrt
-#define native_tan     _cl_native_tan
-#define nextafter      _cl_nextafter
-#define normalize      _cl_normalize
-#define popcount       _cl_popcount
-#define pow            _cl_pow
-#define pown           _cl_pown
-#define powr           _cl_powr
-#define radians        _cl_radians
-#define remainder      _cl_remainder
-#define remquo         _cl_remquo
-#define rhadd          _cl_rhadd
-#define rint           _cl_rint
-#define rootn          _cl_rootn
-#define rotate         _cl_rotate
-#define round          _cl_round
-#define rsqrt          _cl_rsqrt
-#define select         _cl_select
-#define sign           _cl_sign
-#define signbit        _cl_signbit
-#define sin            _cl_sin
-#define sincos         _cl_sincos
-#define sinh           _cl_sinh
-#define sinpi          _cl_sinpi
-#define smoothstep     _cl_smoothstep
-#define sqrt           _cl_sqrt
-#define step           _cl_step
-#define sub_sat        _cl_sub_sat
-#define tan            _cl_tan
-#define tanh           _cl_tanh
-#define tanpi          _cl_tanpi
-#define tgamma         _cl_tgamma
-#define trunc          _cl_trunc
-#define upsample       _cl_upsample
-
 _CL_DECLARE_FUNC_V_V(acos)
 _CL_DECLARE_FUNC_V_V(acosh)
 _CL_DECLARE_FUNC_V_V(acospi)
@@ -1279,7 +1372,7 @@ _CL_DECLARE_FUNC_K_V(ilogb)
 _CL_DECLARE_FUNC_V_VJ(ldexp)
 _CL_DECLARE_FUNC_V_VI(ldexp)
 _CL_DECLARE_FUNC_V_V(lgamma)
-// lgamma_r
+_CL_DECLARE_FUNC_V_VPK(lgamma_r)
 _CL_DECLARE_FUNC_V_V(log)
 _CL_DECLARE_FUNC_V_V(log2)
 _CL_DECLARE_FUNC_V_V(log10)
@@ -1296,7 +1389,7 @@ _CL_DECLARE_FUNC_V_VJ(pown)
 _CL_DECLARE_FUNC_V_VI(pown)
 _CL_DECLARE_FUNC_V_VV(powr)
 _CL_DECLARE_FUNC_V_VV(remainder)
-// remquo
+_CL_DECLARE_FUNC_V_VVPK(remquo)
 _CL_DECLARE_FUNC_V_V(rint)
 _CL_DECLARE_FUNC_V_VJ(rootn)
 _CL_DECLARE_FUNC_V_VI(rootn)
@@ -1312,7 +1405,6 @@ _CL_DECLARE_FUNC_V_V(tanh)
 _CL_DECLARE_FUNC_V_V(tanpi)
 _CL_DECLARE_FUNC_V_V(tgamma)
 _CL_DECLARE_FUNC_V_V(trunc)
-
 _CL_DECLARE_FUNC_F_F(half_cos)
 _CL_DECLARE_FUNC_F_FF(half_divide)
 _CL_DECLARE_FUNC_F_F(half_exp)
@@ -1342,6 +1434,9 @@ _CL_DECLARE_FUNC_V_V(_cl_native_sin)
 _CL_DECLARE_FUNC_V_V(_cl_native_sqrt)
 _CL_DECLARE_FUNC_V_V(_cl_native_tan)
 
+/* clang's header defines these */
+#ifndef _OPENCL_H_
+
 /* Integer Constants */
 
 #define CHAR_BIT  8
@@ -1364,6 +1459,7 @@ _CL_DECLARE_FUNC_V_V(_cl_native_tan)
 #define ULONG_MAX 0xffffffffffffffffUL
 #endif
 
+#endif
 
 /* Integer Functions */
 #define _CL_DECLARE_FUNC_G_G(NAME)              \
@@ -2059,8 +2155,6 @@ _CL_DECLARE_VSTORE(float , __private)
 __IF_FP64(
 _CL_DECLARE_VSTORE(double, __private))
 
-#ifdef cl_khr_fp16
-
 #define _CL_DECLARE_VLOAD_HALF(MOD)                                     \
   float   _CL_OVERLOADABLE vload_half   (size_t offset, const MOD half *p); \
   float2  _CL_OVERLOADABLE vload_half2  (size_t offset, const MOD half *p); \
@@ -2068,6 +2162,7 @@ _CL_DECLARE_VSTORE(double, __private))
   float4  _CL_OVERLOADABLE vload_half4  (size_t offset, const MOD half *p); \
   float8  _CL_OVERLOADABLE vload_half8  (size_t offset, const MOD half *p); \
   float16 _CL_OVERLOADABLE vload_half16 (size_t offset, const MOD half *p); \
+  float   _CL_OVERLOADABLE vloada_half  (size_t offset, const MOD half *p); \
   float2  _CL_OVERLOADABLE vloada_half2 (size_t offset, const MOD half *p); \
   float3  _CL_OVERLOADABLE vloada_half3 (size_t offset, const MOD half *p); \
   float4  _CL_OVERLOADABLE vloada_half4 (size_t offset, const MOD half *p); \
@@ -2087,11 +2182,26 @@ _CL_DECLARE_VLOAD_HALF(__private)
   void _CL_OVERLOADABLE vstore_half4##SUFFIX  (float4  data, size_t offset, MOD half *p); \
   void _CL_OVERLOADABLE vstore_half8##SUFFIX  (float8  data, size_t offset, MOD half *p); \
   void _CL_OVERLOADABLE vstore_half16##SUFFIX (float16 data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half##SUFFIX  (float   data, size_t offset, MOD half *p); \
   void _CL_OVERLOADABLE vstorea_half2##SUFFIX (float2  data, size_t offset, MOD half *p); \
   void _CL_OVERLOADABLE vstorea_half3##SUFFIX (float3  data, size_t offset, MOD half *p); \
   void _CL_OVERLOADABLE vstorea_half4##SUFFIX (float4  data, size_t offset, MOD half *p); \
   void _CL_OVERLOADABLE vstorea_half8##SUFFIX (float8  data, size_t offset, MOD half *p); \
-  void _CL_OVERLOADABLE vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p);
+  void _CL_OVERLOADABLE vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p); \
+  __IF_FP64(                                                                              \
+  void _CL_OVERLOADABLE vstore_half##SUFFIX   (double   data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstore_half2##SUFFIX  (double2  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstore_half3##SUFFIX  (double3  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstore_half4##SUFFIX  (double4  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstore_half8##SUFFIX  (double8  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstore_half16##SUFFIX (double16 data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half##SUFFIX  (double   data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half2##SUFFIX (double2  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half3##SUFFIX (double3  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half4##SUFFIX (double4  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half8##SUFFIX (double8  data, size_t offset, MOD half *p); \
+  void _CL_OVERLOADABLE vstorea_half16##SUFFIX(double16 data, size_t offset, MOD half *p); \
+  )
 
 _CL_DECLARE_VSTORE_HALF(__global  ,     )
 _CL_DECLARE_VSTORE_HALF(__global  , _rte)
@@ -2109,9 +2219,6 @@ _CL_DECLARE_VSTORE_HALF(__private , _rtz)
 _CL_DECLARE_VSTORE_HALF(__private , _rtp)
 _CL_DECLARE_VSTORE_HALF(__private , _rtn)
 
-#endif
-
-
 /* Atomic operations */
 
 #define _CL_DECLARE_ATOMICS(MOD, TYPE)                                  \
@@ -2154,19 +2261,6 @@ _CL_DECLARE_ATOMICS64(__global, ulong)
 _CL_DECLARE_ATOMICS64(__local , long )
 _CL_DECLARE_ATOMICS64(__local , ulong)
 
-#define atom_add     atomic_add
-#define atom_sub     atomic_sub
-#define atom_xchg    atomic_xchg
-#define atom_inc     atomic_inc
-#define atom_dec     atomic_dec
-#define atom_cmpxchg atomic_cmpxchg
-#define atom_min     atomic_min
-#define atom_max     atomic_max
-#define atom_and     atomic_and
-#define atom_or      atomic_or
-#define atom_xor     atomic_xor
-
-
 /* OpenCL 2.0 Atomics */
 
 #if ((__clang_major__ >= 4) || (__clang_major__ == 3) && (__clang_minor__ >= 7))
@@ -2295,13 +2389,9 @@ _CL_DECLARE_SHUFFLE_MN(ulong , ulong ))
 __IF_FP64(
 _CL_DECLARE_SHUFFLE_MN(double, ulong ))
 
-// We provide our own printf
-// Note: We declare our printf as taking a constant format string, but
-// we implement it in C using a const format string (i.e. a format
-// string living in a different address space). This works only if all
-// address spaces are actually the same, e.g. on CPUs.
-int __cl_printf(constant char* restrict format, ...);
-#define printf __cl_printf
+void _CL_OVERLOADABLE wait_group_events (int num_events, event_t *event_list);
+
+/***************************************************************************/
 
 /* Async Copies from Global to Local Memory, Local to
    Global Memory, and Prefetch */
@@ -2319,9 +2409,6 @@ int __cl_printf(constant char* restrict format, ...);
                                  size_t num_gentypes,           \
                                  event_t event);                \
                                                                 
-void wait_group_events (int num_events,                      
-                        event_t *event_list);                 
-
 #define _CL_DECLARE_ASYNC_COPY_FUNCS(GENTYPE)      \
   _CL_DECLARE_ASYNC_COPY_FUNCS_SINGLE(GENTYPE)     \
   _CL_DECLARE_ASYNC_COPY_FUNCS_SINGLE(GENTYPE##2)   \
@@ -2339,10 +2426,45 @@ _CL_DECLARE_ASYNC_COPY_FUNCS(uint);
 __IF_INT64(_CL_DECLARE_ASYNC_COPY_FUNCS(long));
 __IF_INT64(_CL_DECLARE_ASYNC_COPY_FUNCS(ulong));
 
-__IF_FP16(_CL_DECLARE_ASYNC_COPY_FUNCS_SINGLE(half));
+__IF_FP16 (_CL_DECLARE_ASYNC_COPY_FUNCS (half));
 _CL_DECLARE_ASYNC_COPY_FUNCS(float);
 __IF_FP64(_CL_DECLARE_ASYNC_COPY_FUNCS(double));
 
+/***************************************************************************/
+
+#define _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE(GENTYPE)                  \
+  _CL_OVERLOADABLE                                                            \
+  event_t async_work_group_strided_copy (                                     \
+      __local GENTYPE *dst, const __global GENTYPE *src, size_t num_gentypes, \
+      size_t src_stride, event_t event);                                      \
+                                                                              \
+  _CL_OVERLOADABLE                                                            \
+  event_t async_work_group_strided_copy (                                     \
+      __global GENTYPE *dst, const __local GENTYPE *src, size_t num_gentypes, \
+      size_t dst_stride, event_t event);
+
+#define _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS(GENTYPE)                         \
+  _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE)                       \
+  _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##2)                    \
+  _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##3)                    \
+  _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##4)                    \
+  _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##8)                    \
+  _CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##16)
+
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (char);
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (uchar);
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (short);
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (ushort);
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (int);
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (uint);
+__IF_INT64 (_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (long));
+__IF_INT64 (_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (ulong));
+
+__IF_FP16 (_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (half));
+_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (float);
+__IF_FP64 (_CL_DECLARE_ASYNC_STRIDED_COPY_FUNCS (double));
+
+/***************************************************************************/
 
 #define _CL_DECLARE_PREFETCH_FUNCS_SINGLE(GENTYPE) \
   _CL_OVERLOADABLE \
@@ -2368,110 +2490,460 @@ __IF_FP16(_CL_DECLARE_PREFETCH_FUNCS(half));
 _CL_DECLARE_PREFETCH_FUNCS(float);
 __IF_FP64(_CL_DECLARE_PREFETCH_FUNCS(double));
 
-/* read_imagef 2d functions*/
-float4 _CL_OVERLOADABLE read_imagef (image2d_t image, sampler_t sampler,
-                                     int2 coord);
-/* float coords not implemented yet
-float4 _CL_OVERLOADABLE read_imagef (image2d_t image, sampler_t sampler,
-                                     float2 coord);
-*/
 
-float4 _CL_OVERLOADABLE read_imagef (image2d_t image, int2 coord);
+/***************************************************************************/
 
-float4 _CL_OVERLOADABLE read_imagef (image2d_array_t image, int4 coord);
+/* NO sampler */
 
-float4 _CL_OVERLOADABLE read_imagef (image2d_array_t image, sampler_t sampler,
-                                     int4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_t image, int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_buffer_t image,
+                                     int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_array_t image, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image2d_t image, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image2d_array_t image, int4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image3d_t image, int4 coord);
 
-/*float coords not immplemented yet
-float4 _CL_OVERLOADABLE read_imagef (image2d_array_t image, sampler_t sampler,
-                                     float4 coord);
-*/
 
-/* read_imagef 3d functions*/
-float4 _CL_OVERLOADABLE read_imagef (image3d_t image, sampler_t sampler,
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_t image, int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_t image, int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_buffer_t image,
+                                     int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_buffer_t image,
+                                   int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_array_t image,
+                                     int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_array_t image,
+                                   int2 coord);
+
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image2d_t image, int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image2d_t image, int2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image2d_array_t image,
+                                     int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image2d_array_t image,
+                                   int4 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image3d_t image, int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image3d_t image, int4 coord);
+
+/* float4 img + float coords + sampler */
+
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_t image,
+                                     sampler_t sampler, float coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_buffer_t image,
+                                     sampler_t sampler, float coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_array_t image, sampler_t sampler,
+                                     float2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image2d_t image,
+                                     sampler_t sampler, float2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image2d_array_t image, sampler_t sampler,
+                                     float4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image3d_t image,
+                                     sampler_t sampler, float4 coord);
+
+/* float4 img + int coords + sampler */
+
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_t image,
+                                     sampler_t sampler, int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_buffer_t image,
+                                     sampler_t sampler, int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image1d_array_t image,
+                                     sampler_t sampler, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image2d_t image,
+                                     sampler_t sampler, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RO_AQ image2d_array_t image,
+                                     sampler_t sampler, int4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef ( IMG_RO_AQ image3d_t image, sampler_t sampler,
                                      int4 coord);
 
-/* read_imageui 2d functions*/
-uint4 _CL_OVERLOADABLE read_imageui (image2d_t image, sampler_t sampler,
+/* int4 img + float coords + sampler */
+
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_t image,
+                                     sampler_t sampler, float coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_t image,
+                                   sampler_t sampler, float coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_buffer_t image,
+                                     sampler_t sampler, float coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_buffer_t image,
+                                   sampler_t sampler, float coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_array_t image,
+                                     sampler_t sampler, float2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_array_t image,
+                                   sampler_t sampler, float2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image2d_t image,
+                                     sampler_t sampler, float2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image2d_t image,
+                                   sampler_t sampler, float2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image2d_array_t image,
+                                     sampler_t sampler, float4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image2d_array_t image,
+                                   sampler_t sampler, float4 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image3d_t image,
+                                     sampler_t sampler, float4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image3d_t image,
+                                   sampler_t sampler, float4 coord);
+
+/* int4 img + int coords + sampler */
+
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_t image,
+                                     sampler_t sampler, int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_t image,
+                                   sampler_t sampler, int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_buffer_t image,
+                                     sampler_t sampler, int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_buffer_t image,
+                                   sampler_t sampler, int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image1d_array_t image,
+                                     sampler_t sampler, int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image1d_array_t image,
+                                   sampler_t sampler, int2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image2d_t image,
+                                     sampler_t sampler, int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image2d_t image,
+                                   sampler_t sampler, int2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RO_AQ image2d_array_t image,
+                                     sampler_t sampler, int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image2d_array_t image,
+                                   sampler_t sampler, int4 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui ( IMG_RO_AQ image3d_t image, sampler_t sampler,
+                                     int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RO_AQ image3d_t image,
+                                   sampler_t sampler, int4 coord);
+
+/****************************************************************************/
+
+#ifdef CLANG_HAS_RW_IMAGES
+
+/* NO sampler */
+
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_t image, int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_buffer_t image,
+                                     int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_array_t image, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image2d_t image, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image2d_array_t image, int4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image3d_t image, int4 coord);
+
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_t image, int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_t image, int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_buffer_t image,
+                                     int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_buffer_t image,
+                                   int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_array_t image,
                                      int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_array_t image,
+                                   int2 coord);
 
-uint4 _CL_OVERLOADABLE read_imageui (image2d_t image, sampler_t sampler, 
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image2d_t image, int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image2d_t image, int2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image2d_array_t image,
+                                     int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image2d_array_t image,
+                                   int4 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image3d_t image, int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image3d_t image, int4 coord);
+
+/* float4 img + float coords + sampler */
+
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_t image,
+                                     sampler_t sampler, float coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_buffer_t image,
+                                     sampler_t sampler, float coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_array_t image, sampler_t sampler,
+                                     float2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image2d_t image,
+                                     sampler_t sampler, float2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image2d_array_t image, sampler_t sampler,
+                                     float4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image3d_t image,
+                                     sampler_t sampler, float4 coord);
+
+/* float4 img + int coords + sampler */
+
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_t image,
+                                     sampler_t sampler, int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_buffer_t image,
+                                     sampler_t sampler, int coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image1d_array_t image,
+                                     sampler_t sampler, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image2d_t image,
+                                     sampler_t sampler, int2 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (IMG_RW_AQ image2d_array_t image,
+                                     sampler_t sampler, int4 coord);
+float4 _CL_OVERLOADABLE _CL_READONLY read_imagef ( IMG_RW_AQ image3d_t image, sampler_t sampler,
                                      int4 coord);
 
-uint4 _CL_OVERLOADABLE read_imageui (image3d_t image, sampler_t sampler, 
+/* int4 img + float coords + sampler */
+
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_t image,
+                                     sampler_t sampler, float coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_t image,
+                                   sampler_t sampler, float coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_buffer_t image,
+                                     sampler_t sampler, float coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_buffer_t image,
+                                   sampler_t sampler, float coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_array_t image,
+                                     sampler_t sampler, float2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_array_t image,
+                                   sampler_t sampler, float2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image2d_t image,
+                                     sampler_t sampler, float2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image2d_t image,
+                                   sampler_t sampler, float2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image2d_array_t image,
+                                     sampler_t sampler, float4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image2d_array_t image,
+                                   sampler_t sampler, float4 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image3d_t image,
+                                     sampler_t sampler, float4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image3d_t image,
+                                   sampler_t sampler, float4 coord);
+
+/* int4 img + int coords + sampler */
+
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_t image,
+                                     sampler_t sampler, int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_t image,
+                                   sampler_t sampler, int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_buffer_t image,
+                                     sampler_t sampler, int coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_buffer_t image,
+                                   sampler_t sampler, int coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image1d_array_t image,
+                                     sampler_t sampler, int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image1d_array_t image,
+                                   sampler_t sampler, int2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image2d_t image,
+                                     sampler_t sampler, int2 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image2d_t image,
+                                   sampler_t sampler, int2 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui (IMG_RW_AQ image2d_array_t image,
+                                     sampler_t sampler, int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image2d_array_t image,
+                                   sampler_t sampler, int4 coord);
+uint4 _CL_OVERLOADABLE _CL_READONLY read_imageui ( IMG_RW_AQ image3d_t image, sampler_t sampler,
                                      int4 coord);
+int4 _CL_OVERLOADABLE _CL_READONLY read_imagei (IMG_RW_AQ image3d_t image,
+                                   sampler_t sampler, int4 coord);
 
-int4 _CL_OVERLOADABLE read_imagei (image2d_t image, sampler_t sampler, 
-                                   int2 coord);
+#endif
 
+/******************************************************************************************/
 
-void _CL_OVERLOADABLE write_imagei (IMG_WRITE_AQ image2d_t image, int2 coord, int4 color);
+void _CL_OVERLOADABLE write_imagef (IMG_WO_AQ image1d_t image, int coord,
+                                    float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_WO_AQ image1d_t image, int coord,
+                                    int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_WO_AQ image1d_t image, int coord,
+                                     uint4 color);
 
-void _CL_OVERLOADABLE write_imageui (IMG_WRITE_AQ image2d_t image, int2 coord, uint4 color);
+void _CL_OVERLOADABLE write_imagef (IMG_WO_AQ image1d_buffer_t image,
+                                    int coord, float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_WO_AQ image1d_buffer_t image,
+                                    int coord, int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_WO_AQ image1d_buffer_t image,
+                                     int coord, uint4 color);
+
+void _CL_OVERLOADABLE write_imagef (IMG_WO_AQ image1d_array_t image,
+                                    int2 coord, float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_WO_AQ image1d_array_t image,
+                                    int2 coord, int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_WO_AQ image1d_array_t image,
+                                     int2 coord, uint4 color);
+
+void _CL_OVERLOADABLE write_imagef (IMG_WO_AQ image2d_t image, int2 coord,
+                                    float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_WO_AQ image2d_t image, int2 coord,
+                                    int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_WO_AQ image2d_t image, int2 coord,
+                                     uint4 color);
+__IF_FP16 (void _CL_OVERLOADABLE write_imageh (IMG_WO_AQ image2d_t image,
+                                               int2 coord, half4 color));
+
+void _CL_OVERLOADABLE write_imagef (IMG_WO_AQ image2d_array_t image,
+                                    int4 coord, float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_WO_AQ image2d_array_t image,
+                                    int4 coord, int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_WO_AQ image2d_array_t image,
+                                     int4 coord, uint4 color);
+__IF_FP16 (void _CL_OVERLOADABLE write_imageh (IMG_WO_AQ image2d_array_t image,
+                                               int4 coord, half4 color));
+
+#ifdef cl_khr_3d_image_writes
+void _CL_OVERLOADABLE write_imagef (IMG_WO_AQ image3d_t image, int4 coord,
+                                    float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_WO_AQ image3d_t image, int4 coord,
+                                    int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_WO_AQ image3d_t image, int4 coord,
+                                     uint4 color);
+__IF_FP16 (void _CL_OVERLOADABLE write_imageh (IMG_WO_AQ image3d_t image,
+                                               int4 coord, half4 color));
+#endif
 
+/* UNIMPLEMENTED: 1d / 1d array */
 
+#ifdef CLANG_HAS_RW_IMAGES
 
-void _CL_OVERLOADABLE write_imagef (IMG_WRITE_AQ image2d_t image, int2 coord,
+void _CL_OVERLOADABLE write_imagef (IMG_RW_AQ image1d_t image, int coord,
                                     float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_RW_AQ image1d_t image, int coord,
+                                    int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_RW_AQ image1d_t image, int coord,
+                                     uint4 color);
 
-void _CL_OVERLOADABLE write_imagef (IMG_WRITE_AQ image3d_t image, int4 coord,
+void _CL_OVERLOADABLE write_imagef (IMG_RW_AQ image1d_buffer_t image,
+                                    int coord, float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_RW_AQ image1d_buffer_t image,
+                                    int coord, int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_RW_AQ image1d_buffer_t image,
+                                     int coord, uint4 color);
+
+void _CL_OVERLOADABLE write_imagef (IMG_RW_AQ image1d_array_t image,
+                                    int2 coord, float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_RW_AQ image1d_array_t image,
+                                    int2 coord, int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_RW_AQ image1d_array_t image,
+                                     int2 coord, uint4 color);
+
+void _CL_OVERLOADABLE write_imagef (IMG_RW_AQ image2d_t image, int2 coord,
                                     float4 color);
-
-/* not implemented 
-void _CL_OVERLOADABLE write_imagef (image2d_array_t image, int4 coord,
+void _CL_OVERLOADABLE write_imagei (IMG_RW_AQ image2d_t image, int2 coord,
+                                    int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_RW_AQ image2d_t image, int2 coord,
+                                     uint4 color);
+__IF_FP16 (void _CL_OVERLOADABLE write_imageh (IMG_RW_AQ image2d_t image,
+                                               int2 coord, half4 color));
+
+void _CL_OVERLOADABLE write_imagef (IMG_RW_AQ image2d_array_t image,
+                                    int4 coord, float4 color);
+void _CL_OVERLOADABLE write_imagei (IMG_RW_AQ image2d_array_t image,
+                                    int4 coord, int4 color);
+void _CL_OVERLOADABLE write_imageui (IMG_RW_AQ image2d_array_t image,
+                                     int4 coord, uint4 color);
+__IF_FP16 (void _CL_OVERLOADABLE write_imageh (IMG_RW_AQ image2d_array_t image,
+                                               int4 coord, half4 color));
+
+void _CL_OVERLOADABLE write_imagef (IMG_RW_AQ image3d_t image, int4 coord,
                                     float4 color);
-
-void _CL_OVERLOADABLE write_imagei (image2d_array_t image, int4 coord,
+void _CL_OVERLOADABLE write_imagei (IMG_RW_AQ image3d_t image, int4 coord,
                                     int4 color);
-
-void _CL_OVERLOADABLE write_imageui (image2d_array_t image, int4 coord,
+void _CL_OVERLOADABLE write_imageui (IMG_RW_AQ image3d_t image, int4 coord,
                                      uint4 color);
+__IF_FP16 (void _CL_OVERLOADABLE write_imageh (IMG_RW_AQ image3d_t image,
+                                               int4 coord, half4 color));
 
-void _CL_OVERLOADABLE write_imagef (image1d_t image, int coord,
-                                    float4 color);
+#endif
 
-void _CL_OVERLOADABLE write_imagei (image1d_t image, int coord,
-                                    int4 color);
 
-void _CL_OVERLOADABLE write_imageui (image1d_t image, int coord, 
-                                     uint4 color);
+/******************************************************************************************/
 
-void _CL_OVERLOADABLE write_imagef (image1d_buffer_t image, int coord, 
-                                    float4 color);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RO_AQ image1d_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RO_AQ image1d_array_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RO_AQ image2d_array_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RO_AQ image2d_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RO_AQ image3d_t);
 
-void _CL_OVERLOADABLE write_imagei (image1d_buffer_t image, int coord,
-                                     int4 color);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RO_AQ image1d_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RO_AQ image1d_array_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RO_AQ image2d_array_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RO_AQ image2d_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RO_AQ image3d_t);
 
-void _CL_OVERLOADABLE write_imageui (image1d_buffer_t image, int coord,
-                                     uint4 color);
+int _CL_OVERLOADABLE get_image_width (IMG_RO_AQ image1d_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RO_AQ image1d_array_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RO_AQ image2d_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RO_AQ image2d_array_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RO_AQ image3d_t image);
 
-void _CL_OVERLOADABLE write_imagef (image1d_array_t image, int2 coord,
-                                    float4 color);
+int _CL_OVERLOADABLE get_image_height (IMG_RO_AQ image1d_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RO_AQ image1d_array_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RO_AQ image2d_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RO_AQ image2d_array_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RO_AQ image3d_t image);
 
-void _CL_OVERLOADABLE write_imagei (image1d_array_t image, int2 coord,
-                                    int4 color);
+int _CL_OVERLOADABLE get_image_depth (IMG_RO_AQ image3d_t image);
 
-void _CL_OVERLOADABLE write_imageui (image1d_array_t image, int2 coord,
-                                     uint4 color);
+int2 _CL_OVERLOADABLE get_image_dim (IMG_RO_AQ image2d_t image);
+int2 _CL_OVERLOADABLE get_image_dim (IMG_RO_AQ image2d_array_t image);
+int4 _CL_OVERLOADABLE get_image_dim (IMG_RO_AQ image3d_t image);
 
-void _CL_OVERLOADABLE write_imageui (image3d_t image, int4 coord,
-                                     uint4 color);
-*/
-int _CL_OVERLOADABLE get_image_width (image1d_t image);
-int _CL_OVERLOADABLE get_image_width (image2d_t image);
-int _CL_OVERLOADABLE get_image_width (image3d_t image);
+size_t _CL_OVERLOADABLE get_image_array_size (IMG_RO_AQ image1d_array_t image);
+size_t _CL_OVERLOADABLE get_image_array_size (IMG_RO_AQ image2d_array_t image);
 
-int _CL_OVERLOADABLE get_image_height (image1d_t image);
-int _CL_OVERLOADABLE get_image_height (image2d_t image);
-int _CL_OVERLOADABLE get_image_height (image3d_t image);
+#ifdef CLANG_HAS_IMAGE_AS
 
-int _CL_OVERLOADABLE get_image_depth (image1d_t image);
-int _CL_OVERLOADABLE get_image_depth (image2d_t image);
-int _CL_OVERLOADABLE get_image_depth (image3d_t image);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_WO_AQ image1d_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_WO_AQ image1d_array_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_WO_AQ image2d_array_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_WO_AQ image2d_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_WO_AQ image3d_t);
 
-int2 _CL_OVERLOADABLE get_image_dim (image2d_t image);
-int2 _CL_OVERLOADABLE get_image_dim (image2d_array_t image);
-int4 _CL_OVERLOADABLE get_image_dim (image3d_t image);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_WO_AQ image1d_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_WO_AQ image1d_array_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_WO_AQ image2d_array_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_WO_AQ image2d_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_WO_AQ image3d_t);
+
+int _CL_OVERLOADABLE get_image_width (IMG_WO_AQ image1d_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_WO_AQ image1d_array_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_WO_AQ image2d_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_WO_AQ image2d_array_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_WO_AQ image3d_t image);
+
+int _CL_OVERLOADABLE get_image_height (IMG_WO_AQ image1d_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_WO_AQ image1d_array_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_WO_AQ image2d_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_WO_AQ image2d_array_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_WO_AQ image3d_t image);
+
+int _CL_OVERLOADABLE get_image_depth (IMG_WO_AQ image3d_t image);
+
+int2 _CL_OVERLOADABLE get_image_dim (IMG_WO_AQ image2d_t image);
+int2 _CL_OVERLOADABLE get_image_dim (IMG_WO_AQ image2d_array_t image);
+int4 _CL_OVERLOADABLE get_image_dim (IMG_WO_AQ image3d_t image);
+
+size_t _CL_OVERLOADABLE get_image_array_size (IMG_WO_AQ image1d_array_t image);
+size_t _CL_OVERLOADABLE get_image_array_size (IMG_WO_AQ image2d_array_t image);
+
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RW_AQ image1d_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RW_AQ image1d_array_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RW_AQ image2d_array_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RW_AQ image2d_t);
+int _CL_OVERLOADABLE get_image_channel_order (IMG_RW_AQ image3d_t);
+
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RW_AQ image1d_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RW_AQ image1d_array_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RW_AQ image2d_array_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RW_AQ image2d_t);
+int _CL_OVERLOADABLE get_image_channel_data_type (IMG_RW_AQ image3d_t);
+
+int _CL_OVERLOADABLE get_image_width (IMG_RW_AQ image1d_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RW_AQ image1d_array_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RW_AQ image2d_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RW_AQ image2d_array_t image);
+int _CL_OVERLOADABLE get_image_width (IMG_RW_AQ image3d_t image);
+
+int _CL_OVERLOADABLE get_image_height (IMG_RW_AQ image1d_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RW_AQ image1d_array_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RW_AQ image2d_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RW_AQ image2d_array_t image);
+int _CL_OVERLOADABLE get_image_height (IMG_RW_AQ image3d_t image);
+
+int _CL_OVERLOADABLE get_image_depth (IMG_RW_AQ image3d_t image);
+
+int2 _CL_OVERLOADABLE get_image_dim (IMG_RW_AQ image2d_t image);
+int2 _CL_OVERLOADABLE get_image_dim (IMG_RW_AQ image2d_array_t image);
+int4 _CL_OVERLOADABLE get_image_dim (IMG_RW_AQ image3d_t image);
+
+size_t _CL_OVERLOADABLE get_image_array_size (IMG_RW_AQ image1d_array_t image);
+size_t _CL_OVERLOADABLE get_image_array_size (IMG_RW_AQ image2d_array_t image);
+
+#endif
+
+#endif
 
 #pragma OPENCL EXTENSION all : disable
diff --git a/include/_kernel_c.h b/include/_kernel_c.h
index 4c2c993..47edde3 100644
--- a/include/_kernel_c.h
+++ b/include/_kernel_c.h
@@ -2,7 +2,7 @@
    functions declarations for kernel builtin implementations using C.
 
    Copyright (c) 2011 Universidad Rey Juan Carlos
-   Copyright (c) 2011-2013 Pekka Jääskeläinen / TUT
+   Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
    Copyright (c) 2011-2013 Erik Schnetter <eschnetter at perimeterinstitute.ca>
                            Perimeter Institute for Theoretical Physics
 
@@ -50,6 +50,11 @@
 # undef LLVM_4_0
 # define LLVM_4_0
 
+#elif (__clang_major__ == 5)
+
+# undef LLVM_5_0
+# define LLVM_5_0
+
 #else
 
 #error Unsupported Clang/LLVM version.
@@ -61,21 +66,29 @@
 # define LLVM_OLDER_THAN_3_8 1
 # define LLVM_OLDER_THAN_3_9 1
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #if (defined LLVM_3_7)
 # define LLVM_OLDER_THAN_3_8 1
 # define LLVM_OLDER_THAN_3_9 1
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #if (defined LLVM_3_8)
 # define LLVM_OLDER_THAN_3_9 1
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #if (defined LLVM_3_9)
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
+#endif
+
+#if (defined LLVM_4_0)
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #include "_kernel_constants.h"
@@ -96,9 +109,7 @@
 #else
 #  define _CL_OVERLOADABLE
 #endif
-#if (__clang_major__ == 3) && (__clang_minor__ >= 2)
-/* This causes an error with Clang 3.1: */
-/* #if __has_attribute(__const__) */
+#if __has_attribute(__const__)
 #  define _CL_READNONE __attribute__((__const__))
 #else
 #  define _CL_READNONE
@@ -157,13 +168,11 @@ typedef uint uint16 __attribute__((__ext_vector_type__(16)));
 typedef __fp16 half;
 #endif
 
-#ifdef cl_khr_fp16
 typedef half half2  __attribute__((__ext_vector_type__(2)));
 typedef half half3  __attribute__((__ext_vector_type__(3)));
 typedef half half4  __attribute__((__ext_vector_type__(4)));
 typedef half half8  __attribute__((__ext_vector_type__(8)));
 typedef half half16 __attribute__((__ext_vector_type__(16)));
-#endif
 
 typedef float float2  __attribute__((__ext_vector_type__(2)));
 typedef float float3  __attribute__((__ext_vector_type__(3)));
@@ -223,14 +232,6 @@ typedef struct _pocl_image2d_array_t { dev_image_t base; }* image2d_array_t;
 typedef struct _pocl_image1d_array_t { dev_image_t base; }* image1d_array_t;
 #endif
 
-// 3.9 needs access qualifier
-// TODO: rw images
-#ifdef CLANG_OLDER_THAN_3_9
-#define IMG_WRITE_AQ
-#else
-#define IMG_WRITE_AQ __write_only
-#endif
-
 #ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
 /*
  * During pocl kernel compiler transformations we use the fixed address
@@ -258,5 +259,8 @@ typedef struct _pocl_image1d_array_t { dev_image_t base; }* image1d_array_t;
 
 #endif
 
+typedef uint cl_mem_fence_flags;
+
+#include "_enable_all_exts.h"
 
 #endif
diff --git a/include/_kernel_constants.h b/include/_kernel_constants.h
index ab11ea9..e45ebdb 100644
--- a/include/_kernel_constants.h
+++ b/include/_kernel_constants.h
@@ -31,58 +31,63 @@
 #ifndef _KERNEL_CONSTANTS_H
 #define _KERNEL_CONSTANTS_H
 
+/* clang's header defines these */
+#ifndef _OPENCL_H_
+
 /* cl_channel_order */
-#define CL_R                                        0x10B0
-#define CL_A                                        0x10B1
-#define CL_RG                                       0x10B2
-#define CL_RA                                       0x10B3
-#define CL_RGB                                      0x10B4
-#define CL_RGBA                                     0x10B5
-#define CL_BGRA                                     0x10B6
-#define CL_ARGB                                     0x10B7
-#define CL_INTENSITY                                0x10B8
-#define CL_LUMINANCE                                0x10B9
-#define CL_Rx                                       0x10BA
-#define CL_RGx                                      0x10BB
-#define CL_RGBx                                     0x10BC
-#define CL_DEPTH                                    0x10BD
-#define CL_DEPTH_STENCIL                            0x10BE
+#define CLK_R                                        0x10B0
+#define CLK_A                                        0x10B1
+#define CLK_RG                                       0x10B2
+#define CLK_RA                                       0x10B3
+#define CLK_RGB                                      0x10B4
+#define CLK_RGBA                                     0x10B5
+#define CLK_BGRA                                     0x10B6
+#define CLK_ARGB                                     0x10B7
+#define CLK_INTENSITY                                0x10B8
+#define CLK_LUMINANCE                                0x10B9
+#define CLK_Rx                                       0x10BA
+#define CLK_RGx                                      0x10BB
+#define CLK_RGBx                                     0x10BC
+#define CLK_DEPTH                                    0x10BD
+#define CLK_DEPTH_STENCIL                            0x10BE
 
 /* cl_channel_type */
-#define CL_SNORM_INT8                               0x10D0
-#define CL_SNORM_INT16                              0x10D1
-#define CL_UNORM_INT8                               0x10D2
-#define CL_UNORM_INT16                              0x10D3
-#define CL_UNORM_SHORT_565                          0x10D4
-#define CL_UNORM_SHORT_555                          0x10D5
-#define CL_UNORM_INT_101010                         0x10D6
-#define CL_SIGNED_INT8                              0x10D7
-#define CL_SIGNED_INT16                             0x10D8
-#define CL_SIGNED_INT32                             0x10D9
-#define CL_UNSIGNED_INT8                            0x10DA
-#define CL_UNSIGNED_INT16                           0x10DB
-#define CL_UNSIGNED_INT32                           0x10DC
-#define CL_HALF_FLOAT                               0x10DD
-#define CL_FLOAT                                    0x10DE
-#define CL_UNORM_INT24                              0x10DF
+#define CLK_SNORM_INT8                               0x10D0
+#define CLK_SNORM_INT16                              0x10D1
+#define CLK_UNORM_INT8                               0x10D2
+#define CLK_UNORM_INT16                              0x10D3
+#define CLK_UNORM_SHORT_565                          0x10D4
+#define CLK_UNORM_SHORT_555                          0x10D5
+#define CLK_UNORM_INT_101010                         0x10D6
+#define CLK_SIGNED_INT8                              0x10D7
+#define CLK_SIGNED_INT16                             0x10D8
+#define CLK_SIGNED_INT32                             0x10D9
+#define CLK_UNSIGNED_INT8                            0x10DA
+#define CLK_UNSIGNED_INT16                           0x10DB
+#define CLK_UNSIGNED_INT32                           0x10DC
+#define CLK_HALF_FLOAT                               0x10DD
+#define CLK_FLOAT                                    0x10DE
+#define CLK_UNORM_INT24                              0x10DF
 
 /* cl_addressing _mode */
 #define CLK_ADDRESS_NONE                            0x00
-#define CLK_ADDRESS_MIRRORED_REPEAT                 0x01
-#define CLK_ADDRESS_REPEAT                          0x02
-#define CLK_ADDRESS_CLAMP_TO_EDGE                   0x03
+#define CLK_ADDRESS_CLAMP_TO_EDGE                   0x02
 #define CLK_ADDRESS_CLAMP                           0x04
+#define CLK_ADDRESS_REPEAT                          0x06
+#define CLK_ADDRESS_MIRRORED_REPEAT                 0x08
 
 /* cl_sampler_info */
 #define CLK_NORMALIZED_COORDS_FALSE                 0x00
-#define CLK_NORMALIZED_COORDS_TRUE                  0x08
+#define CLK_NORMALIZED_COORDS_TRUE                  0x01
 
 /* filter_mode */
-#define CLK_FILTER_NEAREST                          0x00
-#define CLK_FILTER_LINEAR                           0x10
+#define CLK_FILTER_NEAREST                          0x10
+#define CLK_FILTER_LINEAR                           0x20
 
 /* barrier() flags */
 #define CLK_LOCAL_MEM_FENCE                         0x01
 #define CLK_GLOBAL_MEM_FENCE                        0x02
 
 #endif
+
+#endif
diff --git a/include/pocl.h b/include/pocl.h
index 71f4fef..40b5da7 100644
--- a/include/pocl.h
+++ b/include/pocl.h
@@ -38,6 +38,9 @@
 #include "pocl_device.h"
 #include "config.h"
 
+/* detects restrict, variadic macros etc */
+#include "pocl_compiler_features.h"
+
 #define POCL_FILENAME_LENGTH 1024
 
 typedef struct _mem_mapping mem_mapping_t;
@@ -306,21 +309,29 @@ struct _cl_command_node
 # define LLVM_OLDER_THAN_3_8 1
 # define LLVM_OLDER_THAN_3_9 1
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #if (defined LLVM_3_7)
 # define LLVM_OLDER_THAN_3_8 1
 # define LLVM_OLDER_THAN_3_9 1
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #if (defined LLVM_3_8)
 # define LLVM_OLDER_THAN_3_9 1
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #if (defined LLVM_3_9)
 # define LLVM_OLDER_THAN_4_0 1
+# define LLVM_OLDER_THAN_5_0 1
+#endif
+
+#if (defined LLVM_4_0)
+# define LLVM_OLDER_THAN_5_0 1
 #endif
 
 #endif /* POCL_H */
diff --git a/include/pocl_cache.h b/include/pocl_cache.h
index 0ebc013..05397ef 100644
--- a/include/pocl_cache.h
+++ b/include/pocl_cache.h
@@ -39,7 +39,7 @@ extern "C" {
 #pragma GCC visibility push(hidden)
 #endif
 
-void pocl_cache_init_topdir();
+int pocl_cache_init_topdir ();
 
 int
 pocl_cache_create_program_cachedir(cl_program program, unsigned device_i,
@@ -61,7 +61,9 @@ void pocl_cache_release_lock(void* lock);
 int pocl_cl_device_to_index(cl_program   program,
                             cl_device_id device);
 
-void pocl_cache_mk_temp_name(char* path);
+void pocl_cache_tempname (char *path_template, const char *suffix, int *fd);
+
+int pocl_cache_create_tempdir(char* path);
 
 int pocl_cache_write_program_source(char *program_cl_path,
                                     cl_program program);
diff --git a/include/pocl_compiler_features.h b/include/pocl_compiler_features.h
new file mode 100644
index 0000000..05df68b
--- /dev/null
+++ b/include/pocl_compiler_features.h
@@ -0,0 +1,222 @@
+
+/* autogenerated by CMake, but edited by hand to not
+ * stop with #error when the compiler isn't gcc/clang */
+
+#ifndef POCL_COMPILER_DETECTION_H
+#define POCL_COMPILER_DETECTION_H
+
+#ifndef __cplusplus
+# define POCL_COMPILER_IS_Intel 0
+# define POCL_COMPILER_IS_PathScale 0
+# define POCL_COMPILER_IS_Embarcadero 0
+# define POCL_COMPILER_IS_Borland 0
+# define POCL_COMPILER_IS_Watcom 0
+# define POCL_COMPILER_IS_OpenWatcom 0
+# define POCL_COMPILER_IS_SunPro 0
+# define POCL_COMPILER_IS_HP 0
+# define POCL_COMPILER_IS_Compaq 0
+# define POCL_COMPILER_IS_zOS 0
+# define POCL_COMPILER_IS_XL 0
+# define POCL_COMPILER_IS_VisualAge 0
+# define POCL_COMPILER_IS_PGI 0
+# define POCL_COMPILER_IS_Cray 0
+# define POCL_COMPILER_IS_TI 0
+# define POCL_COMPILER_IS_Fujitsu 0
+# define POCL_COMPILER_IS_TinyCC 0
+# define POCL_COMPILER_IS_Bruce 0
+# define POCL_COMPILER_IS_SCO 0
+# define POCL_COMPILER_IS_AppleClang 0
+# define POCL_COMPILER_IS_Clang 0
+# define POCL_COMPILER_IS_GNU 0
+# define POCL_COMPILER_IS_MSVC 0
+# define POCL_COMPILER_IS_ADSP 0
+# define POCL_COMPILER_IS_IAR 0
+# define POCL_COMPILER_IS_ARMCC 0
+# define POCL_COMPILER_IS_SDCC 0
+# define POCL_COMPILER_IS_MIPSpro 0
+
+#if defined(__INTEL_COMPILER) || defined(__ICC)
+# undef POCL_COMPILER_IS_Intel
+# define POCL_COMPILER_IS_Intel 1
+
+#elif defined(__PATHCC__)
+# undef POCL_COMPILER_IS_PathScale
+# define POCL_COMPILER_IS_PathScale 1
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# undef POCL_COMPILER_IS_Embarcadero
+# define POCL_COMPILER_IS_Embarcadero 1
+
+#elif defined(__BORLANDC__)
+# undef POCL_COMPILER_IS_Borland
+# define POCL_COMPILER_IS_Borland 1
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# undef POCL_COMPILER_IS_Watcom
+# define POCL_COMPILER_IS_Watcom 1
+
+#elif defined(__WATCOMC__)
+# undef POCL_COMPILER_IS_OpenWatcom
+# define POCL_COMPILER_IS_OpenWatcom 1
+
+#elif defined(__SUNPRO_C)
+# undef POCL_COMPILER_IS_SunPro
+# define POCL_COMPILER_IS_SunPro 1
+
+#elif defined(__HP_cc)
+# undef POCL_COMPILER_IS_HP
+# define POCL_COMPILER_IS_HP 1
+
+#elif defined(__DECC)
+# undef POCL_COMPILER_IS_Compaq
+# define POCL_COMPILER_IS_Compaq 1
+
+#elif defined(__IBMC__) && defined(__COMPILER_VER__)
+# undef POCL_COMPILER_IS_zOS
+# define POCL_COMPILER_IS_zOS 1
+
+#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
+# undef POCL_COMPILER_IS_XL
+# define POCL_COMPILER_IS_XL 1
+
+#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
+# undef POCL_COMPILER_IS_VisualAge
+# define POCL_COMPILER_IS_VisualAge 1
+
+#elif defined(__PGI)
+# undef POCL_COMPILER_IS_PGI
+# define POCL_COMPILER_IS_PGI 1
+
+#elif defined(_CRAYC)
+# undef POCL_COMPILER_IS_Cray
+# define POCL_COMPILER_IS_Cray 1
+
+#elif defined(__TI_COMPILER_VERSION__)
+# undef POCL_COMPILER_IS_TI
+# define POCL_COMPILER_IS_TI 1
+
+#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version)
+# undef POCL_COMPILER_IS_Fujitsu
+# define POCL_COMPILER_IS_Fujitsu 1
+
+#elif defined(__TINYC__)
+# undef POCL_COMPILER_IS_TinyCC
+# define POCL_COMPILER_IS_TinyCC 1
+
+#elif defined(__BCC__)
+# undef POCL_COMPILER_IS_Bruce
+# define POCL_COMPILER_IS_Bruce 1
+
+#elif defined(__SCO_VERSION__)
+# undef POCL_COMPILER_IS_SCO
+# define POCL_COMPILER_IS_SCO 1
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# undef POCL_COMPILER_IS_AppleClang
+# define POCL_COMPILER_IS_AppleClang 1
+
+#elif defined(__clang__)
+# undef POCL_COMPILER_IS_Clang
+# define POCL_COMPILER_IS_Clang 1
+
+#elif defined(__GNUC__)
+# undef POCL_COMPILER_IS_GNU
+# define POCL_COMPILER_IS_GNU 1
+
+#elif defined(_MSC_VER)
+# undef POCL_COMPILER_IS_MSVC
+# define POCL_COMPILER_IS_MSVC 1
+
+#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
+# undef POCL_COMPILER_IS_ADSP
+# define POCL_COMPILER_IS_ADSP 1
+
+#elif defined(__IAR_SYSTEMS_ICC__ ) || defined(__IAR_SYSTEMS_ICC)
+# undef POCL_COMPILER_IS_IAR
+# define POCL_COMPILER_IS_IAR 1
+
+#elif defined(__ARMCC_VERSION)
+# undef POCL_COMPILER_IS_ARMCC
+# define POCL_COMPILER_IS_ARMCC 1
+
+#elif defined(SDCC)
+# undef POCL_COMPILER_IS_SDCC
+# define POCL_COMPILER_IS_SDCC 1
+
+#elif defined(_SGI_COMPILER_VERSION) || defined(_COMPILER_VERSION)
+# undef POCL_COMPILER_IS_MIPSpro
+# define POCL_COMPILER_IS_MIPSpro 1
+
+
+#endif
+
+#  if POCL_COMPILER_IS_GNU
+
+#    if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404
+#      define POCL_COMPILER_C_FUNCTION_PROTOTYPES 1
+#    else
+#      define POCL_COMPILER_C_FUNCTION_PROTOTYPES 0
+#    endif
+
+#    if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#      define POCL_COMPILER_C_RESTRICT 1
+#    else
+#      define POCL_COMPILER_C_RESTRICT 0
+#    endif
+
+#    if (__GNUC__ * 100 + __GNUC_MINOR__) >= 406 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201000L
+#      define POCL_COMPILER_C_STATIC_ASSERT 1
+#    else
+#      define POCL_COMPILER_C_STATIC_ASSERT 0
+#    endif
+
+#    if (__GNUC__ * 100 + __GNUC_MINOR__) >= 404 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#      define POCL_COMPILER_C_VARIADIC_MACROS 1
+#    else
+#      define POCL_COMPILER_C_VARIADIC_MACROS 0
+#    endif
+
+#  elif POCL_COMPILER_IS_Clang
+
+#    if ((__clang_major__ * 100) + __clang_minor__) >= 304
+#      define POCL_COMPILER_C_FUNCTION_PROTOTYPES 1
+#    else
+#      define POCL_COMPILER_C_FUNCTION_PROTOTYPES 0
+#    endif
+
+#    if ((__clang_major__ * 100) + __clang_minor__) >= 304 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#      define POCL_COMPILER_C_RESTRICT 1
+#    else
+#      define POCL_COMPILER_C_RESTRICT 0
+#    endif
+
+#    if ((__clang_major__ * 100) + __clang_minor__) >= 304 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
+#      define POCL_COMPILER_C_STATIC_ASSERT 1
+#    else
+#      define POCL_COMPILER_C_STATIC_ASSERT 0
+#    endif
+
+#    if ((__clang_major__ * 100) + __clang_minor__) >= 304 && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#      define POCL_COMPILER_C_VARIADIC_MACROS 1
+#    else
+#      define POCL_COMPILER_C_VARIADIC_MACROS 0
+#    endif
+
+#  else
+
+#      define POCL_COMPILER_C_FUNCTION_PROTOTYPES 0
+#      define POCL_COMPILER_C_RESTRICT 0
+#      define POCL_COMPILER_C_STATIC_ASSERT 0
+#      define POCL_COMPILER_C_VARIADIC_MACROS 0
+
+#  endif
+
+#  if POCL_COMPILER_C_RESTRICT
+#    define POCL_RESTRICT restrict
+#  else
+#    define POCL_RESTRICT
+#  endif
+
+#endif
+
+#endif
diff --git a/include/pocl_file_util.h b/include/pocl_file_util.h
index d9b523b..4a9953b 100644
--- a/include/pocl_file_util.h
+++ b/include/pocl_file_util.h
@@ -52,6 +52,8 @@ int pocl_mkdir_p(const char* path);
 /* Remove a file or empty directory */
 int pocl_remove(const char* path);
 
+int pocl_rename(const char *oldpath, const char *newpath);
+
 int pocl_exists(const char* path);
 
 int pocl_filesize(const char* path, uint64_t* res);
diff --git a/include/pocl_types.h b/include/pocl_types.h
index da0ce9a..57f2aa6 100644
--- a/include/pocl_types.h
+++ b/include/pocl_types.h
@@ -39,11 +39,13 @@ typedef struct error_undefined_type_ulong error_undefined_type_ulong;
 #  define ulong error_undefined_type_ulong
 #endif
 
+#ifdef __CBUILD__
 #ifndef cl_khr_fp16
-typedef struct error_undefined_type_half error_undefined_type_half;
-#  define half error_undefined_type_half
+typedef short half;
+#endif
 #endif
 
+
 #ifndef cl_khr_fp64
 typedef struct error_undefined_type_double error_undefined_type_double;
 #  define double error_undefined_type_double
@@ -68,26 +70,3 @@ typedef ptrdiff_t intptr_t;
 typedef size_t uintptr_t;
 
 #endif
-
-/* Image types.
- * Note: there is a duplicate definition in
- * lib/CL/devices/dev_image.h - keep in sync?
- */
-typedef int dev_sampler_t;
-
-typedef struct dev_image_t {
-  void* _data;
-  int _width;
-  int _height;
-  int _depth;
-  int _image_array_size;
-  int _row_pitch;
-  int _slice_pitch;
-  int _num_mip_levels; /* maybe not needed */
-  int _num_samples; /* maybe not needed */
-  int _order;
-  int _data_type;
-  int _num_channels;
-  int _elem_size;
-} dev_image_t;
-
diff --git a/lib/CL/CMakeLists.txt b/lib/CL/CMakeLists.txt
index ac11251..f8adc3f 100644
--- a/lib/CL/CMakeLists.txt
+++ b/lib/CL/CMakeLists.txt
@@ -23,11 +23,10 @@
 #
 #=============================================================================
 
-if(ENABLE_ASAN)
-  add_compile_options(-fsanitize=address -fsanitize=leak -fsanitize=undefined -fno-omit-frame-pointer)
+if(SANITIZER_OPTIONS)
+  add_compile_options(${SANITIZER_OPTIONS})
 endif()
 
-
 include_directories(BEFORE "../../fix-include/OpenCL")
 include_directories(AFTER "devices" ".")
 
@@ -51,15 +50,20 @@ set(POCL_LIB_SOURCES  "clCreateContextFromType.c"
                    "clEnqueueMapBuffer.c"
                    "clEnqueueUnmapMemObject.c"
                    "clEnqueueMarkerWithWaitList.c"
+                   "clEnqueueBarrierWithWaitList.c"
                    "clReleaseMemObject.c"
                    "clRetainMemObject.c"
                    "clGetMemObjectInfo.c"
                    "clSetMemObjectDestructorCallback.c"
                    "clCreateProgramWithSource.c"
                    "clCreateProgramWithBinary.c"
+                   "clCreateProgramWithBuiltInKernels.c"
                    "clReleaseProgram.c"
                    "clRetainProgram.c"
                    "clBuildProgram.c"
+                   "pocl_build.c"
+                   "clCompileProgram.c"
+                   "clLinkProgram.c"
                    "clCreateKernel.c"
                    "clReleaseKernel.c"
                    "clRetainKernel.c"
@@ -115,13 +119,13 @@ set(POCL_LIB_SOURCES  "clCreateContextFromType.c"
                    "clUnloadCompiler.c"
                    "clGetSupportedImageFormats.c"
                    "clGetExtensionFunctionAddress.c"
+                   "clGetExtensionFunctionAddressForPlatform.c"
                    "clIcdGetPlatformIDsKHR.c"
                    "clReleaseDevice.c"
                    "clRetainDevice.c"
                    "clCreateSubDevices.c"
 				           "clUnloadPlatformCompiler.c"
                    "pocl_cl.h" "pocl_util.h" "pocl_util.c"
-                   "pocl_queue_util.h" "pocl_queue_util.c"
                    "pocl_image_util.c" "pocl_image_util.h"
                    "pocl_img_buf_cpy.c"
                    "pocl_icd.h" "pocl_llvm.h"
@@ -134,7 +138,7 @@ set(POCL_LIB_SOURCES  "clCreateContextFromType.c"
                    "clEnqueueSVMMap.c" "clEnqueueSVMUnmap.c"
                    "clEnqueueSVMMemcpy.c" "clEnqueueSVMMemFill.c"
                    "clSetKernelArgSVMPointer.c" "clSetKernelExecInfo.c"
-                   "pocl_binary.c")
+                   "pocl_binary.c" "pocl_opengl.c")
 
 set(LIBPOCL_OBJS "$<TARGET_OBJECTS:libpocl_unlinked_objs>"
                  "$<TARGET_OBJECTS:pocl_cache>"
@@ -144,8 +148,9 @@ add_library("pocl_cache" OBJECT "pocl_cache.c")
 
 if (OCS_AVAILABLE)
   include_directories(${LLVM_INCLUDE_DIRS})
-  list(APPEND POCL_LIB_SOURCES "pocl_llvm_api.cc")
-  set_source_files_properties("pocl_llvm_api.cc" PROPERTIES COMPILE_FLAGS "${LLVM_CXXFLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/../llvmopencl\"")
+  set(LLVM_API_SOURCES "pocl_llvm_build.cc" "pocl_llvm_metadata.cc" "pocl_llvm_utils.cc" "pocl_llvm_wg.cc")
+  list(APPEND POCL_LIB_SOURCES ${LLVM_API_SOURCES})
+  set_source_files_properties(${LLVM_API_SOURCES} PROPERTIES COMPILE_FLAGS "${LLVM_CXXFLAGS} -I\"${CMAKE_CURRENT_SOURCE_DIR}/../llvmopencl\"")
   list(APPEND LIBPOCL_OBJS "$<TARGET_OBJECTS:llvmpasses>")
   # pocl_cache.c depends on a SHA1 hash of all built kernel-<target>.bc
   add_dependencies("pocl_cache" "kernellib_hash")
@@ -176,10 +181,12 @@ else()
 endif()
 
 set(POCL_PRIVATE_LINK_LIST ${CLANG_LIBFILES} ${POCL_LLVM_LIBS} ${LLVM_SYSLIBS})
-if(ENABLE_ASAN)
-  list(APPEND POCL_PRIVATE_LINK_LIST "asan" "ubsan")
+
+if(SANITIZER_OPTIONS)
+  list(APPEND POCL_PRIVATE_LINK_LIST ${SANITIZER_LIBS})
 endif()
 
+
 # see lib/CMakeLists.txt
 set(POCL_TRANSITIVE_LIBS ${POCL_PRIVATE_LINK_LIST} PARENT_SCOPE)
 
diff --git a/lib/CL/clBuildProgram.c b/lib/CL/clBuildProgram.c
index 1d9de05..343ab95 100644
--- a/lib/CL/clBuildProgram.c
+++ b/lib/CL/clBuildProgram.c
@@ -1,18 +1,17 @@
 /* OpenCL runtime library: clBuildProgram()
 
-   Copyright (c) 2011-2013 Universidad Rey Juan Carlos,
-                 2011-2014 Pekka Jääskeläinen / Tampere Univ. of Technology
-   
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,558 +22,21 @@
 */
 
 #include "pocl_cl.h"
-#include "pocl.h"
-#include <assert.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#ifndef _MSC_VER
-#  include <unistd.h>
-#else
-#  include "vccompat.hpp"
-#endif
-#ifdef OCS_AVAILABLE
-#include "pocl_llvm.h"
-#endif
-#include "pocl_util.h"
-#include "pocl_file_util.h"
-#include "pocl_cache.h"
-#include "config.h"
-#include "pocl_runtime_config.h"
-#include "pocl_binary.h"
 #include "pocl_shared.h"
 
-/* supported compiler parameters which should pass to the frontend directly
-   by using -Xclang */
-static const char cl_parameters[] =
-  "-cl-single-precision-constant "
-  "-cl-fp32-correctly-rounded-divide-sqrt "
-  "-cl-opt-disable "
-  "-cl-mad-enable "
-  "-cl-unsafe-math-optimizations "
-  "-cl-finite-math-only "
-  "-cl-fast-relaxed-math "
-  "-cl-std=CL1.2 "
-  "-cl-std=CL1.1 "
-  "-cl-std=CL2.0 "
-  "-cl-kernel-arg-info "
-  "-w "
-  "-g "
-  "-Werror ";
-
-static const char cl_parameters_supported_after_clang_3_9[] =
-  "-cl-strict-aliasing " /* deprecated after OCL1.0 */
-  "-cl-denorms-are-zero "
-  "-cl-no-signed-zeros ";
-
-static const char cl_parameters_not_yet_supported_by_clang[] =
-  "-cl-uniform-work-group-size ";
-
-#define MEM_ASSERT(x, err_jmp) do{ if (x){errcode = CL_OUT_OF_HOST_MEMORY;goto err_jmp;}} while(0)
-
-// append token, growing modded_options, if necessary, by max(strlen(token)+1, 256)
-#define APPEND_TOKEN() do {          \
-  size_t needed = strlen(token) + 1; \
-  if (size <= (i + needed)) { \
-    size_t grow_by = needed > 256 ? needed : 256; \
-    char *grown_ptr = (char *)realloc(modded_options, size + grow_by); \
-    if (grown_ptr == NULL) { \
-      /* realloc failed, free modded_options and return */ \
-      errcode = CL_OUT_OF_HOST_MEMORY; \
-      goto ERROR_CLEAN_OPTIONS; \
-    } \
-    modded_options = grown_ptr; \
-    size += grow_by; \
-  } \
-  i += needed; \
-  strcat (modded_options, token); \
-  strcat (modded_options, " "); \
-} while (0)
-
-#define APPEND_TO_MAIN_BUILD_LOG(...)  \
-  POCL_MSG_ERR(__VA_ARGS__);   \
-  {                            \
-    size_t l = strlen(program->main_build_log); \
-    snprintf(program->main_build_log + l, (640 - l), __VA_ARGS__); \
-  }
-
-#ifdef OCS_AVAILABLE
-cl_int
-program_compile_dynamic_wg_binaries(cl_program program)
-{
-  unsigned i, device_i;
-  cl_int errcode = CL_SUCCESS;
-  _cl_command_node cmd;
-
-  assert(program->num_kernels);
-  assert(program->build_status == CL_BUILD_SUCCESS);
-
-  memset(&cmd, 0, sizeof(_cl_command_node));
-  cmd.type = CL_COMMAND_NDRANGE_KERNEL;
-  char cachedir[POCL_FILENAME_LENGTH];
-  cmd.command.run.tmp_dir = cachedir;
-  POCL_LOCK_OBJ(program);
-
-  /* Build the dynamic WG sized parallel.bc and device specific code,
-     for each kernel & device combo.  */
-  for (device_i = 0; device_i < program->num_devices; ++device_i)
-    {
-      cl_device_id device = program->devices[device_i];
-
-      /* program may not be built for some of its devices */
-      if (program->pocl_binaries[device_i] || (!program->binaries[device_i]))
-        continue;
-
-      cmd.device = device;
-
-      for (i=0; i < program->num_kernels; i++)
-        {
-          cl_kernel kernel = program->default_kernels[i];
-          size_t local_x = 0, local_y = 0, local_z = 0;
-          if (kernel->reqd_wg_size != NULL &&
-              kernel->reqd_wg_size[0] > 0 &&
-              kernel->reqd_wg_size[1] > 0 &&
-              kernel->reqd_wg_size[2] > 0)
-            {
-              local_x = kernel->reqd_wg_size[0];
-              local_y = kernel->reqd_wg_size[1];
-              local_z = kernel->reqd_wg_size[2];
-            }
-
-          pocl_cache_kernel_cachedir_path (cachedir, program, device_i, kernel,
-                                           "", local_x, local_y, local_z);
-
-          errcode = pocl_llvm_generate_workgroup_function (cachedir, device, kernel,
-                                                           local_x, local_y, local_z);
-          if (errcode != CL_SUCCESS)
-            {
-              POCL_MSG_ERR("Failed to generate workgroup function for "
-                           "kernel %s for device %s\n",
-                           program->kernel_names[i], device->short_name);
-              goto RET;
-            }
-          cmd.command.run.kernel = kernel;
-          device->ops->compile_kernel (&cmd, kernel, device);
-        }
-    }
-
-RET:
-  POCL_UNLOCK_OBJ(program);
-  return errcode;
-}
-
-#endif
-
 CL_API_ENTRY cl_int CL_API_CALL
-POname(clBuildProgram)(cl_program program,
-                       cl_uint num_devices,
-                       const cl_device_id *device_list,
-                       const char *options,
-                       void (CL_CALLBACK *pfn_notify) (cl_program program, 
+POname (clBuildProgram) (cl_program program,
+                         cl_uint num_devices,
+                         const cl_device_id *device_list,
+                         const char *options,
+                         void (CL_CALLBACK *pfn_notify) (cl_program program,
                                                        void *user_data),
-                       void *user_data) 
+                         void *user_data)
 CL_API_SUFFIX__VERSION_1_0
 {
-  char program_bc_path[POCL_FILENAME_LENGTH];
-  int errcode;
-  int error;
-  uint64_t fsize;
-  cl_device_id * unique_devlist = NULL;
-  char *binary = NULL;
-  unsigned device_i = 0, actually_built = 0;
-  char *temp_options = NULL;
-  char *modded_options = NULL;
-  char *token = NULL;
-  char *saveptr = NULL;
-  void* write_cache_lock = NULL;
-  build_program_callback_t *callback = NULL;
-
-  POCL_RETURN_ERROR_COND((program == NULL), CL_INVALID_PROGRAM);
-
-  POCL_RETURN_ERROR_COND((num_devices > 0 && device_list == NULL), CL_INVALID_VALUE);
-  POCL_RETURN_ERROR_COND((num_devices == 0 && device_list != NULL), CL_INVALID_VALUE);
-
-  POCL_RETURN_ERROR_COND((pfn_notify == NULL && user_data != NULL), CL_INVALID_VALUE);
-
-  POCL_RETURN_ERROR_ON(program->kernels, CL_INVALID_OPERATION, "Program already has kernels\n");
-
-  POCL_RETURN_ERROR_ON((program->source == NULL && program->binaries == NULL),
-    CL_INVALID_PROGRAM, "Program doesn't have sources or binaries! You need "
-                        "to call clCreateProgramWith{Binary|Source} first\n");
-
-  POCL_LOCK_OBJ(program);
-
-  if (pfn_notify)
-    {
-      callback = (build_program_callback_t*) malloc (sizeof(build_program_callback_t));
-      if (callback == NULL)
-        {
-          POCL_UNLOCK_OBJ(program);
-          return CL_OUT_OF_HOST_MEMORY;
-        }
-
-      callback->callback_function = pfn_notify;
-      callback->user_data = user_data;
-      program->buildprogram_callback = callback;
-    }
-
-  program->main_build_log[0] = 0;
-
-  size_t size = 512;
-  size_t i = 1; /* terminating char */
-  modded_options = (char*) calloc (512, 1);
-
-  if (options != NULL)
-    {
-      size_t size = 512;
-      size_t i = 1; /* terminating char */
-      temp_options = strdup(options);
-
-      token = strtok_r (temp_options, " ", &saveptr);
-      while (token != NULL)
-        {
-          /* check if parameter is supported compiler parameter */
-          if (memcmp (token, "-cl", 3) == 0 || memcmp (token, "-w", 2) == 0 
-              || memcmp(token, "-Werror", 7) == 0)
-            {
-              if (strstr (cl_parameters, token))
-                {
-                  /* the LLVM API call pushes the parameters directly to the 
-                     frontend without using -Xclang */
-                }
-              else if (strstr (cl_parameters_supported_after_clang_3_9, token))
-                {
-#ifndef LLVM_OLDER_THAN_3_9
-                  /* the LLVM API call pushes the parameters directly to the
-                   * frontend without using -Xclang*/
-#else
-                  APPEND_TO_MAIN_BUILD_LOG("This build option is supported after clang3.9: %s\n", token);
-                  token = strtok_r (NULL, " ", &saveptr);  
-                  continue;
-#endif
-                }
-              else if (strstr (cl_parameters_not_yet_supported_by_clang, token))
-                {
-                  APPEND_TO_MAIN_BUILD_LOG("This build option is not yet supported by clang: %s\n", token);
-                  token = strtok_r (NULL, " ", &saveptr);
-                  continue;
-                }
-              else
-                {
-                  APPEND_TO_MAIN_BUILD_LOG("Invalid build option: %s\n", token);
-                  errcode = CL_INVALID_BUILD_OPTIONS;
-                  goto ERROR_CLEAN_OPTIONS;
-                }
-            }
-          else if (memcmp(token, "-g", 2) == 0)
-            {
-#ifndef LLVM_OLDER_THAN_3_8
-              token = "-debug-info-kind=line-tables-only";
-#endif
-            }
-          else if (memcmp (token, "-D", 2) == 0 || memcmp (token, "-I", 2) == 0)
-            {
-              APPEND_TOKEN();
-              /* if there is a space in between, then next token is part 
-                 of the option */
-              if (strlen (token) == 2)
-                token = strtok_r (NULL, " ", &saveptr);
-              else
-                {
-                  token = strtok_r (NULL, " ", &saveptr);
-                  continue;
-                }
-            }
-          else if (memcmp (token, "-x", 2) == 0 && strlen (token) == 2)
-            {
-              /* only "-x spir" is valid for the "-x" option */
-              token = strtok_r (NULL, " ", &saveptr);
-              if (!token || memcmp (token, "spir", 4) != 0)
-                {
-                  APPEND_TO_MAIN_BUILD_LOG("Invalid parameter to -x build option\n");
-                  errcode = CL_INVALID_BUILD_OPTIONS;
-                  goto ERROR_CLEAN_OPTIONS;
-                }
-              /* "-x spir" is not valid if we are building from source */
-              else if (program->source)
-                {
-                  APPEND_TO_MAIN_BUILD_LOG("\"-x spir\" is not valid when building from source\n");
-                  errcode = CL_INVALID_BUILD_OPTIONS;
-                  goto ERROR_CLEAN_OPTIONS;
-                }
-              token = strtok_r (NULL, " ", &saveptr);
-              continue;
-            }
-          else if (memcmp (token, "-spir-std=1.2", 13) == 0)
-            {
-              /* "-spir-std=" flags are not valid when building from source */
-              if (program->source)
-                {
-                  APPEND_TO_MAIN_BUILD_LOG("\"-spir-std=\" flag is not valid when building from source\n");
-                  errcode = CL_INVALID_BUILD_OPTIONS;
-                  goto ERROR_CLEAN_OPTIONS;
-                }
-              token = strtok_r (NULL, " ", &saveptr);
-              continue;
-            }
-          else
-            {
-              APPEND_TO_MAIN_BUILD_LOG("Invalid build option: %s\n", token);
-              errcode = CL_INVALID_BUILD_OPTIONS;
-              goto ERROR_CLEAN_OPTIONS;
-            }
-          APPEND_TOKEN();
-          token = strtok_r (NULL, " ", &saveptr);
-        }
-      POCL_MEM_FREE(temp_options);
-    }
-
-  POCL_MEM_FREE(program->compiler_options);
-  program->compiler_options = modded_options;
-
-  if (num_devices == 0)
-    {
-      num_devices = program->num_devices;
-      device_list = program->devices;
-    }
-  else
-    {
-      // convert subdevices to devices and remove duplicates
-      cl_uint real_num_devices = 0;
-      unique_devlist = pocl_unique_device_list(device_list, num_devices, &real_num_devices);
-      num_devices = real_num_devices;
-      device_list = unique_devlist;
-    }
-
-  POCL_MSG_PRINT_INFO("building program with options %s\n",
-                       program->compiler_options);
-
-  /* Build the fully linked non-parallel bitcode for all
-         devices. */
-  for (device_i = 0; device_i < program->num_devices; ++device_i)
-    {
-      cl_device_id device = program->devices[device_i];
-
-      /* find the device in the supplied devices-to-build-for list */
-      int found = 0;
-      for (i = 0; i < num_devices; ++i)
-          if (device_list[i] == device) found = 1;
-      if (!found) continue;
-
-      actually_built++;
-
-      /* clCreateProgramWithSource */
-      if (program->source)
-        {
-          POCL_MSG_PRINT_INFO("building from sources for device %d\n", device_i);
-#ifdef OCS_AVAILABLE
-          error = pocl_llvm_build_program(program, device_i,
-                                          program->compiler_options,
-                                          program_bc_path);
-          POCL_GOTO_ERROR_ON((error != 0), CL_BUILD_PROGRAM_FAILURE,
-                             "pocl_llvm_build_program() failed\n");
-#else
-          strcpy(program->main_build_log,
-                 "Cannot build a program from sources with pocl "
-                 "that does not have online compiler support\n");
-          POCL_GOTO_ERROR_ON(1, CL_COMPILER_NOT_AVAILABLE,
-                             "%s", program->main_build_log);
-#endif
-        }
-      /* clCreateProgramWithBinaries */
-      else if (program->binaries[device_i])
-        {
-          POCL_MSG_PRINT_INFO("building from a BC binary for device %d\n", device_i);
-
-#ifdef OCS_AVAILABLE
-          error = pocl_cache_create_program_cachedir(program, device_i,
-                                                     NULL, 0, program_bc_path);
-          POCL_GOTO_ERROR_ON((error != 0), CL_BUILD_PROGRAM_FAILURE,
-                             "Could not create program cachedir");
-          write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
-          assert(write_cache_lock);
-          errcode = pocl_write_file(program_bc_path, (char*)program->binaries[device_i],
-                          (uint64_t)program->binary_sizes[device_i], 0, 0);
-          POCL_GOTO_ERROR_ON(errcode, CL_BUILD_PROGRAM_FAILURE,
-                             "Failed to write binaries to program.bc\n");
-#else
-          if (!program->pocl_binaries[device_i])
-            {
-              strcpy(program->main_build_log,
-                     "Cannot build program from LLVM IR binaries with "
-                     "pocl that does not have online compiler support\n");
-              POCL_GOTO_ERROR_ON(1, CL_COMPILER_NOT_AVAILABLE,
-                                 "%s", program->main_build_log);
-            }
-          else
-            continue;
-#endif
-        }
-      else if (program->pocl_binaries[device_i])
-        {
-          POCL_MSG_PRINT_INFO("having a poclbinary for device %d\n", device_i);
-          /* TODO pocl_binaries[i] might contain program.bc */
-          continue;
-          /* fail */
-        }
-      else
-        {
-          POCL_MSG_PRINT_INFO("no sources nor binaries to build for device %d\n",
-                              device_i);
-          /* TODO pocl_binaries[i] might contain program.bc */
-
-          POCL_GOTO_ERROR_ON(1, CL_INVALID_BINARY,
-                             "No sources nor binaries for device %s - can't "
-                             "build the program\n", device->short_name);
-        }
-
-#ifdef OCS_AVAILABLE
-      /* Read binaries from program.bc to memory */
-      if (program->binaries[device_i] == NULL)
-        {
-          if (!write_cache_lock)
-            write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
-          assert(write_cache_lock);
-          errcode = pocl_read_file(program_bc_path, &binary, &fsize);
-          POCL_GOTO_ERROR_ON(errcode, CL_BUILD_ERROR,
-                             "Failed to read binaries from program.bc to "
-                             "memory: %s\n", program_bc_path);
-
-          program->binary_sizes[device_i] = (size_t)fsize;
-          program->binaries[device_i] = (unsigned char *)binary;
-        }
-
-      if (program->llvm_irs[device_i] == NULL)
-        {
-          if (!write_cache_lock)
-            write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
-          assert(write_cache_lock);
-          pocl_update_program_llvm_irs(program, device_i, device);
-        }
-      /* Maintain a 'last_accessed' file in every program's
-       * cache directory. Will be useful for cache pruning script
-       * that flushes old directories based on LRU */
-      pocl_cache_update_program_last_access(program, device_i);
-
-      if (write_cache_lock)
-        {
-          pocl_cache_release_lock(write_cache_lock);
-          write_cache_lock = NULL;
-        }
-#endif
-
-    }
-
-  POCL_GOTO_ERROR_ON((actually_built < num_devices), CL_BUILD_PROGRAM_FAILURE,
-                     "Some of the devices on the argument-supplied list are"
-                     "not available for the program, or do not exist\n");
-
-  /* TODO probably wrong to assume */
-  assert(program->num_kernels == 0);
-  for (i=0; i < program->num_devices; i++)
-    {
-#ifdef OCS_AVAILABLE
-      if (program->binaries[i])
-        {
-          program->num_kernels = pocl_llvm_get_kernel_count(program);
-          if (program->num_kernels)
-            {
-              program->kernel_names = calloc(program->num_kernels, sizeof(char*));
-              pocl_llvm_get_kernel_names(program,
-                                         program->kernel_names,
-                                         program->num_kernels);
-            }
-          break;
-        }
-#endif
-      if (program->pocl_binaries[i])
-        {
-          program->num_kernels =
-              pocl_binary_get_kernel_count(program->pocl_binaries[i]);
-          if (program->num_kernels)
-            {
-              program->kernel_names = calloc(program->num_kernels, sizeof(char*));
-              pocl_binary_get_kernel_names(program->pocl_binaries[i],
-                                           program->kernel_names,
-                                           program->num_kernels);
-            }
-          break;
-        }
-    }
-  POCL_GOTO_ERROR_ON((i >= program->num_devices),
-                     CL_INVALID_BINARY,
-                     "Could not set kernel number / names from the binary\n");
-
-  POCL_MEM_FREE(unique_devlist);
-  program->build_status = CL_BUILD_SUCCESS;
-  POCL_UNLOCK_OBJ(program);
-
-  if (program->buildprogram_callback)
-    program->buildprogram_callback->callback_function (program,
-                                  program->buildprogram_callback->user_data);
-
-  /* Set up all program kernels.  */
-  /* TODO: Should not have to unlock program while adding default kernels.  */
-  assert (program->default_kernels == NULL);
-  program->kernels = ADDING_DEFAULT_KERNELS_TO_CL_PROGRAM;
-  program->default_kernels = calloc(program->num_kernels, sizeof(cl_kernel));
-
-  for (i=0; i < program->num_kernels; i++)
-    {
-      program->default_kernels[i] =
-          POname(clCreateKernel)(program,
-                                 program->kernel_names[i],
-                                 &error);
-      POCL_GOTO_ERROR_ON((error != CL_SUCCESS),
-                         CL_BUILD_PROGRAM_FAILURE,
-                         "Failed to create default kernels\n");
-    }
-
-  program->kernels = 0;
-
-  return CL_SUCCESS;
-
-  /* Set pointers to NULL during cleanup so that clProgramRelease won't
-   * cause a double free. */
-
-ERROR:
-  if (program->buildprogram_callback)
-    {
-      program->buildprogram_callback->callback_function(program,
-                         program->buildprogram_callback->user_data);
-      POCL_MEM_FREE(program->buildprogram_callback);
-    }
-  program->kernels = 0;
-  for(i = 0; i < program->num_devices; i++)
-  {
-    POCL_MEM_FREE(program->binaries[i]);
-    pocl_cache_release_lock(program->read_locks[i]);
-    program->read_locks[i] = NULL;
-  }
-  if (program->num_kernels && program->kernel_names)
-    {
-      for (i=0; i < program->num_kernels; i++)
-        POCL_MEM_FREE(program->kernel_names[i]);
-      POCL_MEM_FREE(program->kernel_names);
-    }
-  if (program->default_kernels)
-    {
-      for (i=0; i < program->num_kernels; i++)
-        if (program->default_kernels[i])
-          POname(clReleaseKernel)(program->default_kernels[i]);
-      POCL_MEM_FREE(program->default_kernels);
-      POCL_LOCK_OBJ(program);
-    }
-  POCL_MEM_FREE(program->binaries);
-  POCL_MEM_FREE(program->binary_sizes);
-  POCL_MEM_FREE(unique_devlist);
-  pocl_cache_release_lock(write_cache_lock);
-ERROR_CLEAN_OPTIONS:
-  POCL_MEM_FREE(modded_options);
-  program->build_status = CL_BUILD_ERROR;
-
-  POCL_UNLOCK_OBJ(program);
-  return errcode;
+  return compile_and_link_program (1, 1, program,
+                                   num_devices, device_list, options,
+                                   0, NULL, NULL, 0, NULL,
+                                   pfn_notify, user_data);
 }
-POsym(clBuildProgram)
+POsym (clBuildProgram)
diff --git a/lib/CL/clEnqueueMarkerWithWaitList.c b/lib/CL/clCompileProgram.c
similarity index 52%
copy from lib/CL/clEnqueueMarkerWithWaitList.c
copy to lib/CL/clCompileProgram.c
index ef62e37..fa1e5f3 100644
--- a/lib/CL/clEnqueueMarkerWithWaitList.c
+++ b/lib/CL/clCompileProgram.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clEnqueueMarkerMarkerWithWaitList()
+/* OpenCL runtime library: clCompileProgram()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2013 Ville Korhonen / Tampere Univ. of Tech.
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,38 +20,27 @@
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.
 */
-#include "pocl_cl.h"
-#include "utlist.h"
-#include "pocl_util.h"
-#include <stdio.h>
 
+#include "pocl_cl.h"
+#include "pocl_shared.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
-POname(clEnqueueMarkerWithWaitList) (cl_command_queue   command_queue,
-                                     cl_uint            num_events_in_wait_list,
-                                     const cl_event *   event_wait_list,
-                                     cl_event *         event) 
+POname (clCompileProgram) (cl_program program,
+                           cl_uint num_devices,
+                           const cl_device_id *device_list,
+                           const char *options,
+                           cl_uint num_input_headers,
+                           const cl_program *input_headers,
+                           const char **header_include_names,
+                           void (CL_CALLBACK *pfn_notify) (cl_program program,
+                                                         void *user_data),
+                           void *user_data)
 CL_API_SUFFIX__VERSION_1_2
 {
-  int errcode;
-  _cl_command_node *cmd = NULL;
-
-  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-
-  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_MARKER, 
-                                 event, num_events_in_wait_list, 
-                                 event_wait_list, 0, NULL);
-  if (errcode != CL_SUCCESS)
-    goto ERROR;
-
-  cmd->command.marker.data = command_queue->device->data;
-  pocl_command_enqueue (command_queue, cmd);
-
-  return CL_SUCCESS;
-
- ERROR:
-  POCL_MEM_FREE(cmd);
-  return errcode;
-
+  return compile_and_link_program (1, 0, program,
+                                   num_devices, device_list, options,
+                                   num_input_headers, input_headers,
+                                   header_include_names, 0, NULL,
+                                   pfn_notify, user_data);
 }
-POsym(clEnqueueMarkerWithWaitList)
+POsym (clCompileProgram)
diff --git a/lib/CL/clCreateBuffer.c b/lib/CL/clCreateBuffer.c
index 7166175..7e2f6fc 100644
--- a/lib/CL/clCreateBuffer.c
+++ b/lib/CL/clCreateBuffer.c
@@ -48,6 +48,7 @@ POname(clCreateBuffer)(cl_context   context,
       errcode = CL_OUT_OF_HOST_MEMORY;
       goto ERROR;
     }
+  mem->device_ptrs = NULL;
 
   if (flags == 0)
     flags = CL_MEM_READ_WRITE;
@@ -92,7 +93,6 @@ POname(clCreateBuffer)(cl_context   context,
         (~flags & CL_MEM_COPY_HOST_PTR)), CL_INVALID_HOST_PTR,
         "host_ptr is not NULL, but flags don't specify {COPY|USE}_HOST_PTR\n");
     }
-  
 
   for (i = 0; i < context->num_devices; ++i)
     {
@@ -110,6 +110,7 @@ POname(clCreateBuffer)(cl_context   context,
   mem->parent = NULL;
   mem->map_count = 0;
   mem->mappings = NULL;
+  mem->buffer = NULL;
   mem->destructor_callbacks = NULL;
   mem->type = CL_MEM_OBJECT_BUFFER;
   mem->flags = flags;
@@ -144,6 +145,7 @@ POname(clCreateBuffer)(cl_context   context,
     }
 
   mem->size = size;
+  mem->origin = 0;
   mem->context = context;
   mem->mem_host_ptr = host_ptr;
   mem->shared_mem_allocation_owner = NULL;
@@ -165,8 +167,6 @@ POname(clCreateBuffer)(cl_context   context,
       if (context->svm_allocdev == context->devices[i])
         continue;
 
-      if (i > 0)
-        POname(clRetainMemObject) (mem);
       device = context->devices[i];
       assert (device->ops->alloc_mem_obj != NULL);
       if (device->ops->alloc_mem_obj (device, mem, host_ptr) != CL_SUCCESS)
@@ -177,20 +177,22 @@ POname(clCreateBuffer)(cl_context   context,
     }
 
   /* Some device driver may already have allocated host accessible memory */
-  if (flags & CL_MEM_ALLOC_HOST_PTR && mem->mem_host_ptr == NULL)
+  if ((flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
     {
       assert(mem->shared_mem_allocation_owner == NULL);
       mem->mem_host_ptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, size);
       if (mem->mem_host_ptr == NULL)
         {
           errcode = CL_OUT_OF_HOST_MEMORY;
-          goto ERROR;
+          goto ERROR_CLEAN_MEM_AND_DEVICE;
         }
     }
 
   POCL_RETAIN_OBJECT(context);
 
-  POCL_MSG_PRINT_INFO ("Created Buffer %p\n", mem);
+  POCL_MSG_PRINT_MEMORY (
+      "Created Buffer %p, HOST_PTR: %p, DEVICE_PTR[0]: %p \n", mem,
+      mem->mem_host_ptr, mem->device_ptrs[0].mem_ptr);
 
   if (errcode_ret != NULL)
     *errcode_ret = CL_SUCCESS;
@@ -203,6 +205,8 @@ ERROR_CLEAN_MEM_AND_DEVICE:
       device->ops->free(device, mem);
     }
 ERROR:
+  if (mem)
+    POCL_MEM_FREE (mem->device_ptrs);
   POCL_MEM_FREE(mem);
   if(errcode_ret)
     {
diff --git a/lib/CL/clCreateCommandQueue.c b/lib/CL/clCreateCommandQueue.c
index c77ddf2..8ef33b9 100644
--- a/lib/CL/clCreateCommandQueue.c
+++ b/lib/CL/clCreateCommandQueue.c
@@ -23,7 +23,6 @@
 
 #include "pocl_cl.h"
 #include "pocl_util.h"
-#include "pocl_queue_util.h"
 
 CL_API_ENTRY cl_command_queue CL_API_CALL
 POname(clCreateCommandQueue)(cl_context context, 
@@ -35,18 +34,22 @@ POname(clCreateCommandQueue)(cl_context context,
   int errcode;
   cl_bool found = CL_FALSE;
 
+  POCL_GOTO_ERROR_COND ((context == NULL), CL_INVALID_CONTEXT);
+
+  POCL_GOTO_ERROR_COND ((device == NULL), CL_INVALID_DEVICE);
+
   POCL_MSG_PRINT_INFO("Create Command queue on device %d\n", device->dev_id);
 
   /* validate flags */
   POCL_GOTO_ERROR_ON((properties > (1<<2)-1), CL_INVALID_VALUE,
             "Properties must be <= 3 (there are only 2)\n");
 
-  if (pocl_debug_messages)
+  if (POCL_DEBUGGING_ON)
     properties |= CL_QUEUE_PROFILING_ENABLE;
 
   for (i=0; i<context->num_devices; i++)
     {
-      if (context->devices[i] == POCL_REAL_DEV(device))
+      if (context->devices[i] == pocl_real_dev (device))
         found = CL_TRUE;
     }
 
@@ -67,7 +70,6 @@ POname(clCreateCommandQueue)(cl_context context,
   command_queue->properties = properties;
   command_queue->barrier = NULL;
   command_queue->events = NULL;
-  command_queue->root = NULL;
   command_queue->command_count = 0;
   command_queue->last_event.event = NULL;
   command_queue->last_event.event_id = -1;
@@ -76,10 +78,13 @@ POname(clCreateCommandQueue)(cl_context context,
   POCL_RETAIN_OBJECT(context);
   POCL_RETAIN_OBJECT(device);
 
+  errcode = CL_SUCCESS;
+  if (device->ops->init_queue)
+    errcode = device->ops->init_queue (command_queue);
+
   if (errcode_ret != NULL)
-    *errcode_ret = CL_SUCCESS;
+    *errcode_ret = errcode;
 
-  pocl_queue_list_insert(command_queue);
   return command_queue;
 
 ERROR:
diff --git a/lib/CL/clCreateCommandQueueWithProperties.c b/lib/CL/clCreateCommandQueueWithProperties.c
index 61f496e..8dd3776 100644
--- a/lib/CL/clCreateCommandQueueWithProperties.c
+++ b/lib/CL/clCreateCommandQueueWithProperties.c
@@ -22,7 +22,6 @@
 */
 
 #include "pocl_util.h"
-#include "pocl_queue_util.h"
 
 CL_API_ENTRY cl_command_queue CL_API_CALL
 POname(clCreateCommandQueueWithProperties)(cl_context context,
@@ -44,9 +43,11 @@ POname(clCreateCommandQueueWithProperties)(cl_context context,
 
   POCL_GOTO_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
 
+  POCL_GOTO_ERROR_COND ((device == NULL), CL_INVALID_DEVICE);
+
   for (i=0; i<context->num_devices; i++)
     {
-      if (context->devices[i] == POCL_REAL_DEV(device))
+      if (context->devices[i] == pocl_real_dev (device))
         found = CL_TRUE;
     }
 
diff --git a/lib/CL/clCreateContext.c b/lib/CL/clCreateContext.c
index c01f822..39f97df 100644
--- a/lib/CL/clCreateContext.c
+++ b/lib/CL/clCreateContext.c
@@ -126,7 +126,7 @@ POname(clCreateContext)(const cl_context_properties * properties,
 {
   unsigned i;
   cl_device_id device_ptr;
-  int errcode = 0;
+  cl_int errcode = 0;
   cl_context context = NULL;
 
   POCL_GOTO_ERROR_COND((devices == NULL || num_devices == 0), CL_INVALID_VALUE);
@@ -134,9 +134,21 @@ POname(clCreateContext)(const cl_context_properties * properties,
   POCL_GOTO_ERROR_COND((pfn_notify == NULL && user_data != NULL), CL_INVALID_VALUE);
 
   int offline_compile = pocl_get_bool_option("POCL_OFFLINE_COMPILE", 0);
-  
+
   lt_dlinit();
-  pocl_init_devices();
+  errcode = pocl_init_devices();
+  /* clCreateContext cannot return CL_DEVICE_NOT_FOUND, which is what
+   * pocl_init_devices() returns if no devices could be probed. Hence,
+   * remap this error to CL_INVALID_DEVICE. Note that this particular
+   * situation should never arise, since an application should issue
+   * clGetDeviceIDs before clCreateContext, and we would have returned
+   * CL_DEVICE_NOT_FOUND already from clGetDeviceIDs. Still, no reason
+   * not to handle it.
+   */
+  POCL_GOTO_ERROR_COND (errcode == CL_DEVICE_NOT_FOUND, CL_INVALID_DEVICE);
+  /* Other error conditions (e.g. CL_OUT_OF_HOST_MEMORY) */
+  if (errcode)
+    goto ERROR;
 
   context = (cl_context) malloc(sizeof(struct _cl_context));
   if (context == NULL)
diff --git a/lib/CL/clCreateContextFromType.c b/lib/CL/clCreateContextFromType.c
index 44d240d..2fb0e7a 100644
--- a/lib/CL/clCreateContextFromType.c
+++ b/lib/CL/clCreateContextFromType.c
@@ -50,7 +50,10 @@ POname(clCreateContextFromType)(const cl_context_properties *properties,
 
   /* initialize libtool here, LT will be needed when loading the kernels */     
   lt_dlinit();
-  pocl_init_devices();
+  errcode = pocl_init_devices();
+
+  if (errcode)
+    goto ERROR;
 
   cl_context context = (cl_context) malloc(sizeof(struct _cl_context));
   if (context == NULL)
diff --git a/lib/CL/clCreateFromGLTexture2D.c b/lib/CL/clCreateFromGLTexture2D.c
index 1afa303..ae800f9 100644
--- a/lib/CL/clCreateFromGLTexture2D.c
+++ b/lib/CL/clCreateFromGLTexture2D.c
@@ -9,6 +9,6 @@ POname(clCreateFromGLTexture2D)(cl_context      context,
 CL_API_SUFFIX__VERSION_1_0
 {
   POCL_ABORT_UNIMPLEMENTED("The entire clCreateFromGLTexture2D call");
-  return CL_SUCCESS;
+  return NULL;
 }
 POsym(clCreateFromGLTexture2D)
diff --git a/lib/CL/clCreateFromGLTexture3D.c b/lib/CL/clCreateFromGLTexture3D.c
index c4daa4b..93f9385 100644
--- a/lib/CL/clCreateFromGLTexture3D.c
+++ b/lib/CL/clCreateFromGLTexture3D.c
@@ -31,6 +31,6 @@ POname(clCreateFromGLTexture3D)(cl_context      context,
 CL_API_SUFFIX__VERSION_1_0
 {
   POCL_ABORT_UNIMPLEMENTED("The entire clCreateFromGLTexture3D call");
-  return CL_SUCCESS;
+  return NULL;
 }
 POsym(clCreateFromGLTexture3D)
diff --git a/lib/CL/clCreateImage.c b/lib/CL/clCreateImage.c
index e362dd8..28a60b5 100644
--- a/lib/CL/clCreateImage.c
+++ b/lib/CL/clCreateImage.c
@@ -22,6 +22,7 @@
 */
 #include "pocl_cl.h"
 #include "pocl_image_util.h"
+#include "pocl_util.h"
 
 extern CL_API_ENTRY cl_mem CL_API_CALL
 POname(clCreateImage) (cl_context              context,
@@ -33,22 +34,23 @@ POname(clCreateImage) (cl_context              context,
 CL_API_SUFFIX__VERSION_1_2
 {
     cl_mem mem = NULL;
-    unsigned i;
+    unsigned i, devices_supporting_images = 0;
     cl_uint num_entries = 0;
-    cl_image_format *supported_image_formats;
-    size_t size;
+    cl_image_format *supported_image_formats = NULL;
+    size_t size = 0;
     int errcode;
     size_t row_pitch;
     size_t slice_pitch;
     int elem_size;
     int channels;
+    size_t elem_bytes;
 
     POCL_GOTO_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
 
     POCL_GOTO_ERROR_COND((image_format == NULL), CL_INVALID_IMAGE_FORMAT_DESCRIPTOR);
 
     POCL_GOTO_ERROR_COND((image_desc == NULL), CL_INVALID_IMAGE_DESCRIPTOR);
-    
+
     if (image_desc->num_mip_levels != 0 || image_desc->num_samples != 0) {
       POCL_ABORT_UNIMPLEMENTED("clCreateImage with image_desc->num_mip_levels != 0"
       " || image_desc->num_samples != 0 ");
@@ -66,60 +68,72 @@ CL_API_SUFFIX__VERSION_1_2
         errcode = CL_OUT_OF_HOST_MEMORY;
         goto ERROR;
       }
-    
-    errcode = POname(clGetSupportedImageFormats) (context, flags, 
-            image_desc->image_type, num_entries, supported_image_formats, NULL);
-    
+
+    errcode = POname (clGetSupportedImageFormats) (
+        context, flags, image_desc->image_type, num_entries,
+        supported_image_formats, NULL);
+
     if (errcode != CL_SUCCESS){
       POCL_MSG_ERR("Couldn't get the supported image formats\n");
       goto ERROR;
     }
-    
-    for (i = 0; i < num_entries; i++)
+
+    /* CL_INVALID_IMAGE_SIZE if image dimensions specified in image_desc exceed
+     * the minimum maximum image dimensions described in the table of allowed
+     * values for param_name for clGetDeviceInfo FOR ALL DEVICES IN CONTEXT.
+     */
+    for (i = 0; i < context->num_devices; i++)
       {
-        if (supported_image_formats[i].image_channel_order == 
-            image_format->image_channel_order &&
-            supported_image_formats[i].image_channel_data_type ==
-            image_format->image_channel_data_type)
-          {
-            POCL_MEM_FREE(supported_image_formats);
-            goto TYPE_SUPPORTED;
-          }
+        cl_device_id dev = context->devices[i];
+        if (!dev->image_support)
+          continue;
+        else
+          ++devices_supporting_images;
+        if (pocl_check_device_supports_image (dev, image_format, image_desc,
+                                              supported_image_formats,
+                                              num_entries)
+            != CL_SUCCESS)
+          goto ERROR;
       }
+    POCL_GOTO_ERROR_ON (
+        (devices_supporting_images == 0), CL_INVALID_OPERATION,
+        "There are no devices in context that support images\n");
 
-    POCL_MEM_FREE(supported_image_formats);
-    POCL_MSG_ERR("Requested image format is not supported\n");
-    errcode = CL_IMAGE_FORMAT_NOT_SUPPORTED;
-    goto ERROR;
-
-TYPE_SUPPORTED:
-
-    /* maybe they are implemented */
-    if (image_desc->image_type != CL_MEM_OBJECT_IMAGE2D &&
-        image_desc->image_type != CL_MEM_OBJECT_IMAGE3D) {
-        POCL_ABORT_UNIMPLEMENTED("clCreateImage with images other than "
-        "CL_MEM_OBJECT_IMAGE2D or CL_MEM_OBJECT_IMAGE3D");
-    }
-    
     pocl_get_image_information (image_format->image_channel_order,
                                 image_format->image_channel_data_type, 
                                 &channels, &elem_size);
-    
+    elem_bytes = elem_size * channels;
+
     row_pitch = image_desc->image_row_pitch;
     slice_pitch = image_desc->image_slice_pitch;
-    
-    size = image_desc->image_width * image_desc->image_height * elem_size * 
-      channels;
 
-    if (image_desc->image_depth > 0)
+    /* This must be 0 if host_ptr is NULL and can be either 0 or ≥
+     * image_width * size of element in bytes if host_ptr is not NULL.
+     * If host_ptr is not NULL and image_row_pitch = 0, image_row_pitch
+     * is calculated as image_width * size of element in bytes. If
+     * image_row_pitch is not 0, it must be a multiple of the
+     * image element size in bytes.
+     */
+    if (row_pitch == 0)
       {
-        size *= image_desc->image_depth;
+        row_pitch = image_desc->image_width * elem_bytes;
       }
-
-    if (row_pitch == 0)
+    else
       {
-        row_pitch = image_desc->image_width * elem_size * channels;
+        POCL_GOTO_ERROR_COND ((row_pitch % elem_bytes), CL_INVALID_VALUE);
       }
+
+    /* The size in bytes of each 2D slice in the 3D image or the size in bytes
+     * of each image in a 1D or 2D image array. This must be 0 if host_ptr is
+     * NULL. If host_ptr is not NULL, image_slice_pitch can be either 0 or ≥
+     * image_row_pitch * image_height for a 2D image array or 3D image and can
+     * be either 0 or ≥ image_row_pitch for a 1D image array. If host_ptr is
+     * not NULL and image_slice_pitch = 0, image_slice_pitch is calculated as
+     * image_row_pitch * image_height for a 2D image array or 3D image and
+     * image_row_pitch for a 1D image array. If image_slice_pitch is not 0,
+     * it must be a multiple of the image_row_pitch.
+     */
+
     if (slice_pitch == 0)
       {
         if (image_desc->image_type == CL_MEM_OBJECT_IMAGE3D ||
@@ -132,27 +146,90 @@ TYPE_SUPPORTED:
             slice_pitch = row_pitch;
           }
       }
+    else
+      {
+        POCL_GOTO_ERROR_COND ((slice_pitch % row_pitch), CL_INVALID_VALUE);
+      }
+
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE3D)
+      size = slice_pitch * image_desc->image_depth;
+
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE2D)
+      size = row_pitch * image_desc->image_height;
+
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D
+        || image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+      size = row_pitch;
+
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY
+        || image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      {
+        size = slice_pitch * image_desc->image_array_size;
+      }
 
     /* Create buffer and fill in missing parts */
-    mem = POname(clCreateBuffer) (context, flags, size, host_ptr, &errcode);
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+      {
+        POCL_GOTO_ERROR_COND ((image_desc->buffer == NULL),
+                              CL_INVALID_MEM_OBJECT);
+        POCL_GOTO_ERROR_COND ((image_desc->buffer->size < size),
+                              CL_INVALID_MEM_OBJECT);
+
+        mem = (cl_mem)malloc (sizeof (struct _cl_mem));
+        POCL_GOTO_ERROR_COND ((mem == NULL), CL_OUT_OF_HOST_MEMORY);
+        memset (mem, 0, sizeof (struct _cl_mem));
+        POCL_INIT_OBJECT (mem);
+
+        cl_mem b = image_desc->buffer;
+        mem->buffer = b;
+
+        mem->size = size;
+        mem->origin = 0;
+
+        mem->context = context;
+        assert (mem->context == b->context);
+
+        pocl_cl_mem_inherit_flags (mem, b, flags);
 
-    POCL_GOTO_ERROR_ON((mem == NULL), CL_OUT_OF_HOST_MEMORY,
-      "clCreateBuffer (for backing the image) failed\n");
+        /* Retain the buffer we're referencing */
+        POname (clRetainMemObject) (b);
+
+        POCL_MSG_PRINT_MEMORY ("CREATED IMAGE: %p REF BUFFER: %p \n\n", mem,
+                               b);
+      }
+    else
+      {
+        mem = POname (clCreateBuffer) (context, flags, size, host_ptr,
+                                       &errcode);
+        POCL_GOTO_ERROR_ON ((mem == NULL), CL_OUT_OF_HOST_MEMORY,
+                            "clCreateBuffer (for backing the image) failed\n");
+        mem->buffer = NULL;
+      }
 
     mem->type = image_desc->image_type;
     mem->is_image = CL_TRUE;
-    
     mem->image_width = image_desc->image_width;
-    mem->image_height = image_desc->image_height;
-    mem->image_depth = image_desc->image_depth;
-    mem->image_array_size = image_desc->image_array_size;
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE2D
+        || image_desc->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY
+        || image_desc->image_type == CL_MEM_OBJECT_IMAGE3D)
+      mem->image_height = image_desc->image_height;
+    else
+      mem->image_height = 0;
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE3D)
+      mem->image_depth = image_desc->image_depth;
+    else
+      mem->image_depth = 0;
+    if (image_desc->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY
+        || image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      mem->image_array_size = image_desc->image_array_size;
+    else
+      mem->image_array_size = 0;
     mem->image_row_pitch = row_pitch;
     mem->image_slice_pitch = slice_pitch;
     mem->image_channel_data_type = image_format->image_channel_data_type;
     mem->image_channel_order = image_format->image_channel_order;
     mem->num_mip_levels = image_desc->num_mip_levels;
     mem->num_samples = image_desc->num_samples;
-    mem->buffer = image_desc->buffer;
     mem->image_channels = channels;
     mem->image_elem_size = elem_size;
 
@@ -168,17 +245,19 @@ TYPE_SUPPORTED:
     printf("mem_image_channel_data_type %x \n",mem->image_channel_data_type);
     printf("device_ptrs[0] %x \n \n", mem->device_ptrs[0]);
 #endif
-    
+
     if (errcode_ret != NULL)
       *errcode_ret = CL_SUCCESS;
-    
+
+    POCL_MEM_FREE (supported_image_formats);
     return mem;
     
  ERROR:
-    if (errcode_ret) 
-      {
-        *errcode_ret = errcode;
-      }
-    return NULL;
+   POCL_MEM_FREE (supported_image_formats);
+   if (errcode_ret)
+     {
+       *errcode_ret = errcode;
+     }
+   return NULL;
 }
 POsym(clCreateImage)
diff --git a/lib/CL/clCreateKernel.c b/lib/CL/clCreateKernel.c
index 8e35ca8..d5a64c6 100644
--- a/lib/CL/clCreateKernel.c
+++ b/lib/CL/clCreateKernel.c
@@ -29,6 +29,7 @@
 #include "pocl_llvm.h"
 #endif
 #include "pocl_binary.h"
+#include "pocl_util.h"
 #include <string.h>
 #include <sys/stat.h>
 #ifndef _MSC_VER
@@ -37,8 +38,6 @@
 #  include "vccompat.hpp"
 #endif
 
-#define COMMAND_LENGTH 1024
-
 CL_API_ENTRY cl_kernel CL_API_CALL
 POname(clCreateKernel)(cl_program program,
                const char *kernel_name,
@@ -46,7 +45,7 @@ POname(clCreateKernel)(cl_program program,
 {
   cl_kernel kernel = NULL;
   int errcode = CL_SUCCESS;
-  unsigned device_i;
+  unsigned device_i, i;
 
   POCL_GOTO_ERROR_COND((kernel_name == NULL), CL_INVALID_VALUE);
 
@@ -71,7 +70,6 @@ POname(clCreateKernel)(cl_program program,
                      "clCreateKernel couldn't allocate memory");
 
   POCL_INIT_OBJECT (kernel);
-  POCL_RETAIN_OBJECT (kernel);
 
   kernel->name = strdup(kernel_name);
   POCL_GOTO_ERROR_ON((kernel->name == NULL), CL_OUT_OF_HOST_MEMORY,
@@ -83,30 +81,49 @@ POname(clCreateKernel)(cl_program program,
 
   for (device_i = 0; device_i < program->num_devices; ++device_i)
     {
+#ifdef OCS_AVAILABLE
       if (program->binaries[device_i] &&
           pocl_cache_device_cachedir_exists(program, device_i))
         {
-#ifdef OCS_AVAILABLE
           pocl_llvm_get_kernel_metadata (program, kernel, device_i,
                                          kernel_name, &errcode);
           cl_device_id device = program->devices[device_i];
           if (device->spmd)
             {
               char cachedir[POCL_FILENAME_LENGTH];
+              _cl_command_node cmd;
+              memset (&cmd, 0, sizeof(_cl_command_node));
+              cmd.type = CL_COMMAND_NDRANGE_KERNEL;
+              cmd.command.run.tmp_dir = cachedir;
+              cmd.command.run.kernel = kernel;
+              cmd.device = device;
+              size_t local_x = 0, local_y = 0, local_z = 0;
+              if (kernel->reqd_wg_size != NULL &&
+                  kernel->reqd_wg_size[0] > 0 &&
+                  kernel->reqd_wg_size[1] > 0 &&
+                  kernel->reqd_wg_size[2] > 0)
+                {
+                  local_x = kernel->reqd_wg_size[0];
+                  local_y = kernel->reqd_wg_size[1];
+                  local_z = kernel->reqd_wg_size[2];
+                }
+              cmd.command.run.local_x = local_x;
+              cmd.command.run.local_y = local_y;
+              cmd.command.run.local_z = local_z;
               pocl_cache_kernel_cachedir_path (cachedir, program, device_i,
-                                               kernel, "", 0, 0, 0);
+                                               kernel, "", local_x,
+                                               local_y, local_z);
 
-              errcode = pocl_llvm_generate_workgroup_function (cachedir, device,
-                                                               kernel, 0, 0, 0);
-              if (errcode == CL_SUCCESS)
-                device->ops->compile_kernel(NULL, kernel, device);
+              device->ops->compile_kernel (&cmd, kernel, device);
             }
-#endif
         }
       /* If the program was created with a pocl binary, we won't be able to
          get the metadata for the cl_kernel from an IR file, so we call pocl
          binary function to initialize the cl_kernel data */
       else if (program->pocl_binaries[device_i])
+#else
+      if (program->pocl_binaries[device_i])
+#endif
         {
           errcode
             = pocl_binary_get_kernel_metadata (program->pocl_binaries[device_i],
@@ -128,15 +145,17 @@ POname(clCreateKernel)(cl_program program,
         }
     }
 
-  if (program->kernels != ADDING_DEFAULT_KERNELS_TO_CL_PROGRAM)
+  /* default kernels don't go on the program-kernels linked list,
+   * and they don't increase the program refcount. */
+  if (!program->operating_on_default_kernels)
     {
       POCL_LOCK_OBJ (program);
       cl_kernel k = program->kernels;
       program->kernels = kernel;
-      POCL_UNLOCK_OBJ (program);
       kernel->next = k;
+      POCL_RETAIN_OBJECT_UNLOCKED (program);
+      POCL_UNLOCK_OBJ (program);
     }
-  POCL_RETAIN_OBJECT(program);
 
   errcode = CL_SUCCESS;
   goto SUCCESS;
@@ -144,10 +163,23 @@ POname(clCreateKernel)(cl_program program,
 ERROR:
   if (kernel)
     {
-      POCL_MEM_FREE(kernel->reqd_wg_size);
-      POCL_MEM_FREE(kernel->dyn_arguments);
-      POCL_MEM_FREE(kernel->arg_info);
-      POCL_MEM_FREE(kernel);
+      if (kernel->arg_info)
+        for (i = 0; i < kernel->num_args; i++)
+          {
+            POCL_MEM_FREE (kernel->arg_info[i].name);
+            POCL_MEM_FREE (kernel->arg_info[i].type_name);
+          }
+
+      if (kernel->dyn_arguments)
+        for (i = 0; i < (kernel->num_args + kernel->num_locals); i++)
+          {
+            pocl_aligned_free (kernel->dyn_arguments[i].value);
+          }
+      POCL_MEM_FREE (kernel->reqd_wg_size);
+      POCL_MEM_FREE (kernel->dyn_arguments);
+      POCL_MEM_FREE (kernel->arg_info);
+      POCL_MEM_FREE (kernel->name);
+      POCL_MEM_FREE (kernel);
     }
   kernel = NULL;
 
diff --git a/lib/CL/clCreateProgramWithBinary.c b/lib/CL/clCreateProgramWithBinary.c
index 1644fc4..d4d6fd1 100644
--- a/lib/CL/clCreateProgramWithBinary.c
+++ b/lib/CL/clCreateProgramWithBinary.c
@@ -21,26 +21,29 @@
    THE SOFTWARE.
 */
 
+#include "pocl_binary.h"
+#include "pocl_cache.h"
 #include "pocl_cl.h"
+#include "pocl_file_util.h"
+#include "pocl_shared.h"
 #include "pocl_util.h"
 #include <string.h>
-#include "pocl_binary.h"
-#include "pocl_cache.h"
 
-CL_API_ENTRY cl_program CL_API_CALL
-POname(clCreateProgramWithBinary)(cl_context                     context,
-                          cl_uint                        num_devices,
-                          const cl_device_id *           device_list,
-                          const size_t *                 lengths,
-                          const unsigned char **         binaries,
-                          cl_int *                       binary_status,
-                          cl_int *                       errcode_ret)
-  CL_API_SUFFIX__VERSION_1_0
+/* creates either a program with binaries, or an empty program. The latter
+ * is useful for clLinkProgram() which needs an empty program to put the
+ * compiled results in.
+ */
+cl_program
+create_program_skeleton (cl_context context, cl_uint num_devices,
+                         const cl_device_id *device_list,
+                         const size_t *lengths, const unsigned char **binaries,
+                         cl_int *binary_status, cl_int *errcode_ret,
+                         int allow_empty_binaries)
 {
   cl_program program;
   unsigned i,j;
   int errcode;
-  cl_device_id * unique_devlist = NULL;
+  cl_device_id *unique_devlist = NULL;
 
   POCL_GOTO_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
 
@@ -48,14 +51,16 @@ POname(clCreateProgramWithBinary)(cl_context                     context,
 
   POCL_GOTO_ERROR_COND((num_devices == 0), CL_INVALID_VALUE);
 
-  POCL_GOTO_ERROR_COND((lengths == NULL), CL_INVALID_VALUE);
-
-  POCL_MSG_PRINT_INFO("creating a program with binary\n");
-
-  for (i = 0; i < num_devices; ++i)
+  if (!allow_empty_binaries)
     {
-      POCL_GOTO_ERROR_ON((lengths[i] == 0 || binaries[i] == NULL), CL_INVALID_VALUE,
-        "%i-th binary is NULL or its length==0\n", i);
+      POCL_GOTO_ERROR_COND ((lengths == NULL), CL_INVALID_VALUE);
+
+      for (i = 0; i < num_devices; ++i)
+        {
+          POCL_GOTO_ERROR_ON ((lengths[i] == 0 || binaries[i] == NULL),
+                              CL_INVALID_VALUE,
+                              "%i-th binary is NULL or its length==0\n", i);
+        }
     }
 
   // check for duplicates in device_list[].
@@ -118,14 +123,16 @@ POname(clCreateProgramWithBinary)(cl_context                     context,
       goto ERROR_CLEAN_PROGRAM_AND_BINARIES;
     }
 
-  program->buildprogram_callback = NULL;
   program->context = context;
   program->num_devices = num_devices;
   program->devices = unique_devlist;
   program->build_status = CL_BUILD_NONE;
-  program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE;
   char program_bc_path[POCL_FILENAME_LENGTH];
 
+  if (allow_empty_binaries && (lengths == NULL) && (binaries == NULL))
+    goto SUCCESS;
+
   for (i = 0; i < num_devices; ++i)
     {
       /* LLVM IR */
@@ -154,6 +161,13 @@ POname(clCreateProgramWithBinary)(cl_context                     context,
           POCL_GOTO_ERROR_ON(pocl_binary_deserialize (program, i),
                              CL_INVALID_BINARY,
                              "Could not unpack a pocl binary\n");
+          /* read program.bc, can be useful later */
+          if (pocl_exists (program_bc_path))
+            {
+              pocl_read_file (program_bc_path,
+                              (char **)(&program->binaries[i]),
+                              (uint64_t *)(&program->binary_sizes[i]));
+            }
           pocl_cache_release_lock (write_cache_lock);
 
           if (binary_status != NULL)
@@ -169,6 +183,7 @@ POname(clCreateProgramWithBinary)(cl_context                     context,
         }
     }
 
+SUCCESS:
   POCL_RETAIN_OBJECT(context);
 
   if (errcode_ret != NULL)
@@ -200,4 +215,13 @@ ERROR:
       }
     return NULL;
 }
+
+CL_API_ENTRY cl_program CL_API_CALL POname (clCreateProgramWithBinary) (
+    cl_context context, cl_uint num_devices, const cl_device_id *device_list,
+    const size_t *lengths, const unsigned char **binaries,
+    cl_int *binary_status, cl_int *errcode_ret) CL_API_SUFFIX__VERSION_1_0
+{
+  return create_program_skeleton (context, num_devices, device_list, lengths,
+                                  binaries, binary_status, errcode_ret, 0);
+}
 POsym(clCreateProgramWithBinary)
diff --git a/lib/CL/clEnqueueMarkerWithWaitList.c b/lib/CL/clCreateProgramWithBuiltInKernels.c
similarity index 51%
copy from lib/CL/clEnqueueMarkerWithWaitList.c
copy to lib/CL/clCreateProgramWithBuiltInKernels.c
index ef62e37..37eddfa 100644
--- a/lib/CL/clEnqueueMarkerWithWaitList.c
+++ b/lib/CL/clCreateProgramWithBuiltInKernels.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clEnqueueMarkerMarkerWithWaitList()
+/* OpenCL runtime library: clCreateProgramWithBuiltInKernels()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2013 Ville Korhonen / Tampere Univ. of Tech.
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,38 +20,33 @@
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.
 */
-#include "pocl_cl.h"
-#include "utlist.h"
-#include "pocl_util.h"
-#include <stdio.h>
 
-
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clEnqueueMarkerWithWaitList) (cl_command_queue   command_queue,
-                                     cl_uint            num_events_in_wait_list,
-                                     const cl_event *   event_wait_list,
-                                     cl_event *         event) 
+#include "pocl_cl.h"
+#include "pocl_shared.h"
+
+CL_API_ENTRY cl_program CL_API_CALL
+POname (clCreateProgramWithBuiltInKernels) (cl_context context,
+                                            cl_uint num_devices,
+                                            const cl_device_id *device_list,
+                                            const char *kernel_names,
+                                            cl_int *errcode_ret)
 CL_API_SUFFIX__VERSION_1_2
 {
   int errcode;
-  _cl_command_node *cmd = NULL;
-
-  POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-
-  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_MARKER, 
-                                 event, num_events_in_wait_list, 
-                                 event_wait_list, 0, NULL);
-  if (errcode != CL_SUCCESS)
-    goto ERROR;
+  POCL_GOTO_ERROR_COND ((context == NULL), CL_INVALID_CONTEXT);
 
-  cmd->command.marker.data = command_queue->device->data;
-  pocl_command_enqueue (command_queue, cmd);
+  POCL_GOTO_ERROR_COND ((device_list == NULL), CL_INVALID_VALUE);
+  POCL_GOTO_ERROR_COND ((num_devices == 0), CL_INVALID_VALUE);
 
-  return CL_SUCCESS;
+  POCL_GOTO_ERROR_COND ((kernel_names == NULL), CL_INVALID_VALUE);
 
- ERROR:
-  POCL_MEM_FREE(cmd);
-  return errcode;
+  POCL_GOTO_ERROR_ON (1, CL_INVALID_VALUE,
+                      "Builtin kernels are currently"
+                      " not implemented for any device\n");
 
+ERROR:
+  if (errcode_ret)
+    *errcode_ret = errcode;
+  return NULL;
 }
-POsym(clEnqueueMarkerWithWaitList)
+POsym (clCreateProgramWithBuiltInKernels)
diff --git a/lib/CL/clCreateProgramWithSource.c b/lib/CL/clCreateProgramWithSource.c
index 0b34cdc..e7720f9 100644
--- a/lib/CL/clCreateProgramWithSource.c
+++ b/lib/CL/clCreateProgramWithSource.c
@@ -39,6 +39,8 @@ POname(clCreateProgramWithSource)(cl_context context,
   unsigned i;
   int errcode;
 
+  POCL_GOTO_ERROR_COND ((context == NULL), CL_INVALID_CONTEXT);
+
   POCL_GOTO_ERROR_COND((count == 0), CL_INVALID_VALUE);
 
   program = (cl_program) calloc(1, sizeof(struct _cl_program));
@@ -70,7 +72,6 @@ POname(clCreateProgramWithSource)(cl_context context,
     goto ERROR;
   }
 
-  program->buildprogram_callback = NULL;
   program->source = source;
 
   for (i = 0; i < count; ++i)
@@ -98,15 +99,6 @@ POname(clCreateProgramWithSource)(cl_context context,
   program->num_devices = context->num_devices;
   program->devices = context->devices;
   program->build_status = CL_BUILD_NONE;
-  /* we set binary type to NONE here. Based on OCL1.2 spec
-     if program will be compiled using clCompileProgram its binary_type
-     will be set to CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT.
-     clCompileProgram is currently missing in pocl!
-     if program was created by clLinkProgram which is called
-     with the –createlibrary link option its binary_type will be set to
-     CL_PROGRAM_BINARY_TYPE_LIBRARY.
-     clLinkProgram is currently missing in pocl!
-   */
   program->binary_type = CL_PROGRAM_BINARY_TYPE_NONE;
 
   if ((program->binary_sizes =
diff --git a/lib/CL/clCreateSampler.c b/lib/CL/clCreateSampler.c
index 39913a2..a7f68d8 100644
--- a/lib/CL/clCreateSampler.c
+++ b/lib/CL/clCreateSampler.c
@@ -1,5 +1,31 @@
+/* OpenCL runtime library: clCreateSampler()
+
+   Copyright (c) 2012-2017 pocl developers
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+
 #include "pocl_cl.h"
 #include "pocl_icd.h"
+
+
 extern CL_API_ENTRY cl_sampler CL_API_CALL
 POname(clCreateSampler)(cl_context          context,
                 cl_bool             normalized_coords, 
@@ -8,41 +34,44 @@ POname(clCreateSampler)(cl_context          context,
                 cl_int *            errcode_ret)
 CL_API_SUFFIX__VERSION_1_0
 {
-  int errcode;
-  cl_sampler sampler;
+  int errcode = CL_SUCCESS;
+  cl_sampler sampler = NULL;
+
+  POCL_GOTO_ERROR_COND ((context == NULL), CL_INVALID_CONTEXT);
+
+  /* at least 1 device must support images */
+  size_t i, any_device_has_images = 0;
+  for (i = 0; i < context->num_devices; i++)
+    any_device_has_images += (size_t)context->devices[i]->image_support;
+  POCL_GOTO_ERROR_ON ((!any_device_has_images), CL_INVALID_OPERATION,
+                      "None of the devices within context support images\n");
+
+  /* check requested sampler validity */
+  POCL_GOTO_ERROR_COND (
+      ((normalized_coords != CL_TRUE) && (normalized_coords != CL_FALSE)),
+      CL_INVALID_VALUE);
+  POCL_GOTO_ERROR_COND (((normalized_coords != CL_TRUE)
+                         && (addressing_mode == CL_ADDRESS_MIRRORED_REPEAT)),
+                        CL_INVALID_VALUE);
+  POCL_GOTO_ERROR_COND (((normalized_coords != CL_TRUE)
+                         && (addressing_mode == CL_ADDRESS_REPEAT)),
+                        CL_INVALID_VALUE);
 
-  POCL_GOTO_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
-  
   sampler = (cl_sampler) malloc(sizeof(struct _cl_sampler));
-  if (sampler == NULL)
-  {
-    errcode = CL_OUT_OF_HOST_MEMORY;
-    goto ERROR;
-  }
-  
-  if (normalized_coords == CL_TRUE)
-    POCL_ABORT_UNIMPLEMENTED("clCreateSampler: normalized_coords");
-  
-  if (addressing_mode != CL_ADDRESS_CLAMP_TO_EDGE)
-    POCL_ABORT_UNIMPLEMENTED("clCreateSampler: Addressing modes "
-                              "other than CL_ADDRESS_CLAMP_TO_EDGE");
-  
-  if (filter_mode != CL_FILTER_NEAREST)
-    POCL_ABORT_UNIMPLEMENTED("clCreateSampler: Filter modes other than "
-                                    "CL_FILTER_NEAREST");
-  
-  POCL_INIT_ICD_OBJECT(sampler);
+  POCL_GOTO_ERROR_COND ((sampler == NULL), CL_OUT_OF_HOST_MEMORY);
+
+  POCL_INIT_OBJECT (sampler);
+  POname (clRetainContext) (context);
+  sampler->context = context;
   sampler->normalized_coords = normalized_coords;
   sampler->addressing_mode = addressing_mode;
   sampler->filter_mode = filter_mode;
-  
-  return sampler;
 
 ERROR:
   if(errcode_ret)
   {
     *errcode_ret = errcode;
   }
-    return NULL;
+  return sampler;
 }
 POsym(clCreateSampler)
diff --git a/lib/CL/clCreateSubBuffer.c b/lib/CL/clCreateSubBuffer.c
index f814203..0f2fc31 100644
--- a/lib/CL/clCreateSubBuffer.c
+++ b/lib/CL/clCreateSubBuffer.c
@@ -21,8 +21,9 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
 #include "devices.h"
+#include "pocl_cl.h"
+#include "pocl_util.h"
 
 /* NOTE: this function is untested! */
 CL_API_ENTRY cl_mem CL_API_CALL
@@ -37,6 +38,8 @@ POname(clCreateSubBuffer)(cl_mem                   buffer,
   int errcode;
   unsigned i;
 
+  HANDLE_IMAGE1D_BUFFER (buffer);
+
   POCL_GOTO_ERROR_COND((buffer == NULL), CL_INVALID_MEM_OBJECT);
 
   POCL_GOTO_ERROR_ON((buffer->parent != NULL), CL_INVALID_MEM_OBJECT,
@@ -67,13 +70,14 @@ POname(clCreateSubBuffer)(cl_mem                   buffer,
   mem->mappings = NULL;
   mem->destructor_callbacks = NULL;
   mem->parent = buffer;
-
   mem->type = CL_MEM_OBJECT_BUFFER;
   mem->size = info->size;
+  mem->origin = info->origin;
   mem->context = buffer->context;
   mem->latest_event = NULL;
   mem->owning_device = buffer->owning_device;
   mem->is_pipe = CL_FALSE;
+  mem->mem_host_ptr = NULL;
 
   POCL_GOTO_ERROR_ON((buffer->flags & CL_MEM_WRITE_ONLY &&
        flags & (CL_MEM_READ_WRITE | CL_MEM_READ_ONLY)), CL_INVALID_VALUE,
@@ -105,26 +109,7 @@ POname(clCreateSubBuffer)(cl_mem                   buffer,
        "Invalid flags: buffer is CL_MEM_HOST_NO_ACCESS, requested sub-buffer "
        "(CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_WRITE_ONLY)\n");
 
-  if ((flags & CL_MEM_READ_WRITE) |
-      (flags & CL_MEM_READ_ONLY) |
-      (flags & CL_MEM_WRITE_ONLY)) 
-    { 
-      mem->flags =
-        (flags & CL_MEM_READ_WRITE) |
-        (flags & CL_MEM_READ_ONLY) |
-        (flags & CL_MEM_WRITE_ONLY);
-    } else 
-    {
-      mem->flags =
-        (buffer->flags & CL_MEM_READ_WRITE) |
-        (buffer->flags & CL_MEM_READ_ONLY) |
-        (buffer->flags & CL_MEM_WRITE_ONLY);
-    }
-
-  mem->flags = mem->flags |
-    (buffer->flags & CL_MEM_USE_HOST_PTR) |
-    (buffer->flags & CL_MEM_ALLOC_HOST_PTR) |
-    (buffer->flags & CL_MEM_COPY_HOST_PTR);
+  pocl_cl_mem_inherit_flags (mem, buffer, flags);
 
   if (mem->flags & CL_MEM_USE_HOST_PTR || mem->flags & CL_MEM_ALLOC_HOST_PTR)
     {
@@ -141,7 +126,7 @@ POname(clCreateSubBuffer)(cl_mem                   buffer,
 
   for (i = 0; i < pocl_num_devices; ++i)
     mem->device_ptrs[i].mem_ptr = NULL;
-  
+
   for (i = 0; i < mem->context->num_devices; ++i)
     {
       device = mem->context->devices[i];
diff --git a/lib/CL/clCreateSubDevices.c b/lib/CL/clCreateSubDevices.c
index 4fefa0d..01c02a5 100644
--- a/lib/CL/clCreateSubDevices.c
+++ b/lib/CL/clCreateSubDevices.c
@@ -53,83 +53,123 @@ POname(clCreateSubDevices)(cl_device_id in_device,
    POCL_GOTO_ERROR_COND((num_devices && !out_devices), CL_INVALID_VALUE);
    POCL_GOTO_ERROR_COND((!num_devices && out_devices), CL_INVALID_VALUE);
 
+   POCL_GOTO_ERROR_ON (
+       (in_device->max_sub_devices == 0), CL_DEVICE_PARTITION_FAILED,
+       "Device %s cannot be further partitioned\n", in_device->short_name);
+
    /* check that the partition property is supported by the device */
-   errcode = CL_INVALID_VALUE;
+   POCL_GOTO_ERROR_ON ((in_device->num_partition_properties == 0),
+                       CL_INVALID_VALUE,
+                       "Device %s does not support any partition property\n",
+                       in_device->short_name);
+
    for (i = 0; i < in_device->num_partition_properties; ++i) {
      if (properties[0] == in_device->partition_properties[i]) {
-       errcode = CL_SUCCESS;
        break;
      }
    }
-   if (errcode != CL_SUCCESS)
-     goto ERROR;
+
+   POCL_GOTO_ERROR_ON (
+       (i == in_device->num_partition_properties), CL_INVALID_VALUE,
+       "Device %s does not support the requested partition property\n",
+       in_device->short_name);
 
    /* Ok, it's a supported partition property, count the number of devices; currently,
     * we only support EQUALLY and BY_COUNTS, which enumerate the number of devices
     * differently */
-   if (properties[0] == CL_DEVICE_PARTITION_EQUALLY) {
-     // error out if the number of CUs per device is 0 or bigger than the number of
-     // CUs of in_device
-     POCL_GOTO_ERROR_COND(
-       (properties[1] == 0 || (cl_uint)properties[1] > in_device->max_compute_units),
-       CL_INVALID_VALUE);
-     // error out if properties isn't zero-terminated
-     POCL_GOTO_ERROR_COND(properties[2] != 0, CL_INVALID_VALUE);
-
-     count_devices = in_device->max_compute_units / properties[1];
-     num_props = 3; // partition type, CUs per device, terminating 0
-   } else if (properties[0] == CL_DEVICE_PARTITION_BY_COUNTS) {
-     cl_uint total_cus = 0;
-     i = 1;
-     while (properties[i] != 0) {
-       count_devices++;
-       total_cus += properties[i];
-       ++i;
+   if (properties[0] == CL_DEVICE_PARTITION_EQUALLY)
+     {
+       /* error out if the number of CUs per device is 0 or bigger than the
+        * number of CUs of in_device */
+       POCL_GOTO_ERROR_COND (
+           (properties[1] == 0
+            || (cl_uint)properties[1] > in_device->max_compute_units),
+           CL_INVALID_VALUE);
+       // error out if properties isn't zero-terminated
+       POCL_GOTO_ERROR_COND (properties[2] != 0, CL_INVALID_VALUE);
+
+       count_devices = in_device->max_compute_units / properties[1];
+       num_props = 3; // partition type, CUs per device, terminating 0
+     }
+   else if (properties[0] == CL_DEVICE_PARTITION_BY_COUNTS)
+     {
+       cl_uint total_cus = 0;
+       i = 1;
+       while (properties[i] != 0)
+         {
+           count_devices++;
+           total_cus += properties[i];
+           ++i;
+         }
+       /* error out if the total number of CUs surpasses the number of device
+        * CUs, or if we have to many subdevices */
+       POCL_GOTO_ERROR_COND ((total_cus == 0),
+                             CL_INVALID_DEVICE_PARTITION_COUNT);
+       POCL_GOTO_ERROR_COND ((total_cus > in_device->max_compute_units),
+                             CL_INVALID_DEVICE_PARTITION_COUNT);
+       POCL_GOTO_ERROR_COND ((count_devices > in_device->max_sub_devices),
+                             CL_INVALID_DEVICE_PARTITION_COUNT);
+       num_props = count_devices
+                   + 2; /* partition type, one spec per device, terminating 0 */
+     }
+   else
+     {
+       /* we end here if some of our devices claim to support a different
+        * partition type, but this function was not updated accordingly */
+       POCL_GOTO_ERROR_ON (1, CL_INVALID_VALUE,
+                           "Device reported partition type 0x%x "
+                           "is not supported by Pocl\n",
+                           (unsigned int)properties[0]);
      }
-     // error out if the total number of CUs surpasses the number of device CUs,
-     // or if we have to many subdevices
-     POCL_GOTO_ERROR_COND(
-       (total_cus == 0 || total_cus > in_device->max_compute_units ||
-        count_devices > in_device->max_sub_devices),
-       CL_INVALID_DEVICE_PARTITION_COUNT);
-     num_props = count_devices + 2; // partition type, one spec per device, terminating 0
-   } else {
-     // we end here if some of our devices claim to support a different
-     // partition type, but this function was not updated accordingly
-
-     char what[1024];
-     snprintf(what, 1024, "Device-reported partition type 0x%x", (unsigned int)properties[0]);
-     POCL_ABORT_UNIMPLEMENTED(what);
-   }
 
-   // num_devices must match count_devices if non-zero
-   POCL_GOTO_ERROR_COND((num_devices && count_devices != num_devices), CL_INVALID_VALUE);
+   // num_devices must be greater than or equal to count_devices if non-zero
+   POCL_GOTO_ERROR_COND((num_devices && num_devices < count_devices), CL_INVALID_VALUE);
 
    if (out_devices) {
      // we allocate our own array of devices to simplify management
      new_devs = calloc(count_devices, sizeof(cl_device_id));
      POCL_GOTO_ERROR_COND((!new_devs), CL_OUT_OF_HOST_MEMORY);
+     unsigned sum = 0;
 
      for (i = 0; i < count_devices; ++i) {
        new_devs[i] = calloc(1, sizeof(struct _cl_device_id));
        POCL_GOTO_ERROR_COND((new_devs[i] == NULL), CL_OUT_OF_HOST_MEMORY);
-       POCL_INIT_OBJECT(new_devs[i]);
 
        // clone in_device
        memcpy(new_devs[i], in_device, sizeof(struct _cl_device_id));
+       /* this must be done AFTER the clone, otherwise we end up with
+        * lock states and refcounts copied from parent device */
+       POCL_INIT_OBJECT (new_devs[i]);
+
+       new_devs[i]->parent_device = in_device;
 
-       // override the fields: partition type, parent, max_compute_units,
-       // max_sub_devices
+       new_devs[i]->max_sub_devices = new_devs[i]->max_compute_units
+           = (properties[0] == CL_DEVICE_PARTITION_EQUALLY
+                  ? properties[1]
+                  : properties[i + 1]);
+
+       /* for devices with 1 CU, report zero subdevices and
+        * no partitioning support. */
+       if (new_devs[i]->max_compute_units == 1)
+         {
+           new_devs[i]->max_sub_devices = 0;
+           new_devs[i]->num_partition_properties = 0;
+           new_devs[i]->partition_properties = NULL;
+         }
+
+       /* copy the partition type argument, for clGetDeviceInfo() */
        new_devs[i]->partition_type = calloc(num_props, sizeof(*properties));
        POCL_GOTO_ERROR_COND((new_devs[i]->partition_type == NULL),
          CL_OUT_OF_HOST_MEMORY);
        memcpy(new_devs[i]->partition_type, properties, num_props*sizeof(*properties));
        new_devs[i]->num_partition_types = num_props;
 
-       new_devs[i]->parent_device = in_device;
-       new_devs[i]->max_sub_devices = new_devs[i]->max_compute_units =
-         (properties[0] == CL_DEVICE_PARTITION_EQUALLY ? properties[1] :
-          properties[i+1]);
+       new_devs[i]->core_count = new_devs[i]->max_compute_units;
+       if (in_device->parent_device)
+         new_devs[i]->core_start = in_device->core_start + sum;
+       else
+         new_devs[i]->core_start = sum;
+       sum += new_devs[i]->core_count;
      }
 
      memcpy(out_devices, new_devs, count_devices*sizeof(cl_device_id));
@@ -150,12 +190,15 @@ ERROR:
       // with the ones we actually managed to allocate
       if (new_devs[i] == NULL)
         break;
-      POCL_RELEASE_OBJECT(new_devs[i], new_refcount);
+      POCL_RELEASE_OBJECT (new_devs[i], new_refcount);
       if (new_refcount == 0)
-        POCL_MEM_FREE(new_devs[i]);
+        {
+          POCL_MEM_FREE (new_devs[i]);
+          POCL_MEM_FREE (new_devs[i]->partition_type);
+        }
     }
 
-    free(new_devs);
+    POCL_MEM_FREE (new_devs);
   }
   return errcode;
 
diff --git a/lib/CL/clCreateUserEvent.c b/lib/CL/clCreateUserEvent.c
index 984b0d8..08b3db0 100644
--- a/lib/CL/clCreateUserEvent.c
+++ b/lib/CL/clCreateUserEvent.c
@@ -20,6 +20,11 @@ POname(clCreateUserEvent)(cl_context     context ,
       event->pocl_refcount = 1;
       event->status = CL_SUBMITTED;
       event->context = context;
+      pocl_user_event_data *p = malloc (sizeof (pocl_user_event_data));
+      assert (p);
+      pthread_cond_init (&p->wakeup_cond, NULL);
+      pthread_mutex_init (&p->lock, NULL);
+      event->data = p;
     }
 
   if (errcode_ret)
diff --git a/lib/CL/clEnqueueBarrierWithWaitList.c b/lib/CL/clEnqueueBarrierWithWaitList.c
index 48aa2cb..456028f 100644
--- a/lib/CL/clEnqueueBarrierWithWaitList.c
+++ b/lib/CL/clEnqueueBarrierWithWaitList.c
@@ -32,30 +32,32 @@ POname(clEnqueueBarrierWithWaitList)(cl_command_queue command_queue,
                                      cl_event         *event) 
 CL_API_SUFFIX__VERSION_1_2
 {
+  int errcode;
   _cl_command_node *cmd;
-  unsigned i;
-
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  for (i = 0; i < num_events_in_wait_list; i++)
-    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-  POCL_RETURN_ERROR_COND((command_queue->device == NULL), CL_INVALID_COMMAND_QUEUE);
-  POCL_RETURN_ERROR_COND((command_queue->context == NULL), CL_INVALID_COMMAND_QUEUE);
+
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   /* Even if we do not need to create a full command, the runtime requires it */
-  pocl_create_command (&cmd, command_queue, 
-                       CL_COMMAND_BARRIER, NULL, 
-                       num_events_in_wait_list, event_wait_list, 0, NULL);
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_BARRIER,
+                                 event, num_events_in_wait_list,
+                                 event_wait_list, 0, NULL);
+
+  if (errcode != CL_SUCCESS)
+    goto ERROR;
 
   cmd->command.barrier.has_wait_list = num_events_in_wait_list;
   pocl_command_enqueue(command_queue, cmd);
 
   return CL_SUCCESS;
+
+ ERROR:
+  POCL_MEM_FREE(cmd);
+  return errcode;
+
 }
 POsym(clEnqueueBarrierWithWaitList)
diff --git a/lib/CL/clEnqueueCopyBuffer.c b/lib/CL/clEnqueueCopyBuffer.c
index edae42a..413a371 100644
--- a/lib/CL/clEnqueueCopyBuffer.c
+++ b/lib/CL/clEnqueueCopyBuffer.c
@@ -62,11 +62,10 @@ CL_API_SUFFIX__VERSION_1_0
 
   POCL_RETURN_ERROR_COND((size == 0), CL_INVALID_VALUE);
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   if (pocl_buffers_boundcheck(src_buffer, dst_buffer, src_offset,
         dst_offset, size) != CL_SUCCESS) return CL_INVALID_VALUE;
diff --git a/lib/CL/clEnqueueFillBuffer.c b/lib/CL/clEnqueueFillBuffer.c
index 9c330f3..62d7084 100644
--- a/lib/CL/clEnqueueFillBuffer.c
+++ b/lib/CL/clEnqueueFillBuffer.c
@@ -49,11 +49,10 @@ CL_API_SUFFIX__VERSION_1_2
   POCL_RETURN_ERROR_ON((command_queue->context != buffer->context), CL_INVALID_CONTEXT,
                        "buffer and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   errcode = pocl_buffer_boundcheck(buffer, offset, size);
   if (errcode != CL_SUCCESS)
diff --git a/lib/CL/clEnqueueFillImage.c b/lib/CL/clEnqueueFillImage.c
index e22c165..7386474 100644
--- a/lib/CL/clEnqueueFillImage.c
+++ b/lib/CL/clEnqueueFillImage.c
@@ -39,9 +39,7 @@ CL_API_SUFFIX__VERSION_1_2
 {
   int errcode = CL_SUCCESS;
   _cl_command_node *cmd = NULL;
-  cl_image_format *supported_image_formats = NULL;
   void *fill_pixel = NULL;
-/*  size_t tuned_origin[3]; */
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -59,18 +57,12 @@ CL_API_SUFFIX__VERSION_1_2
   POCL_RETURN_ERROR_ON((!image->is_image), CL_INVALID_MEM_OBJECT,
                                                 "image argument is not an image\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-
-  errcode = pocl_check_image_origin_region (image, origin, region);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
   if (errcode != CL_SUCCESS)
     return errcode;
 
-  errcode = pocl_check_device_supports_image(image, command_queue);
+  errcode = pocl_check_image_origin_region (image, origin, region);
   if (errcode != CL_SUCCESS)
     return errcode;
 
@@ -81,59 +73,52 @@ CL_API_SUFFIX__VERSION_1_2
       goto ERROR_CLEAN;
     }
 
-  /* TODO: channel order, saturating data type conversion */
-  if (image->image_elem_size == 1)
-    {
-      ((cl_char4*)fill_pixel)->s[0] = ((cl_int4*)fill_color)->s[0];
-      ((cl_char4*)fill_pixel)->s[1] = ((cl_int4*)fill_color)->s[1];
-      ((cl_char4*)fill_pixel)->s[2] = ((cl_int4*)fill_color)->s[2];
-      ((cl_char4*)fill_pixel)->s[3] = ((cl_int4*)fill_color)->s[3];
-    }
-  if (image->image_elem_size == 2)
-    {
-      ((cl_short4*)fill_pixel)->s[0] = ((cl_int4*)fill_color)->s[0];
-      ((cl_short4*)fill_pixel)->s[1] = ((cl_int4*)fill_color)->s[1];
-      ((cl_short4*)fill_pixel)->s[2] = ((cl_int4*)fill_color)->s[2];
-      ((cl_short4*)fill_pixel)->s[3] = ((cl_int4*)fill_color)->s[3];
-    }
- if (image->image_elem_size == 4)
-    {
-      memcpy (fill_pixel, fill_color, sizeof (cl_int4));      
-    }
-
-  /* POCL uses top-left corner as origin for images and AMD SDK ImageOverlap 
-     test uses bottom-left corner as origin. Because of this we need to modify 
-     y-coordinate so the fill goes in the right place.
-  tuned_origin[0] = origin[0];
-  tuned_origin[1] = image->image_height - region[1] - origin[1];
-  tuned_origin[2] = origin[2];
-  */
-  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_FILL_IMAGE, 
-                                 event, num_events_in_wait_list, 
+  /* The fill color is:
+   *
+   * a four component RGBA floating-point color value if the image channel
+   * data type is NOT an unnormalized signed and unsigned integer type,
+   *
+   * a four component signed integer value if the image channel data type
+   * is an unnormalized signed integer type and
+   *
+   * a four component unsigned integer value if the image channel data type
+   * is an unormalized unsigned integer type.
+   *
+   * The fill color will be converted to the appropriate
+   * image channel format and order associated with image.
+   */
+  pocl_write_pixel_zero (fill_pixel, fill_color, image->image_channel_order,
+                         image->image_elem_size,
+                         image->image_channel_data_type);
+
+  cl_mem saved_image = image;
+  HANDLE_IMAGE1D_BUFFER (image);
+
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_FILL_IMAGE,
+                                 event, num_events_in_wait_list,
                                  event_wait_list, 1, &image);
   if (errcode != CL_SUCCESS)
     goto ERROR_CLEAN;
 
+  cmd->command.fill_image.rowpitch = saved_image->image_row_pitch;
+  cmd->command.fill_image.slicepitch = saved_image->image_slice_pitch;
+  cmd->command.fill_image.fill_pixel = fill_pixel;
+  cmd->command.fill_image.pixel_size
+      = saved_image->image_elem_size * saved_image->image_channels;
   cmd->command.fill_image.data = command_queue->device->data;
   cmd->command.fill_image.device_ptr = 
     image->device_ptrs[command_queue->device->dev_id].mem_ptr;
   memcpy (&(cmd->command.fill_image.buffer_origin), origin, 
           3*sizeof(size_t));
   memcpy (&(cmd->command.fill_image.region), region, 3*sizeof(size_t));
-  cmd->command.fill_image.rowpitch = image->image_row_pitch;
-  cmd->command.fill_image.slicepitch = image->image_slice_pitch;
-  cmd->command.fill_image.fill_pixel = fill_pixel;
-  cmd->command.fill_image.pixel_size = image->image_elem_size * image->image_channels;
 
   POname(clRetainMemObject) (image);
   image->owning_device = command_queue->device;
   pocl_command_enqueue(command_queue, cmd);
   
-  POCL_MEM_FREE(supported_image_formats);
   return errcode;
   
  ERROR_CLEAN:
-  POCL_MEM_FREE(supported_image_formats);
   POCL_MEM_FREE(fill_pixel);
   return errcode;
 }
diff --git a/lib/CL/clEnqueueMapBuffer.c b/lib/CL/clEnqueueMapBuffer.c
index 58896b5..d04d764 100644
--- a/lib/CL/clEnqueueMapBuffer.c
+++ b/lib/CL/clEnqueueMapBuffer.c
@@ -59,14 +59,14 @@ POname(clEnqueueMapBuffer)(cl_command_queue command_queue,
   POCL_GOTO_ERROR_ON((command_queue->context != buffer->context),
     CL_INVALID_CONTEXT, "buffer and command_queue are not from the same context\n");
 
-  POCL_GOTO_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_GOTO_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    goto ERROR;
 
   errcode = pocl_buffer_boundcheck(buffer, offset, size);
-  if (errcode != CL_SUCCESS) goto ERROR;
+  if (errcode != CL_SUCCESS)
+    goto ERROR;
 
   POCL_GOTO_ERROR_ON((buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS) &&
     map_flags & CL_MAP_READ), CL_INVALID_OPERATION, "buffer has been created with "
@@ -90,7 +90,8 @@ POname(clEnqueueMapBuffer)(cl_command_queue command_queue,
       goto ERROR;
     }
 
-  if (buffer->flags & CL_MEM_USE_HOST_PTR || buffer->flags & CL_MEM_ALLOC_HOST_PTR)
+  if ((buffer->flags & CL_MEM_USE_HOST_PTR)
+      || (buffer->flags & CL_MEM_ALLOC_HOST_PTR))
     {
       /* In this case it should use the given host_ptr + offset as
          the mapping area in the host memory. */   
@@ -103,11 +104,10 @@ POname(clEnqueueMapBuffer)(cl_command_queue command_queue,
          the mapping will be stored (the last argument is NULL) in
          the host memory. When the last argument is non-NULL, the
          buffer will be mapped there (assumed it will succeed).  */
-      
-      host_ptr = device->ops->map_mem 
-        (device->data, buffer->device_ptrs[device->dev_id].mem_ptr, offset, 
-         size, 
-         NULL);
+
+      host_ptr = device->ops->map_mem (
+          device->data, buffer->device_ptrs[device->dev_id].mem_ptr, offset,
+          size, NULL);
     }
 
   if (host_ptr == NULL)
@@ -130,10 +130,12 @@ POname(clEnqueueMapBuffer)(cl_command_queue command_queue,
   mapping_info->offset = offset;
   mapping_info->size = size;
   POCL_LOCK_OBJ (buffer);
-  DL_APPEND (buffer->mappings, mapping_info);  
+  DL_APPEND (buffer->mappings, mapping_info);
   POCL_UNLOCK_OBJ (buffer);
 
-  POname(clRetainMemObject) (buffer);
+  POCL_MSG_PRINT_MEMORY ("Buffer %p New Mapping: host_ptr %p offset %zu\n",
+                         buffer, mapping_info->host_ptr, mapping_info->offset);
+
   buffer->owning_device = command_queue->device;
   pocl_command_enqueue(command_queue, cmd);
 
@@ -161,21 +163,3 @@ ERROR:
   return NULL;
 }
 POsym(clEnqueueMapBuffer)
-
-void*
-pocl_map_mem_cmd(cl_device_id device, 
-                 cl_mem buffer, 
-                 mem_mapping_t *mapping_info) {
-
-
-  
-  /* The second call ensures the memory is flushed/updated to the
-     host location. */
-  device->ops->map_mem 
-    (device->data, buffer->device_ptrs[device->dev_id].mem_ptr, 
-     mapping_info->offset, mapping_info->size, mapping_info->host_ptr);
-  
-  buffer->map_count++;
-  return mapping_info->host_ptr;
-
-}
diff --git a/lib/CL/clEnqueueMapImage.c b/lib/CL/clEnqueueMapImage.c
index fbeae8e..b300419 100644
--- a/lib/CL/clEnqueueMapImage.c
+++ b/lib/CL/clEnqueueMapImage.c
@@ -49,7 +49,6 @@ CL_API_SUFFIX__VERSION_1_0
   cl_device_id device;
   _cl_command_node *cmd = NULL;
   mem_mapping_t *mapping_info = NULL;
-  cl_uint event_i;
 
   POCL_GOTO_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -66,24 +65,8 @@ CL_API_SUFFIX__VERSION_1_0
   POCL_GOTO_ERROR_ON((!image->is_image), CL_INVALID_MEM_OBJECT,
     "image argument is not an image type cl_mem\n");
 
-  POCL_GOTO_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_GOTO_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  for (event_i = 0; event_i < num_events_in_wait_list; ++event_i)
-    {
-      POCL_GOTO_ERROR_COND((event_wait_list[event_i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
-      if (event_i > 0)
-        {
-          POCL_GOTO_ERROR_COND((event_wait_list[event_i]->context 
-                                  != event_wait_list[event_i - 1]->context), 
-                                 CL_INVALID_CONTEXT);
-        }
-    }
-
-  errcode = pocl_check_device_supports_image(image, command_queue);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
   if (errcode != CL_SUCCESS)
     goto ERROR;
 
@@ -101,9 +84,13 @@ CL_API_SUFFIX__VERSION_1_0
        "image_slice_pitch must be a non-NULL value\n");
 
   /* TODO: more error checks */
-  
-  offset = image->image_channels * image->image_elem_size * origin[0];
-  
+  size_t tuned_origin[3]
+      = { origin[0] * image->image_elem_size * image->image_channels,
+          origin[1], origin[2] };
+
+  offset = tuned_origin[0] + tuned_origin[1] * image->image_row_pitch
+           + tuned_origin[2] * image->image_slice_pitch;
+
   mapping_info = (mem_mapping_t*) malloc (sizeof (mem_mapping_t));
   if (mapping_info == NULL)
     {
@@ -111,6 +98,31 @@ CL_API_SUFFIX__VERSION_1_0
       goto ERROR;
     }
 
+  *image_row_pitch = image->image_row_pitch;
+  if (image_slice_pitch)
+    *image_slice_pitch = image->image_slice_pitch;
+
+  HANDLE_IMAGE1D_BUFFER (image);
+
+  /* CL_INVALID_OPERATION if buffer has been created with
+   * CL_MEM_HOST_WRITE_ONLY
+   * or CL_MEM_HOST_NO_ACCESS and CL_MAP_READ is set in map_flags or
+   *
+   * if buffer has been created with CL_MEM_HOST_READ_ONL or
+   * CL_MEM_HOST_NO_ACCESS
+   * and CL_MAP_WRITE or CL_MAP_WRITE_INVALIDATE_REGION is set in map_flags.
+   */
+
+  POCL_GOTO_ERROR_COND (
+      ((map_flags & CL_MAP_READ)
+       && (image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS))),
+      CL_INVALID_OPERATION);
+
+  POCL_GOTO_ERROR_COND (
+      ((map_flags & CL_MAP_WRITE)
+       && (image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS))),
+      CL_INVALID_OPERATION);
+
   if (image->flags & CL_MEM_USE_HOST_PTR)
     {
       /* In this case it should use the given host_ptr + offset as
@@ -133,7 +145,7 @@ CL_API_SUFFIX__VERSION_1_0
 
   if (map == NULL)
     {
-      POCL_UPDATE_EVENT_COMPLETE(event);
+      POCL_UPDATE_EVENT_COMPLETE (*event);
       errcode = CL_MAP_FAILURE;
       goto ERROR;
     }
@@ -147,6 +159,9 @@ CL_API_SUFFIX__VERSION_1_0
   DL_APPEND (image->mappings, mapping_info);
   POCL_UNLOCK_OBJ (image);
 
+  POCL_MSG_PRINT_MEMORY ("Image %p, Mapping: host_ptr %p offset %zu\n", image,
+                         mapping_info->host_ptr, mapping_info->offset);
+
   errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_MAP_IMAGE, 
                                  event, num_events_in_wait_list, 
                                  event_wait_list, 1, &image);
@@ -157,8 +172,6 @@ CL_API_SUFFIX__VERSION_1_0
   cmd->command.map.mapping = mapping_info;
   POname(clRetainMemObject) (image);
   image->owning_device = command_queue->device;
-
-  image->owning_device = command_queue->device;
   pocl_command_enqueue(command_queue, cmd);
 
   if (blocking_map)
@@ -166,10 +179,6 @@ CL_API_SUFFIX__VERSION_1_0
       POname(clFinish) (command_queue);
     }
 
-  *image_row_pitch = image->image_row_pitch;
-  if (image_slice_pitch)
-    *image_slice_pitch = image->image_slice_pitch;
-
   if (errcode_ret != NULL)
     (*errcode_ret) = CL_SUCCESS;
 
diff --git a/lib/CL/clEnqueueMarkerWithWaitList.c b/lib/CL/clEnqueueMarkerWithWaitList.c
index ef62e37..304806b 100644
--- a/lib/CL/clEnqueueMarkerWithWaitList.c
+++ b/lib/CL/clEnqueueMarkerWithWaitList.c
@@ -38,6 +38,11 @@ CL_API_SUFFIX__VERSION_1_2
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
+
   errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_MARKER, 
                                  event, num_events_in_wait_list, 
                                  event_wait_list, 0, NULL);
diff --git a/lib/CL/clEnqueueMigrateMemObjects.c b/lib/CL/clEnqueueMigrateMemObjects.c
index 47e49c7..e969f56 100644
--- a/lib/CL/clEnqueueMigrateMemObjects.c
+++ b/lib/CL/clEnqueueMigrateMemObjects.c
@@ -38,17 +38,21 @@ POname(clEnqueueMigrateMemObjects) (cl_command_queue command_queue,
   int errcode;
   _cl_command_node *cmd = NULL;
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
-
   POCL_RETURN_ERROR_COND((num_mem_objects == 0), CL_INVALID_VALUE);
   POCL_RETURN_ERROR_COND((mem_objects == NULL), CL_INVALID_VALUE);
 
+  cl_mem_migration_flags invalid_flags =
+     ~(CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED | CL_MIGRATE_MEM_OBJECT_HOST);
+  POCL_RETURN_ERROR_COND (((flags != 0) && (flags & invalid_flags)),
+                          CL_INVALID_VALUE);
+  /* TODO check if it's OK to ignore flags. */
+
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
+
   for (i = 0; i < num_mem_objects; ++i)
     {
       POCL_RETURN_ERROR_COND((mem_objects[i] == NULL), CL_INVALID_MEM_OBJECT);
@@ -57,9 +61,6 @@ POname(clEnqueueMigrateMemObjects) (cl_command_queue command_queue,
         CL_INVALID_CONTEXT);
     }
 
-  for (i = 0; i < num_events_in_wait_list; i++)
-    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
-
   errcode = pocl_create_command (&cmd, command_queue,
                                  CL_COMMAND_MIGRATE_MEM_OBJECTS,
                                  event, num_events_in_wait_list,
@@ -71,16 +72,18 @@ POname(clEnqueueMigrateMemObjects) (cl_command_queue command_queue,
   cmd->command.migrate.data = command_queue->device->data;
   cmd->command.migrate.num_mem_objects = num_mem_objects;
   cmd->command.migrate.mem_objects = malloc (sizeof (cl_mem) * num_mem_objects);
+  cl_mem *new_mem_objects = cmd->command.migrate.mem_objects;
   cmd->command.migrate.source_devices = malloc
     (num_mem_objects * sizeof (cl_device_id));
-  memcpy (cmd->command.migrate.mem_objects, mem_objects,
-          num_mem_objects * sizeof (cl_mem));
+  memcpy (new_mem_objects, mem_objects, num_mem_objects * sizeof (cl_mem));
 
   for (i = 0; i < num_mem_objects; ++i)
     {
-      POname(clRetainMemObject) (mem_objects[i]);
-      cmd->command.migrate.source_devices[i] = mem_objects[i]->owning_device;
-      mem_objects[i]->owning_device = command_queue->device;
+      HANDLE_IMAGE1D_BUFFER (new_mem_objects[i]);
+      POname (clRetainMemObject) (new_mem_objects[i]);
+      cmd->command.migrate.source_devices[i]
+          = new_mem_objects[i]->owning_device;
+      new_mem_objects[i]->owning_device = command_queue->device;
     }
 
   pocl_command_enqueue (command_queue, cmd);
diff --git a/lib/CL/clEnqueueNDRangeKernel.c b/lib/CL/clEnqueueNDRangeKernel.c
index 919ae60..056a5f9 100644
--- a/lib/CL/clEnqueueNDRangeKernel.c
+++ b/lib/CL/clEnqueueNDRangeKernel.c
@@ -39,11 +39,43 @@
 #include <errno.h>
 #include <string.h>
 
-#define COMMAND_LENGTH 1024
-#define ARGUMENT_STRING_LENGTH 32
-
 //#define DEBUG_NDRANGE
 
+/* Euclid's algorithm for the Greatest Common Divisor */
+static inline size_t
+gcd (size_t a, size_t b)
+{
+  int c;
+  while (a) {
+    c = a; a = b % a; b = c;
+  }
+  return b;
+}
+
+/* Find the largest divisor of dividend which is less than limit */
+static inline size_t
+upper_divisor (size_t dividend, size_t limit)
+{
+  /* The algorithm is currently not very smart, we
+   * start from limit and subtract until we find something
+   * that divides dividend. In optimal conditions this is found
+   * quickly, but it takes limit steps if dividend is prime.
+   * TODO FIXME improve algorithm
+   */
+  if (dividend < limit) return dividend; // small optimization
+  assert (limit > 0); // should never be called with limit == 0
+  while (dividend % limit != 0) --limit;
+  return limit;
+}
+
+/* Check that a divides b and b divides c */
+static inline int
+divide_chain (size_t a, size_t b, size_t c)
+{
+  return (b % a == 0 && c % b == 0);
+}
+
+
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
                        cl_kernel kernel,
@@ -58,31 +90,49 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
   size_t offset_x, offset_y, offset_z;
   size_t global_x, global_y, global_z;
   size_t local_x, local_y, local_z;
+  offset_x = offset_y = offset_z = 0;
+  global_x = global_y = global_z = 0;
+  local_x = local_y = local_z = 0;
+  /* cached values for max_work_item_sizes,
+   * since we are going to access them repeatedly */
+  size_t max_local_x, max_local_y, max_local_z;
+  /* cached values for max_work_group_size,
+   * since we are going to access them repeatedly */
+  size_t max_group_size;
+
   int b_migrate_count, buffer_count;
   unsigned i;
-  int error = 0;
+  int errcode = 0;
   cl_device_id realdev = NULL;
   struct pocl_context pc;
   _cl_command_node *command_node;
   /* alloc from stack to avoid malloc. num_args is the absolute max needed */
-  cl_mem mem_list[kernel->num_args];
+  cl_mem mem_list[kernel->num_args + 1];
   /* reserve space for potential buffer migrate events */
-  cl_event new_event_wait_list[num_events_in_wait_list + kernel->num_args];
+  cl_event new_event_wait_list[num_events_in_wait_list + kernel->num_args + 1];
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   POCL_RETURN_ERROR_COND((kernel == NULL), CL_INVALID_KERNEL);
 
   POCL_RETURN_ERROR_ON((command_queue->context != kernel->context),
-    CL_INVALID_CONTEXT, "kernel and command_queue are not from the same context\n");
+    CL_INVALID_CONTEXT,
+    "kernel and command_queue are not from the same context\n");
+
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   POCL_RETURN_ERROR_COND((work_dim < 1), CL_INVALID_WORK_DIMENSION);
-  POCL_RETURN_ERROR_ON((work_dim > command_queue->device->max_work_item_dimensions),
-    CL_INVALID_WORK_DIMENSION, "work_dim exceeds devices' max workitem dimensions\n");
+  POCL_RETURN_ERROR_ON(
+    (work_dim > command_queue->device->max_work_item_dimensions),
+    CL_INVALID_WORK_DIMENSION,
+    "work_dim exceeds devices' max workitem dimensions\n");
 
-  assert(command_queue->device->max_work_item_dimensions <= 3);
+  assert (command_queue->device->max_work_item_dimensions <= 3);
 
-  realdev = POCL_REAL_DEV(command_queue->device);
+  realdev = pocl_real_dev (command_queue->device);
 
   if (global_work_offset != NULL)
     {
@@ -106,104 +156,285 @@ POname(clEnqueueNDRangeKernel)(cl_command_queue command_queue,
 
   for (i = 0; i < kernel->num_args; i++)
     {
-      POCL_RETURN_ERROR_ON((!kernel->arg_info[i].is_set), CL_INVALID_KERNEL_ARGS,
-        "The %i-th kernel argument is not set!\n", i);
+      POCL_RETURN_ERROR_ON((!kernel->arg_info[i].is_set),
+        CL_INVALID_KERNEL_ARGS, "The %i-th kernel argument is not set!\n", i);
     }
 
+  max_local_x = command_queue->device->max_work_item_sizes[0];
+  max_local_y = command_queue->device->max_work_item_sizes[1];
+  max_local_z = command_queue->device->max_work_item_sizes[2];
+  max_group_size = command_queue->device->max_work_group_size;
+
   if (local_work_size != NULL)
     {
       local_x = local_work_size[0];
       local_y = work_dim > 1 ? local_work_size[1] : 1;
       local_z = work_dim > 2 ? local_work_size[2] : 1;
-      if (local_x > global_x || local_y > global_y || local_z > global_z)
-        goto DETERMINE_LOCAL_SIZE;
+
+      POCL_RETURN_ERROR_ON((local_x * local_y * local_z > max_group_size),
+        CL_INVALID_WORK_GROUP_SIZE,
+        "Local worksize dimensions exceed device's max workgroup size\n");
+
+      POCL_RETURN_ERROR_ON((local_x > max_local_x),
+        CL_INVALID_WORK_ITEM_SIZE,
+        "local_work_size.x > device's max_workitem_sizes[0]\n");
+
+      if (work_dim > 1)
+        POCL_RETURN_ERROR_ON((local_y > max_local_y),
+          CL_INVALID_WORK_ITEM_SIZE,
+          "local_work_size.y > device's max_workitem_sizes[1]\n");
+
+      if (work_dim > 2)
+        POCL_RETURN_ERROR_ON((local_z > max_local_z),
+          CL_INVALID_WORK_ITEM_SIZE,
+          "local_work_size.z > device's max_workitem_sizes[2]\n");
+
+      /* TODO For full 2.x conformance the 'local must divide global'
+       * requirement will have to be limited to the cases of kernels compiled
+       * with the -cl-uniform-work-group-size option
+       */
+      POCL_RETURN_ERROR_COND((global_x % local_x != 0),
+        CL_INVALID_WORK_GROUP_SIZE);
+      POCL_RETURN_ERROR_COND((global_y % local_y != 0),
+        CL_INVALID_WORK_GROUP_SIZE);
+      POCL_RETURN_ERROR_COND((global_z % local_z != 0),
+        CL_INVALID_WORK_GROUP_SIZE);
+
     }
-  else
+
+  /* If the kernel has the reqd_work_group_size attribute, then the local
+   * work size _must_ be specified, and it _must_ match the attribute
+   * specification
+   */
+  if (kernel->reqd_wg_size != NULL &&
+      kernel->reqd_wg_size[0] > 0 &&
+      kernel->reqd_wg_size[1] > 0 &&
+      kernel->reqd_wg_size[2] > 0)
     {
-      /* Embarrassingly parallel kernel with a free work-group
-         size. Try to figure out one which utilizes all the
-         resources efficiently. Assume work-groups are scheduled
-         to compute units, so try to split it to a number of
-         work groups at the equal to the number of CUs, while still
-         trying to respect the preferred WG size multiple (for better
-         SIMD instruction utilization).
+      POCL_RETURN_ERROR_COND((local_work_size == NULL ||
+          local_x != kernel->reqd_wg_size[0] ||
+          local_y != kernel->reqd_wg_size[1] ||
+          local_z != kernel->reqd_wg_size[2]), CL_INVALID_WORK_GROUP_SIZE);
+    }
+  /* otherwise, if the local work size was not specified find the optimal one.
+   * Note that at some point we also checked for local > global. This doesn't
+   * make sense while we only have 1.2 support for kernel enqueue (and
+   * when only uniform group sizes are allowed), but it might turn useful
+   * when picking the hardware sub-group size in more sophisticated
+   * 2.0 support scenarios.
+   */
+  else if (local_work_size == NULL)
+    {
+      /* Embarrassingly parallel kernel with a free work-group size. Try to
+       * figure out one which utilizes all the resources efficiently. Assume
+       * work-groups are scheduled to compute units, so try to split it to a
+       * number of work groups at the equal to the number of CUs, while still
+       * trying to respect the preferred WG size multiple (for better SIMD
+       * instruction utilization).
       */
-      size_t preferred_wg_multiple;
-DETERMINE_LOCAL_SIZE:
-      POname(clGetKernelWorkGroupInfo)
-        (kernel, command_queue->device,
-         CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
-         sizeof (size_t), &preferred_wg_multiple, NULL);
+      size_t preferred_wg_multiple = realdev->preferred_wg_size_multiple;
+
+      if (!preferred_wg_multiple) /* unlikely */
+        preferred_wg_multiple = 1;
 
       POCL_MSG_PRINT_INFO("Preferred WG size multiple %zu\n",
                           preferred_wg_multiple);
 
-      local_x = global_x;
-      local_y = global_y;
-      local_z = global_z;
-
-      /* First try to split a dimension with the WG multiple
-         to make it still be divisible with the WG multiple. */
-      do {
-        /* Split the dimension, but avoid ending up with a dimension that
-           is not multiple of the wanted size. */
-        if (local_x > 1 && local_x % 2 == 0 &&
-            (local_x / 2) % preferred_wg_multiple == 0)
-          {
-            local_x /= 2;
-            continue;
-          }
-        else if (local_y > 1 && local_y % 2 == 0 &&
-                 (local_y / 2) % preferred_wg_multiple == 0)
-          {
-            local_y /= 2;
-            continue;
-          }
-        else if (local_z > 1 && local_z % 2 == 0 &&
-                 (local_z / 2) % preferred_wg_multiple == 0)
-          {
-            local_z /= 2;
-            continue;
-          }
+      /* However, we have some constraints about the local size:
+       * 1. local_{x,y,z} must divide global_{x,y,z} exactly, at least
+       *    as long as we only support uniform group sizes (i.e. OpenCL 1.x);
+       * 2. each of local_{x,y,z} must be less than the corresponding max size
+       *    for the device;
+       * 3. the product of local_{x,y,z} must be less than the maximum local
+       *    work-group size.
+       *
+       * Due to constraint 1., we may not have the possibility to proceed by
+       * multiples of the preferred_wg_multiple (e.g. if preferred = 16 and
+       * global size = 24). Our stepping granularity in each direction will
+       * therefore be the GCD of the global size in that direction and the
+       * preferred wg size.
+       *
+       * Note that the grain might actually be as low as 1, if the two values
+       * are coprimes (e.g. preferred = 8, global size = 17). There is no good
+       * solution in this case, and there's nothing we can do about it. On the
+       * opposite side of the spectrum, we might be lucky and grain_* =
+       * preferred_wg_multiple (this is the case e.g. if the programmer already
+       * checked for the preferred wg multiple and rounded the global size up
+       * to the multiple of it).
+       */
+
+      const size_t grain_x = gcd (preferred_wg_multiple, global_x);
+      const size_t grain_y = gcd (preferred_wg_multiple, global_y);
+      const size_t grain_z = gcd (preferred_wg_multiple, global_z);
+
+      /* We now want to get the largest multiple of the grain size that still
+       * divides global_* _and_ is less than the maximum local size in each
+       * direction.
+       *
+       * So we have G = K*g and we want to find k such that k*g < M and
+       * k*g still divides G, i.e. k must divide K.
+       * The largest multiple of g that is less than M can be found as
+       * (M/g)*g (integer division), so our upper bound for k is k' = M/g.
+       */
+
+      /*                      /------- K ------\  /-------- k' -------\  */
+      local_x = upper_divisor (global_x / grain_x, max_local_x / grain_x);
+      local_y = upper_divisor (global_y / grain_y, max_local_y / grain_y);
+      local_z = upper_divisor (global_z / grain_z, max_local_z / grain_z);
+
+      local_x *= grain_x;
+      local_y *= grain_y;
+      local_z *= grain_z;
+
+      /* So we now have the largest possible local sizes that divide the global
+       * sizes while being multiples of the grain size.
+       * We still have to ensure that the work-group size overall is not larger
+       * than the maximum allowed, and we have to do this while preserving the
+       * 'local divides global' condition, and we would like to preserve the
+       * 'multiple of grain' too, if possible.
+       * We always reduce z first, then y, then x, on the assumption that
+       * kernels will work with x varying faster, and thus being a better
+       * vectorization candidate, followed by y and then by z. (This assumption
+       * is in some sense sanctioned by the standard itself, see e.g. the
+       * get_{global,local}_linear_id functions in OpenCL 2.x)
+       * TODO this might not be optimal in all cases. For example, devices with
+       * a hardware sampler might benefit from more evenly sized work-groups
+       * for kernels that use images. Some kind of kernel + device analysis
+       * would be needed here.
+       */
+
+      while (local_x * local_y * local_z > max_group_size)
+        {
+          /* We are going to try three strategies, in order:
+           *
+           * Halving a coordinate, if the halved coordinate is still a multiple
+           * of the grain size and a divisor of the global size.
+           *
+           * Setting the coordinates with the smallest grain to 1,
+           * since they aren't good candidates for vectorizations anyway.
+           *
+           * Setting to 1 any coordinate, as a desperate measure.
+           */
+
+#define TRY_HALVE(coord) \
+if ((local_##coord & 1) == 0 && \
+    divide_chain (grain_##coord, local_##coord/2, global_##coord)) \
+  { \
+    local_##coord /= 2; \
+    continue; \
+  }
+
+#define TRY_LEAST_GRAIN(c1, c2, c3) \
+if (local_##c1 > 1 && grain_##c1 <= grain_##c2 && grain_##c1 <= grain_##c3) \
+  { \
+    local_##c1 = 1; \
+    continue; \
+  }
+
+#define DESPERATE_CASE(coord) \
+if (local_##coord > 1) \
+  { \
+    local_##coord = 1; \
+    continue; \
+  }
+          /* Halving attempt first */
+          TRY_HALVE(z) else TRY_HALVE(y) else TRY_HALVE(x)
+
+          /* Ok no luck. Find the coordinate with the smallest grain and
+           * kill that */
+          TRY_LEAST_GRAIN(z, x, y) else
+          TRY_LEAST_GRAIN(y, z, x) else
+          TRY_LEAST_GRAIN(x, y, z)
+
+          /* No luck either? Give up, kill everything */
+          DESPERATE_CASE(z) else DESPERATE_CASE(y) else DESPERATE_CASE(x)
+#undef DESPERATE_CASE
+#undef TRY_LEAST_GRAIN
+#undef TRY_HALVE
+        }
 
-        /* Next find out a dimension that is not a multiple anyways,
-           so one cannot nicely vectorize over it, and set it to one. */
-        if (local_z > 1 && local_z % preferred_wg_multiple != 0)
-          {
-            local_z = 1;
-            continue;
-          }
-        else if (local_y > 1 && local_y % preferred_wg_multiple != 0)
-          {
-            local_y = 1;
-            continue;
-          }
-        else if (local_z > 1 && local_z % preferred_wg_multiple != 0)
-          {
-            local_z = 1;
+      /* We now have the largest possible local work-group size that satisfies
+       * all the hard constraints (divide global, per-dimension bound, overall
+       * bound) and our soft constraint of being as close as possible a
+       * multiple of the preferred work-group size multiple. Such a greedy
+       * algorithm minimizes the total number of work-groups. In moderate-sized
+       * launch grid, this may result in less work-groups than the number of
+       * Compute Units, with a resulting imbalance in the workload
+       * distribution. At the same time, we want to avoid too many work-groups,
+       * since some devices are penalized by such fragmentation. Finding a good
+       * balance between the two is a hard problem, and generally depends on
+       * the device as well as the kernel utilization of its resources.
+       * Lacking that, as a first step we will simply try to guarantee that we
+       * have at least one work-group per CU, as long as the local work size
+       * does not drop below a given threshold.
+       */
+
+      /* Pick a minimum work-group size of 4 times the preferred work-group
+       * size multiple, under the assumption that this would be a good
+       * candidate below which a Compute Unit will not do enough work.
+       */
+      const size_t min_group_size = 4 * preferred_wg_multiple;
+
+      /* We need the number of Compute Units in the device, since we want
+       * at least that many work-groups, if possible */
+
+      cl_uint ncus = command_queue->device->max_compute_units;
+
+      /* number of workgroups */
+      size_t nwg_x = global_x / local_x;
+      size_t nwg_y = global_y / local_y;
+      size_t nwg_z = global_z / local_z;
+
+      size_t splits; /* number of splits to bring ngws to reach ncu */
+      /* Only proceed if splitting wouldn't bring us below the minimum
+       * group size */
+      while (((splits = ncus / (nwg_x * nwg_y * nwg_z)) > 1) &&
+             (local_x * local_y * local_z > splits * min_group_size))
+        {
+          /* Very simple splitting approach: find a dimension divisible by
+           * split, and lacking that divide by something less, if possible.
+           * If we fail at splitting at all, we will try killing the smaller of
+           * the dimensions.
+           * We will set splits to 0 if we succeed in the TRY_SPLIT, so that
+           * we know that we can skip the rest.
+           * If we get to the end of the while without splitting and without
+           * killing a dimension, we bail out early because it means we
+           * couldn't do anything useful without dropping below min_group_size.
+           */
+
+#define TRY_SPLIT(coord) \
+if ((local_##coord % splits) == 0 && \
+    divide_chain (grain_##coord, local_##coord/splits, global_##coord)) \
+  { \
+    local_##coord /= splits; nwg_##coord *= splits; splits = 0; \
+    continue; \
+  }
+
+#define TRY_LEAST_DIM(c1, c2, c3) \
+if (local_##c1 > 1 && local_##c1 <= local_##c2 && local_##c1 <= local_##c3 && \
+    local_##c2*local_##c3 >= min_group_size) \
+  { \
+    local_##c1 = 1; nwg_##c1 = global_##c1; \
+    continue; \
+  }
+
+          while (splits > 1)
+            {
+              TRY_SPLIT(z) else TRY_SPLIT(y) else TRY_SPLIT(x)
+                else splits--;
+            }
+          /* When we get here, splits will be 0 if we split, 1 if we failed:
+           * in which case we will just kill one of the dimensions instead,
+           * using the same TRY_LEAST_GRAIN and DESPERATE_CASE seen before
+           */
+          if (splits == 0)
             continue;
-          }
 
-        /* Finally, start setting them to zero starting from the Z
-           dimension. */
-        if (local_z > 1)
-          {
-            local_z = 1;
-            continue;
-          }
-        else if (local_y > 1)
-          {
-            local_y = 1;
-            continue;
-          }
-        else if (local_x > 1)
-          {
-            local_x = 1;
-            continue;
-          }
-      }
-      while (local_x * local_y * local_z >
-             command_queue->device->max_work_group_size);
+          TRY_LEAST_DIM(z, x, y) else TRY_LEAST_DIM(y, z, x) else
+          TRY_LEAST_DIM(x, y, z) else break;
+#undef TRY_LEAST_DIM
+#undef TRY_SPLIT
+        }
     }
 
   POCL_MSG_PRINT_INFO("Queueing kernel %s with local size %u x %u x %u group "
@@ -214,52 +445,23 @@ DETERMINE_LOCAL_SIZE:
                       (unsigned)(global_y / local_y),
                       (unsigned)(global_z / local_z));
 
-  POCL_RETURN_ERROR_ON((local_x * local_y * local_z > command_queue->device->max_work_group_size),
-    CL_INVALID_WORK_GROUP_SIZE, "Local worksize dimensions exceed device's max workgroup size\n");
+  assert (local_x * local_y * local_z <= max_group_size);
+  assert (local_x <= max_local_x);
+  assert (local_y <= max_local_y);
+  assert (local_z <= max_local_z);
 
-  POCL_RETURN_ERROR_ON((local_x > command_queue->device->max_work_item_sizes[0]),
-    CL_INVALID_WORK_ITEM_SIZE, "local_work_size.x > device's max_workitem_sizes[0]\n");
-
-  if (work_dim > 1)
-    POCL_RETURN_ERROR_ON((local_y > command_queue->device->max_work_item_sizes[1]),
-    CL_INVALID_WORK_ITEM_SIZE, "local_work_size.y > device's max_workitem_sizes[1]\n");
-
-  if (work_dim > 2)
-    POCL_RETURN_ERROR_ON((local_z > command_queue->device->max_work_item_sizes[2]),
-    CL_INVALID_WORK_ITEM_SIZE, "local_work_size.z > device's max_workitem_sizes[2]\n");
-
-  POCL_RETURN_ERROR_COND((global_x % local_x != 0), CL_INVALID_WORK_GROUP_SIZE);
-  POCL_RETURN_ERROR_COND((global_y % local_y != 0), CL_INVALID_WORK_GROUP_SIZE);
-  POCL_RETURN_ERROR_COND((global_z % local_z != 0), CL_INVALID_WORK_GROUP_SIZE);
-
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  /* See TODO above for 'local must divide global' */
+  assert (global_x % local_x == 0);
+  assert (global_y % local_y == 0);
+  assert (global_z % local_z == 0);
 
   char cachedir[POCL_FILENAME_LENGTH];
   int realdev_i = pocl_cl_device_to_index (kernel->program, realdev);
-  assert(realdev_i >= 0);
+  assert (realdev_i >= 0);
   pocl_cache_kernel_cachedir_path (cachedir, kernel->program,
                                    realdev_i, kernel, "",
                                    local_x, local_y, local_z);
 
-  if (kernel->program->source || kernel->program->binaries[realdev_i])
-    {
-#ifdef OCS_AVAILABLE
-      // SPMD devices already have compiled at this point
-      if (realdev->spmd)
-        error = CL_SUCCESS;
-      else
-        error = pocl_llvm_generate_workgroup_function (cachedir, realdev, kernel,
-                                                       local_x, local_y, local_z);
-#else
-      error = 1;
-#endif
-      if (error) goto ERROR;
-    }
-
   b_migrate_count = 0;
   buffer_count = 0;
 
@@ -307,13 +509,13 @@ DETERMINE_LOCAL_SIZE:
               sizeof(cl_event) * num_events_in_wait_list);
     }
 
-  error = pocl_create_command (&command_node, command_queue,
+  errcode = pocl_create_command (&command_node, command_queue,
                                CL_COMMAND_NDRANGE_KERNEL, event,
                                num_events_in_wait_list + b_migrate_count,
                                (num_events_in_wait_list + b_migrate_count)?
                                new_event_wait_list : NULL,
                                buffer_count, mem_list);
-  if (error != CL_SUCCESS)
+  if (errcode != CL_SUCCESS)
     goto ERROR;
 
   pc.work_dim = work_dim;
@@ -370,10 +572,10 @@ DETERMINE_LOCAL_SIZE:
   POname(clRetainKernel) (kernel);
 
   pocl_command_enqueue (command_queue, command_node);
-  error = CL_SUCCESS;
+  errcode = CL_SUCCESS;
 
 ERROR:
-  return error;
+  return errcode;
 
 }
 POsym(clEnqueueNDRangeKernel)
diff --git a/lib/CL/clEnqueueNativeKernel.c b/lib/CL/clEnqueueNativeKernel.c
index dfb91ae..525fbee 100644
--- a/lib/CL/clEnqueueNativeKernel.c
+++ b/lib/CL/clEnqueueNativeKernel.c
@@ -23,7 +23,7 @@ POname(clEnqueueNativeKernel)(cl_command_queue   command_queue ,
   _cl_command_node *command_node = NULL;
   cl_mem *mem_list_copy = NULL;
   void *args_copy = NULL;
-  cl_int error;
+  cl_int errcode;
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -44,18 +44,18 @@ POname(clEnqueueNativeKernel)(cl_command_queue   command_queue ,
     CL_EXEC_NATIVE_KERNEL), CL_INVALID_OPERATION, "device associated with "
     "command_queue cannot execute the native kernel\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
 
-  error = pocl_create_command (&command_node, command_queue,
+  errcode = pocl_create_command (&command_node, command_queue,
                                CL_COMMAND_NATIVE_KERNEL,
                                event, num_events_in_wait_list,
                                event_wait_list, num_mem_objects, mem_list);
-  if (error != CL_SUCCESS)
-    return error;
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   command_node->command.native.data = command_queue->device->data;
   command_node->command.native.num_mem_objects = num_mem_objects;
diff --git a/lib/CL/clEnqueueReadBuffer.c b/lib/CL/clEnqueueReadBuffer.c
index 2f34e2a..8ab6cef 100644
--- a/lib/CL/clEnqueueReadBuffer.c
+++ b/lib/CL/clEnqueueReadBuffer.c
@@ -38,9 +38,9 @@ POname(clEnqueueReadBuffer)(cl_command_queue command_queue,
                     cl_event *event) CL_API_SUFFIX__VERSION_1_0
 {
   cl_device_id device;
-  unsigned i;
   _cl_command_node *cmd = NULL;
   int errcode;
+  size_t i;
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -49,20 +49,22 @@ POname(clEnqueueReadBuffer)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != buffer->context),
     CL_INVALID_CONTEXT, "buffer and command_queue are not from the same context\n");
 
+  POCL_RETURN_ERROR_ON (
+      (buffer->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)),
+      CL_INVALID_OPERATION,
+      "buffer has been created with CL_MEM_HOST_WRITE_ONLY "
+      "or CL_MEM_HOST_NO_ACCESS\n");
+
   POCL_RETURN_ERROR_COND((ptr == NULL), CL_INVALID_VALUE);
   if (pocl_buffer_boundcheck(buffer, offset, cb) != CL_SUCCESS)
     return CL_INVALID_VALUE;
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  for(i=0; i<num_events_in_wait_list; i++)
-    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
-  device = POCL_REAL_DEV(command_queue->device);
+  POCL_CHECK_DEV_IN_CMDQ;
 
   errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_READ_BUFFER, 
                                  event, num_events_in_wait_list, 
diff --git a/lib/CL/clEnqueueReadBufferRect.c b/lib/CL/clEnqueueReadBufferRect.c
index cad92e1..0aa872a 100644
--- a/lib/CL/clEnqueueReadBufferRect.c
+++ b/lib/CL/clEnqueueReadBufferRect.c
@@ -46,6 +46,7 @@ POname(clEnqueueReadBufferRect)(cl_command_queue command_queue,
   cl_device_id device;
   unsigned i;
   _cl_command_node *cmd;
+  int errcode;
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -61,11 +62,10 @@ POname(clEnqueueReadBufferRect)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != buffer->context),
     CL_INVALID_CONTEXT, "buffer and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   POCL_RETURN_ERROR_COND((ptr == NULL), CL_INVALID_VALUE);
   POCL_RETURN_ERROR_COND((buffer_origin == NULL), CL_INVALID_VALUE);
diff --git a/lib/CL/clEnqueueReadImage.c b/lib/CL/clEnqueueReadImage.c
index 84f2f46..e867d15 100644
--- a/lib/CL/clEnqueueReadImage.c
+++ b/lib/CL/clEnqueueReadImage.c
@@ -32,8 +32,8 @@ POname(clEnqueueReadImage)(cl_command_queue     command_queue,
                            cl_bool              blocking_read, 
                            const size_t *       origin, /* [3] */
                            const size_t *       region, /* [3] */
-                           size_t               host_row_pitch,
-                           size_t               host_slice_pitch, 
+                           size_t               row_pitch,
+                           size_t               slice_pitch,
                            void *               ptr,
                            cl_uint              num_events_in_wait_list,
                            const cl_event *     event_wait_list,
@@ -52,13 +52,29 @@ CL_API_SUFFIX__VERSION_1_0
   POCL_RETURN_ERROR_ON((command_queue->context != image->context),
     CL_INVALID_CONTEXT, "image and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  POCL_RETURN_ERROR_ON (
+      (!command_queue->device->image_support), CL_INVALID_OPERATION,
+      "Device %s does not support images\n", command_queue->device->long_name);
 
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
+
+  POCL_RETURN_ERROR_ON (
+      (image->flags & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)),
+      CL_INVALID_OPERATION,
+      "image has been created with CL_MEM_HOST_WRITE_ONLY "
+      "or CL_MEM_HOST_NO_ACCESS\n");
+
+  if (image->buffer)
+    POCL_RETURN_ERROR_ON (
+        (image->buffer->flags
+         & (CL_MEM_HOST_WRITE_ONLY | CL_MEM_HOST_NO_ACCESS)),
+        CL_INVALID_OPERATION,
+        "1D Image buffer has been created with CL_MEM_HOST_WRITE_ONLY "
+        "or CL_MEM_HOST_NO_ACCESS\n");
 
-  errcode = pocl_check_device_supports_image(image, command_queue);
   if (errcode != CL_SUCCESS)
     return errcode;
 
@@ -80,13 +96,20 @@ CL_API_SUFFIX__VERSION_1_0
       return errcode;
     }
 
-  cmd->command.read_image.device_ptr = 
-    image->device_ptrs[command_queue->device->dev_id].mem_ptr;
-  cmd->command.read_image.host_ptr = ptr;
-  memcpy ((cmd->command.read_image.origin), tuned_origin, 3*sizeof (size_t));
-  memcpy ((cmd->command.read_image.region), tuned_region, 3*sizeof (size_t));
   cmd->command.read_image.b_rowpitch = image->image_row_pitch;
   cmd->command.read_image.b_slicepitch = image->image_slice_pitch;
+  cmd->command.read_image.h_rowpitch
+      = (row_pitch ? row_pitch : tuned_region[0]);
+  cmd->command.read_image.h_slicepitch
+      = (slice_pitch ? slice_pitch : (tuned_region[0] * region[1]));
+  memcpy ((cmd->command.read_image.origin), tuned_origin, 3 * sizeof (size_t));
+  memcpy ((cmd->command.read_image.region), tuned_region, 3 * sizeof (size_t));
+
+  HANDLE_IMAGE1D_BUFFER (image);
+
+  cmd->command.read_image.device_ptr
+      = image->device_ptrs[command_queue->device->dev_id].mem_ptr;
+  cmd->command.read_image.host_ptr = ptr;
   cmd->command.read_image.buffer = image;
 
   POname(clRetainMemObject) (image);  
diff --git a/lib/CL/clEnqueueSVMFree.c b/lib/CL/clEnqueueSVMFree.c
index 267d7d7..93f69b5 100644
--- a/lib/CL/clEnqueueSVMFree.c
+++ b/lib/CL/clEnqueueSVMFree.c
@@ -38,6 +38,7 @@ POname(clEnqueueSVMFree) (cl_command_queue command_queue,
                   cl_event *event) CL_API_SUFFIX__VERSION_2_0
 {
   unsigned i;
+  cl_int errcode;
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -50,18 +51,17 @@ POname(clEnqueueSVMFree) (cl_command_queue command_queue,
   for (i=0; i<num_svm_pointers; i++)
     POCL_RETURN_ERROR_COND((svm_pointers[i] == NULL), CL_INVALID_VALUE);
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   for(i=0; i<num_events_in_wait_list; i++)
     POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
 
   _cl_command_node *cmd = NULL;
 
-  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_FREE,
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_FREE,
                                      event, num_events_in_wait_list,
                                      event_wait_list, 0, NULL);
 
diff --git a/lib/CL/clEnqueueSVMMap.c b/lib/CL/clEnqueueSVMMap.c
index 8adb9ec..5914ba9 100644
--- a/lib/CL/clEnqueueSVMMap.c
+++ b/lib/CL/clEnqueueSVMMap.c
@@ -35,6 +35,8 @@ POname(clEnqueueSVMMap) (cl_command_queue command_queue,
                  cl_event *event) CL_API_SUFFIX__VERSION_2_0
 {
   unsigned i;
+  cl_int errcode;
+
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
@@ -54,18 +56,17 @@ POname(clEnqueueSVMMap) (cl_command_queue command_queue,
 
   POCL_RETURN_ERROR_COND((size == 0), CL_INVALID_VALUE);
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   for(i=0; i<num_events_in_wait_list; i++)
     POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
 
   _cl_command_node *cmd = NULL;
 
-  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MAP,
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MAP,
                                      event, num_events_in_wait_list,
                                      event_wait_list, 0, NULL);
 
diff --git a/lib/CL/clEnqueueSVMMemFill.c b/lib/CL/clEnqueueSVMMemFill.c
index 083336c..b52b316 100644
--- a/lib/CL/clEnqueueSVMMemFill.c
+++ b/lib/CL/clEnqueueSVMMemFill.c
@@ -35,6 +35,8 @@ POname(clEnqueueSVMMemFill) (cl_command_queue command_queue,
                      cl_event *event) CL_API_SUFFIX__VERSION_2_0
 {
   unsigned i;
+  cl_int errcode;
+
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
@@ -57,18 +59,17 @@ POname(clEnqueueSVMMemFill) (cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((size % pattern_size > 0), CL_INVALID_VALUE,
                        "size must be a multiple of pattern_size\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   for(i=0; i<num_events_in_wait_list; i++)
     POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
 
   _cl_command_node *cmd = NULL;
 
-  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MEMFILL,
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MEMFILL,
                                      event, num_events_in_wait_list,
                                      event_wait_list, 0, NULL);
 
diff --git a/lib/CL/clEnqueueSVMMemcpy.c b/lib/CL/clEnqueueSVMMemcpy.c
index 2813d54..9f24732 100644
--- a/lib/CL/clEnqueueSVMMemcpy.c
+++ b/lib/CL/clEnqueueSVMMemcpy.c
@@ -35,6 +35,8 @@ POname(clEnqueueSVMMemcpy) (cl_command_queue command_queue,
                     cl_event *event) CL_API_SUFFIX__VERSION_2_0
 {
   unsigned i;
+  cl_int errcode;
+
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
@@ -46,11 +48,10 @@ POname(clEnqueueSVMMemcpy) (cl_command_queue command_queue,
 
   POCL_RETURN_ERROR_COND((size == 0), CL_INVALID_VALUE);
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   for(i=0; i<num_events_in_wait_list; i++)
     POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
@@ -60,7 +61,7 @@ POname(clEnqueueSVMMemcpy) (cl_command_queue command_queue,
   if (blocking_copy)
     POCL_ABORT_UNIMPLEMENTED("Blocking memcpy");
 
-  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MEMCPY,
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_MEMCPY,
                                      event, num_events_in_wait_list,
                                      event_wait_list, 0, NULL);
 
diff --git a/lib/CL/clEnqueueSVMUnmap.c b/lib/CL/clEnqueueSVMUnmap.c
index 2fd1df0..eec628c 100644
--- a/lib/CL/clEnqueueSVMUnmap.c
+++ b/lib/CL/clEnqueueSVMUnmap.c
@@ -32,6 +32,8 @@ POname(clEnqueueSVMUnmap) (cl_command_queue command_queue,
                    cl_event *event) CL_API_SUFFIX__VERSION_2_0
 {
   unsigned i;
+  cl_int errcode;
+
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
   POCL_RETURN_ERROR_ON((command_queue->context->svm_allocdev == NULL),
@@ -44,18 +46,17 @@ POname(clEnqueueSVMUnmap) (cl_command_queue command_queue,
 
   POCL_RETURN_ERROR_COND((svm_ptr == NULL), CL_INVALID_VALUE);
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-                         CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   for(i=0; i<num_events_in_wait_list; i++)
     POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
 
   _cl_command_node *cmd = NULL;
 
-  int errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_UNMAP,
+  errcode = pocl_create_command (&cmd, command_queue, CL_COMMAND_SVM_UNMAP,
                                      event, num_events_in_wait_list,
                                      event_wait_list, 0, NULL);
 
diff --git a/lib/CL/clEnqueueTask.c b/lib/CL/clEnqueueTask.c
index 6ff2ac9..ba56f57 100644
--- a/lib/CL/clEnqueueTask.c
+++ b/lib/CL/clEnqueueTask.c
@@ -1,4 +1,30 @@
+/* OpenCL runtime library: clEnqueueTask()
+
+   Copyright (c) 2012-2017 pocl developers
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+
 #include "pocl_cl.h"
+
+
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clEnqueueTask)(cl_command_queue   command_queue,
               cl_kernel          kernel,
diff --git a/lib/CL/clEnqueueUnmapMemObject.c b/lib/CL/clEnqueueUnmapMemObject.c
index 46533a1..41f4386 100644
--- a/lib/CL/clEnqueueUnmapMemObject.c
+++ b/lib/CL/clEnqueueUnmapMemObject.c
@@ -47,26 +47,28 @@ POname(clEnqueueUnmapMemObject)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != memobj->context),
     CL_INVALID_CONTEXT, "memobj and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  POCL_CHECK_DEV_IN_CMDQ;
 
-  for (i = 0; i < num_events_in_wait_list; ++i)
-    {
-      POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
-      if (i > 0)
-        {
-          POCL_RETURN_ERROR_COND((event_wait_list[i]->context
-                                  != event_wait_list[i - 1]->context),
-                                 CL_INVALID_CONTEXT);
-        }
-    }
+  HANDLE_IMAGE1D_BUFFER (memobj);
+
+  POCL_RETURN_ERROR_ON ((memobj->flags & CL_MEM_HOST_NO_ACCESS),
+                        CL_INVALID_OPERATION,
+                        "buffer has been created with "
+                        "CL_MEM_HOST_WRITE_ONLY or CL_MEM_HOST_NO_ACCESS and "
+                        "CL_MAP_READ is set in map_flags\n");
 
   POCL_LOCK_OBJ (memobj);
   DL_FOREACH (memobj->mappings, mapping)
     {
+      POCL_MSG_PRINT_MEMORY (
+          "UnMap %p search Mapping: host_ptr %p offset %zu\n", mapped_ptr,
+          mapping->host_ptr, mapping->offset);
+
       if (mapping->host_ptr == mapped_ptr)
           break;
     }
@@ -74,9 +76,6 @@ POname(clEnqueueUnmapMemObject)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((mapping == NULL), CL_INVALID_VALUE,
       "Could not find mapping of this memobj\n");
 
-  /* find the index of the device's ptr in the buffer */
-  POCL_CHECK_DEV_IN_CMDQ;
-
   errcode = pocl_create_command (&cmd, command_queue, 
                                  CL_COMMAND_UNMAP_MEM_OBJECT, 
                                  event, num_events_in_wait_list, 
diff --git a/lib/CL/clEnqueueWaitForEvents.c b/lib/CL/clEnqueueWaitForEvents.c
index 6985523..999b50d 100644
--- a/lib/CL/clEnqueueWaitForEvents.c
+++ b/lib/CL/clEnqueueWaitForEvents.c
@@ -1,10 +1,41 @@
-#include "pocl_cl.h"
+/* OpenCL runtime library: clEnqueueWaitForEvents()
+
+   Copyright (c) 2012-2017 pocl developers
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+
+#include "pocl_util.h"
+
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clEnqueueWaitForEvents)(cl_command_queue  command_queue,
                        cl_uint           num_events,
                        const cl_event *  event_list) 
 CL_API_SUFFIX__VERSION_1_0
 {
+  cl_int errcode;
+
+  errcode = pocl_check_event_wait_list (command_queue, num_events, event_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
+
   POCL_ABORT_UNIMPLEMENTED("The entire clEnqueueWaitForEvents call");
   return CL_SUCCESS;
 }
diff --git a/lib/CL/clEnqueueWriteBuffer.c b/lib/CL/clEnqueueWriteBuffer.c
index d7ec7f8..cb5b219 100644
--- a/lib/CL/clEnqueueWriteBuffer.c
+++ b/lib/CL/clEnqueueWriteBuffer.c
@@ -49,27 +49,23 @@ POname(clEnqueueWriteBuffer)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != buffer->context),
     CL_INVALID_CONTEXT, "buffer and command_queue are not from the same context\n");
 
+  POCL_RETURN_ERROR_ON (
+      (buffer->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)),
+      CL_INVALID_OPERATION,
+      "buffer has been created with CL_MEM_HOST_READ_ONLY "
+      "or CL_MEM_HOST_NO_ACCESS\n");
+
   POCL_RETURN_ERROR_COND((ptr == NULL), CL_INVALID_VALUE);
+
   if (pocl_buffer_boundcheck(buffer, offset, cb) != CL_SUCCESS)
     return CL_INVALID_VALUE;
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  for(i=0; i<num_events_in_wait_list; i++)
-    POCL_RETURN_ERROR_COND((event_wait_list[i] == NULL), CL_INVALID_EVENT_WAIT_LIST);
-
-  device = POCL_REAL_DEV(command_queue->device);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
-  for (i = 0; i < command_queue->context->num_devices; ++i)
-    {
-        if (command_queue->context->devices[i] == device)
-            break;
-    }
-  assert(i < command_queue->context->num_devices);
+  POCL_CHECK_DEV_IN_CMDQ;
 
   errcode = pocl_create_command (&cmd, command_queue, 
                                  CL_COMMAND_WRITE_BUFFER, 
diff --git a/lib/CL/clEnqueueWriteBufferRect.c b/lib/CL/clEnqueueWriteBufferRect.c
index 6ec19ac..bd00b12 100644
--- a/lib/CL/clEnqueueWriteBufferRect.c
+++ b/lib/CL/clEnqueueWriteBufferRect.c
@@ -44,6 +44,7 @@ POname(clEnqueueWriteBufferRect)(cl_command_queue command_queue,
   cl_device_id device;
   unsigned i;
   _cl_command_node *cmd;
+  cl_int errcode;
 
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
 
@@ -59,11 +60,10 @@ POname(clEnqueueWriteBufferRect)(cl_command_queue command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != buffer->context),
     CL_INVALID_CONTEXT, "buffer and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
+  if (errcode != CL_SUCCESS)
+    return errcode;
 
   POCL_RETURN_ERROR_COND((ptr == NULL), CL_INVALID_VALUE);
   POCL_RETURN_ERROR_COND((buffer_origin == NULL), CL_INVALID_VALUE);
@@ -79,17 +79,7 @@ POname(clEnqueueWriteBufferRect)(cl_command_queue command_queue,
   if (pocl_buffer_boundcheck_3d(((size_t)-1), host_origin, region, &host_row_pitch,
       &host_slice_pitch, "") != CL_SUCCESS) return CL_INVALID_VALUE;
 
-
-
-
-  device = POCL_REAL_DEV(command_queue->device);
-
-  for (i = 0; i < command_queue->context->num_devices; ++i)
-    {
-      if (command_queue->context->devices[i] == device)
-        break;
-    }
-  assert(i < command_queue->context->num_devices);
+  POCL_CHECK_DEV_IN_CMDQ;
 
   POname(clRetainMemObject) (buffer);
 
diff --git a/lib/CL/clEnqueueWriteImage.c b/lib/CL/clEnqueueWriteImage.c
index 477c3dd..a9c3bb0 100644
--- a/lib/CL/clEnqueueWriteImage.c
+++ b/lib/CL/clEnqueueWriteImage.c
@@ -7,8 +7,8 @@ POname(clEnqueueWriteImage)(cl_command_queue    command_queue,
                     cl_bool             blocking_write, 
                     const size_t *      origin, /*[3]*/
                     const size_t *      region, /*[3]*/
-                    size_t              host_row_pitch,
-                    size_t              host_slice_pitch, 
+                    size_t              input_row_pitch,
+                    size_t              input_slice_pitch,
                     const void *        ptr,
                     cl_uint             num_events_in_wait_list,
                     const cl_event *    event_wait_list,
@@ -26,16 +26,29 @@ POname(clEnqueueWriteImage)(cl_command_queue    command_queue,
   POCL_RETURN_ERROR_ON((command_queue->context != image->context),
     CL_INVALID_CONTEXT, "image and command_queue are not from the same context\n");
 
-  POCL_RETURN_ERROR_COND((event_wait_list == NULL && num_events_in_wait_list > 0),
-    CL_INVALID_EVENT_WAIT_LIST);
+  POCL_RETURN_ERROR_ON (
+      (!command_queue->device->image_support), CL_INVALID_OPERATION,
+      "Device %s does not support images\n", command_queue->device->long_name);
 
-  POCL_RETURN_ERROR_COND((event_wait_list != NULL && num_events_in_wait_list == 0),
-    CL_INVALID_EVENT_WAIT_LIST);
-
-  errcode = pocl_check_device_supports_image(image, command_queue);
+  errcode = pocl_check_event_wait_list (command_queue, num_events_in_wait_list,
+                                        event_wait_list);
   if (errcode != CL_SUCCESS)
     return errcode;
 
+  POCL_RETURN_ERROR_ON (
+      (image->flags & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)),
+      CL_INVALID_OPERATION,
+      "image buffer has been created with CL_MEM_HOST_READ_ONLY "
+      "or CL_MEM_HOST_NO_ACCESS\n");
+
+  if (image->buffer)
+    POCL_RETURN_ERROR_ON (
+        (image->buffer->flags
+         & (CL_MEM_HOST_READ_ONLY | CL_MEM_HOST_NO_ACCESS)),
+        CL_INVALID_OPERATION,
+        "image buffer has been created with CL_MEM_HOST_READ_ONLY "
+        "or CL_MEM_HOST_NO_ACCESS\n");
+
   errcode = pocl_check_image_origin_region (image, origin, region);
   if (errcode != CL_SUCCESS)
     return errcode;
@@ -55,14 +68,22 @@ POname(clEnqueueWriteImage)(cl_command_queue    command_queue,
       return errcode;
     }  
 
-  cmd->command.write_image.device_ptr = 
-    image->device_ptrs[command_queue->device->dev_id].mem_ptr;
-  cmd->command.write_image.host_ptr = (void*) ptr;
   memcpy ((cmd->command.write_image.origin), tuned_origin, 3*sizeof (size_t));
   memcpy ((cmd->command.write_image.region), tuned_region, 3*sizeof (size_t));
   cmd->command.write_image.b_rowpitch = image->image_row_pitch;
   cmd->command.write_image.b_slicepitch = image->image_slice_pitch;
+  cmd->command.write_image.h_rowpitch
+      = (input_row_pitch ? input_row_pitch : tuned_region[0]);
+  cmd->command.write_image.h_slicepitch
+      = (input_slice_pitch ? input_slice_pitch
+                           : (tuned_region[0] * region[1]));
+
+  HANDLE_IMAGE1D_BUFFER (image);
+
   cmd->command.write_image.buffer = image;
+  cmd->command.write_image.device_ptr
+      = image->device_ptrs[command_queue->device->dev_id].mem_ptr;
+  cmd->command.write_image.host_ptr = (void *)ptr;
 
   POname(clRetainMemObject) (image);
   image->owning_device = command_queue->device;
diff --git a/lib/CL/clGetDeviceIDs.c b/lib/CL/clGetDeviceIDs.c
index 9a39cab..6257c52 100644
--- a/lib/CL/clGetDeviceIDs.c
+++ b/lib/CL/clGetDeviceIDs.c
@@ -34,8 +34,8 @@ POname(clGetDeviceIDs)(cl_platform_id   platform,
 {
   int total_num = 0;
   int devices_added = 0;
+  cl_platform_id tmp_platform;
 
-  pocl_init_devices();
   /* TODO: OpenCL API specification allows implementation dependent
      behaviour if platform == NULL. Should we just allow it? */
   POCL_RETURN_ERROR_COND((platform == NULL), CL_INVALID_PLATFORM);
@@ -43,6 +43,14 @@ POname(clGetDeviceIDs)(cl_platform_id   platform,
   POCL_RETURN_ERROR_COND((num_entries == 0 && devices != NULL), CL_INVALID_VALUE);
   POCL_RETURN_ERROR_COND((num_devices == NULL && devices == NULL), CL_INVALID_VALUE);
 
+  POname (clGetPlatformIDs) (1, &tmp_platform, NULL);
+  POCL_RETURN_ERROR_ON ((platform != tmp_platform), CL_INVALID_PLATFORM,
+                        "Can only return devices from the POCL platform\n");
+
+  int err = pocl_init_devices();
+  if (err)
+    return err;
+
   total_num = pocl_get_device_type_count(device_type);
 
   if (total_num == 0)
diff --git a/lib/CL/clGetDeviceInfo.c b/lib/CL/clGetDeviceInfo.c
index 5f653f0..5ca2548 100644
--- a/lib/CL/clGetDeviceInfo.c
+++ b/lib/CL/clGetDeviceInfo.c
@@ -27,23 +27,26 @@
    a zero, assume the device info query hasn't been implemented 
    for the device driver at hand. Warns about an incomplete 
    implementation. */
-#define POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(__TYPE__, __VALUE__)    \
-  {                                                                 \
-    size_t const value_size = sizeof(__TYPE__);                     \
-    if (param_value)                                                \
-      {                                                             \
-        if (param_value_size < value_size) return CL_INVALID_VALUE; \
-        *(__TYPE__*)param_value = __VALUE__;                        \
-        if (__VALUE__ == 0) POCL_WARN_INCOMPLETE();                 \
-      }                                                             \
-    if (param_value_size_ret)                                       \
-      *param_value_size_ret = value_size;                           \
-    return CL_SUCCESS;                                              \
-  } 
+#define POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(__TYPE__, __VALUE__)          \
+  if (__VALUE__ == (__TYPE__)0)                                               \
+    POCL_WARN_INCOMPLETE ();                                                  \
+  POCL_RETURN_GETINFO (__TYPE__, __VALUE__);
 
-    
+#define POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK(__TYPE__, __VALUE__)           \
+  if ((device->image_support) && (__VALUE__ == (__TYPE__)0))                  \
+    POCL_WARN_INCOMPLETE ();                                                  \
+  POCL_RETURN_GETINFO (__TYPE__, __VALUE__);
+
+#define STRINGIFY_(x) #x
+#define STRINGIFY(x) STRINGIFY_ (x)
+#define HOST_DEVICE_CL_VERSION_MAJOR_STR                                      \
+  STRINGIFY (HOST_DEVICE_CL_VERSION_MAJOR)
+#define HOST_DEVICE_CL_VERSION_MINOR_STR                                      \
+  STRINGIFY (HOST_DEVICE_CL_VERSION_MINOR)
+#define HOST_CL_VERSION                                                       \
+  "OpenCL C " HOST_DEVICE_CL_VERSION_MAJOR_STR                                \
+  "." HOST_DEVICE_CL_VERSION_MINOR_STR " pocl"
 
-  
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clGetDeviceInfo)(cl_device_id   device,
                 cl_device_info param_name, 
@@ -53,8 +56,8 @@ POname(clGetDeviceInfo)(cl_device_id   device,
 {
   switch (param_name)
   {
-  case CL_DEVICE_IMAGE_SUPPORT: 
-    POCL_RETURN_GETINFO(cl_bool, CL_TRUE);
+  case CL_DEVICE_IMAGE_SUPPORT:
+    POCL_RETURN_GETINFO(cl_bool, device->image_support);
   case CL_DEVICE_TYPE:
     POCL_RETURN_GETINFO(cl_device_type, device->type);   
   case CL_DEVICE_VENDOR_ID:
@@ -98,7 +101,9 @@ POname(clGetDeviceInfo)(cl_device_id   device,
       typedef struct { size_t size[3]; } size_t_3;
       POCL_RETURN_GETINFO(size_t_3, *(size_t_3 const *)device->max_work_item_sizes);
     }
-    
+  case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
+    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK (cl_ulong,
+                                             device->max_mem_alloc_size);
   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR:
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->preferred_vector_width_char);
   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT:
@@ -115,38 +120,45 @@ POname(clGetDeviceInfo)(cl_device_id   device,
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->max_clock_frequency);
   case CL_DEVICE_ADDRESS_BITS                      :
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->address_bits);
-  case CL_DEVICE_MAX_READ_IMAGE_ARGS               : 
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->max_read_image_args);
+
+  case CL_DEVICE_MAX_READ_IMAGE_ARGS:
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (cl_uint,
+                                            device->max_read_image_args);
   case CL_DEVICE_MAX_WRITE_IMAGE_ARGS              :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->max_write_image_args);
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (cl_uint,
+                                            device->max_write_image_args);
   case CL_DEVICE_MAX_READ_WRITE_IMAGE_ARGS         :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->max_read_write_image_args);
-  case CL_DEVICE_MAX_MEM_ALLOC_SIZE:
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_ulong, device->max_mem_alloc_size);
-  case CL_DEVICE_IMAGE2D_MAX_WIDTH                 : 
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image2d_max_width);
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (cl_uint,
+                                            device->max_read_write_image_args);
+  case CL_DEVICE_IMAGE2D_MAX_WIDTH:
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t, device->image2d_max_width);
   case CL_DEVICE_IMAGE2D_MAX_HEIGHT                :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image2d_max_height);
-  case CL_DEVICE_IMAGE3D_MAX_WIDTH                 : 
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image3d_max_width);
-  case CL_DEVICE_IMAGE3D_MAX_HEIGHT                : 
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image3d_max_height);
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t,
+                                            device->image2d_max_height);
+  case CL_DEVICE_IMAGE3D_MAX_WIDTH:
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t, device->image3d_max_width);
+  case CL_DEVICE_IMAGE3D_MAX_HEIGHT:
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t,
+                                            device->image3d_max_height);
   case CL_DEVICE_IMAGE3D_MAX_DEPTH                 :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image3d_max_depth);
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t, device->image3d_max_depth);
   case CL_DEVICE_IMAGE_MAX_BUFFER_SIZE             :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image_max_buffer_size);
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t,
+                                            device->image_max_buffer_size);
   case CL_DEVICE_IMAGE_MAX_ARRAY_SIZE              :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->image_max_array_size);
-  case CL_DEVICE_MAX_PARAMETER_SIZE                : 
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (size_t,
+                                            device->image_max_array_size);
+  case CL_DEVICE_MAX_SAMPLERS:
+    POCL_RETURN_DEVICE_INFO_WITH_IMG_CHECK (cl_uint, device->max_samplers);
+
+  case CL_DEVICE_MAX_PARAMETER_SIZE:
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->max_parameter_size);
-  case CL_DEVICE_MAX_SAMPLERS                      : 
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->max_samplers);
   case CL_DEVICE_MEM_BASE_ADDR_ALIGN               : 
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->mem_base_addr_align);
   case CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE          : 
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->min_data_type_align_size);
-  case CL_DEVICE_SINGLE_FP_CONFIG                  : 
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_ulong, device->single_fp_config);
+  case CL_DEVICE_SINGLE_FP_CONFIG                  :
+    POCL_RETURN_GETINFO (cl_ulong, device->single_fp_config);
   case CL_DEVICE_GLOBAL_MEM_CACHE_TYPE             :
     POCL_RETURN_GETINFO(cl_uint, device->global_mem_cache_type);
   case CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE         : 
@@ -216,9 +228,9 @@ POname(clGetDeviceInfo)(cl_device_id   device,
       POCL_RETURN_GETINFO(cl_platform_id, platform_id);
     }
   case CL_DEVICE_DOUBLE_FP_CONFIG                  :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_ulong, device->double_fp_config);
+    POCL_RETURN_GETINFO (cl_ulong, device->double_fp_config);
   case CL_DEVICE_HALF_FP_CONFIG                    :
-    POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_ulong, device->half_fp_config);
+    POCL_RETURN_GETINFO (cl_ulong, device->half_fp_config);
   case CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF       :
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->preferred_vector_width_half);
   case CL_DEVICE_HOST_UNIFIED_MEMORY               : 
@@ -238,37 +250,45 @@ POname(clGetDeviceInfo)(cl_device_id   device,
   case CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF          : 
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, device->native_vector_width_half);
   case CL_DEVICE_OPENCL_C_VERSION                  :
-    POCL_RETURN_GETINFO_STR("OpenCL C 2.0");
+    POCL_RETURN_GETINFO_STR (HOST_CL_VERSION);
   case CL_DEVICE_BUILT_IN_KERNELS                  :
     POCL_RETURN_GETINFO_STR("");
 
-  /* TODO proper device partition support. For the time being,
-   * the values returned only serve the purpose of indicating
-   * that it is not actually supported */
   case CL_DEVICE_PARENT_DEVICE                     :
     POCL_RETURN_GETINFO(cl_device_id, device->parent_device);
+
   case CL_DEVICE_PARTITION_MAX_SUB_DEVICES         :
     POCL_RETURN_GETINFO(cl_uint, device->max_sub_devices);
+
   case CL_DEVICE_PARTITION_PROPERTIES              :
-    POCL_RETURN_GETINFO_ARRAY(cl_device_partition_property,
-      device->num_partition_properties, device->partition_properties);
+    if (device->num_partition_properties)
+      POCL_RETURN_GETINFO_ARRAY (cl_device_partition_property,
+                                 device->num_partition_properties,
+                                 device->partition_properties);
+    else
+      POCL_RETURN_GETINFO (cl_device_partition_property, 0);
+
   case CL_DEVICE_PARTITION_TYPE                    :
-    POCL_RETURN_GETINFO_ARRAY(cl_device_partition_property,
-      device->num_partition_types, device->partition_type);
+    if (device->num_partition_types)
+      POCL_RETURN_GETINFO_ARRAY (cl_device_partition_property,
+                                 device->num_partition_types,
+                                 device->partition_type);
+    else
+      POCL_RETURN_GETINFO (cl_device_partition_property, 0);
+
   case CL_DEVICE_PARTITION_AFFINITY_DOMAIN         :
     POCL_RETURN_GETINFO(cl_device_affinity_domain, 0);
 
   case CL_DEVICE_PREFERRED_INTEROP_USER_SYNC       :
     POCL_RETURN_GETINFO(cl_bool, CL_TRUE);
+
   case CL_DEVICE_PRINTF_BUFFER_SIZE                :
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(size_t, device->printf_buffer_size);
+
   case CL_DEVICE_REFERENCE_COUNT:
     POCL_RETURN_DEVICE_INFO_WITH_IMPL_CHECK(cl_uint, 
                                             (cl_uint)device->pocl_refcount)
 
-
-
-
   case CL_DEVICE_SVM_CAPABILITIES:
     POCL_RETURN_GETINFO(cl_device_svm_capabilities, device->svm_caps);
   case CL_DEVICE_MAX_ON_DEVICE_EVENTS:
@@ -292,7 +312,10 @@ POname(clGetDeviceInfo)(cl_device_id   device,
   case CL_DEVICE_PREFERRED_PLATFORM_ATOMIC_ALIGNMENT:
     POCL_RETURN_GETINFO(cl_uint, 0);
   case CL_DEVICE_SPIR_VERSIONS:
-    POCL_RETURN_GETINFO_STR("1.2");
+    if (strstr (device->extensions, "cl_khr_spir"))
+      POCL_RETURN_GETINFO_STR ("1.2");
+    else
+      POCL_RETURN_GETINFO_STR ("");
   case CL_DEVICE_QUEUE_ON_DEVICE_PROPERTIES:
     POCL_RETURN_GETINFO(cl_command_queue_properties, device->on_dev_queue_props);
   case CL_DEVICE_QUEUE_ON_HOST_PROPERTIES:
diff --git a/lib/CL/clReleaseContext.c b/lib/CL/clGetExtensionFunctionAddressForPlatform.c
similarity index 50%
copy from lib/CL/clReleaseContext.c
copy to lib/CL/clGetExtensionFunctionAddressForPlatform.c
index 99d70d3..1085cab 100644
--- a/lib/CL/clReleaseContext.c
+++ b/lib/CL/clGetExtensionFunctionAddressForPlatform.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseContext()
+/* OpenCL runtime library: clGetExtensionFunctionAddressForPlatform()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Universidad Rey Juan Carlos
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -23,34 +23,36 @@
 
 #include "pocl_cl.h"
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+#include <string.h>
+
+CL_API_ENTRY void * CL_API_CALL
+POname (clGetExtensionFunctionAddressForPlatform) (cl_platform_id  platform,
+                                                   const char *func_name)
+CL_EXT_SUFFIX__VERSION_1_2
 {
-  int new_refcount;
-  // if context is invalid, return immediately
-  if (!context->valid)
+  cl_platform_id pocl_platform;
+  cl_uint actual_num = 0;
+  POname (clGetPlatformIDs) (1, &pocl_platform, &actual_num);
+  if (actual_num != 1)
     {
-      POCL_MEM_FREE(context);
-      return CL_SUCCESS;
+      POCL_MSG_WARN ("Couldn't get the platform ID of Pocl platform\n");
+      return NULL;
     }
 
-  POCL_RELEASE_OBJECT(context, new_refcount);
-  if (new_refcount == 0)
+  if (platform != pocl_platform)
     {
-      /* The context holds references to all its devices,
-         memory objects, command-queues etc. Release the
-         references and let the objects to get freed. */
-      /* TODO: call the corresponding clRelease* functions
-         for all the referred objects. */
-      unsigned i;
-      for (i = 0; i < context->num_devices; ++i) 
-        {
-          POname(clReleaseDevice) (context->devices[i]);
-        }   
-      POCL_MEM_FREE(context->devices);
-      POCL_MEM_FREE(context->properties);
-      POCL_MEM_FREE(context);
+      POCL_MSG_PRINT_INFO ("Requested Function Address not "
+                           "for Pocl platform, ignoring\n");
+      return NULL;
     }
-  return CL_SUCCESS;
+
+#ifdef BUILD_ICD
+  if (strcmp (func_name, "clIcdGetPlatformIDsKHR") == 0)
+    return (void *)&POname(clIcdGetPlatformIDsKHR);
+#endif
+  if (strcmp (func_name, "clGetPlatformInfo") == 0)
+    return (void *)&POname(clGetPlatformInfo);
+
+  return NULL;
 }
-POsym(clReleaseContext)
+POsymAlways (clGetExtensionFunctionAddressForPlatform)
diff --git a/lib/CL/clGetKernelArgInfo.c b/lib/CL/clGetKernelArgInfo.c
index d332ba9..77063ad 100644
--- a/lib/CL/clGetKernelArgInfo.c
+++ b/lib/CL/clGetKernelArgInfo.c
@@ -38,6 +38,15 @@ POname(clGetKernelArgInfo)(cl_kernel      kernel ,
     "This kernel has %u args, cannot getInfo on arg %u\n",
     (unsigned)kernel->num_args, (unsigned)arg_indx);
 
+  /* pocl always uses -cl-kernel-arg-info because it needs the arg metadata, but
+   * to the user programs we should report missing arg info in case they don't
+   * request it. Piglit tests this. */
+  if (kernel->program->compiler_options)
+    POCL_RETURN_ERROR_ON (
+        (!strstr (kernel->program->compiler_options, "cl-kernel-arg-info")),
+        CL_KERNEL_ARG_INFO_NOT_AVAILABLE,
+        "argument information is not available!\n");
+
   struct pocl_argument_info *arg = &kernel->arg_info[arg_indx];
   switch (param_name) {
     case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
diff --git a/lib/CL/clGetKernelInfo.c b/lib/CL/clGetKernelInfo.c
index e3903af..d12584e 100644
--- a/lib/CL/clGetKernelInfo.c
+++ b/lib/CL/clGetKernelInfo.c
@@ -46,6 +46,11 @@ POname(clGetKernelInfo)(cl_kernel      kernel ,
     POCL_RETURN_GETINFO(cl_context, kernel->context);
   case CL_KERNEL_PROGRAM:
     POCL_RETURN_GETINFO(cl_program, kernel->program);
+  case CL_KERNEL_ATTRIBUTES:
+    if (kernel->attributes)
+      POCL_RETURN_GETINFO_STR (kernel->attributes);
+    else
+      POCL_RETURN_GETINFO_STR ("");
   }
   return CL_INVALID_VALUE;
 }
diff --git a/lib/CL/clGetKernelWorkGroupInfo.c b/lib/CL/clGetKernelWorkGroupInfo.c
index 42221a4..85715f4 100644
--- a/lib/CL/clGetKernelWorkGroupInfo.c
+++ b/lib/CL/clGetKernelWorkGroupInfo.c
@@ -24,11 +24,11 @@ POname(clGetKernelWorkGroupInfo)
       unsigned i;
       int found_it = 0;
       for (i = 0; i < kernel->context->num_devices; i++)
-        if (POCL_REAL_DEV(device) == kernel->context->devices[i])
-        {
-          found_it = 1;
-          break;
-        }
+        if (pocl_real_dev (device) == kernel->context->devices[i])
+          {
+            found_it = 1;
+            break;
+          }
       POCL_RETURN_ERROR_ON((!found_it), CL_INVALID_DEVICE, "could not find the "
         "device supplied in argument\n");
     }
@@ -49,12 +49,10 @@ POname(clGetKernelWorkGroupInfo)
     case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
     {
         typedef struct { size_t size[3]; } size_t_3;
-#if 0
-        printf("### reqd wg sizes %d %d %d\n", 
-               kernel->reqd_wg_size[0], 
-               kernel->reqd_wg_size[1], 
+        POCL_MSG_PRINT_GENERAL ("### reqd wg sizes %d %d %d\n",
+               kernel->reqd_wg_size[0],
+               kernel->reqd_wg_size[1],
                kernel->reqd_wg_size[2]);
-#endif
         POCL_RETURN_GETINFO(size_t_3, *(size_t_3*)kernel->reqd_wg_size);
     }
       
@@ -83,7 +81,9 @@ POname(clGetKernelWorkGroupInfo)
     }
       
     case CL_KERNEL_PRIVATE_MEM_SIZE:
-      POCL_ABORT_UNIMPLEMENTED("clGetKernelWorkGroupInfo: CL_KERNEL_PRIVATE_MEM_SIZE");
+      POCL_MSG_WARN ("clGetKernelWorkGroupInfo: CL_KERNEL_PRIVATE_MEM_SIZE "
+                     "implementation is incomplete\n");
+      POCL_RETURN_GETINFO (cl_ulong, sizeof (struct pocl_context));
 
     default:
       return CL_INVALID_VALUE;
diff --git a/lib/CL/clGetMemObjectInfo.c b/lib/CL/clGetMemObjectInfo.c
index e8db03f..25c909d 100644
--- a/lib/CL/clGetMemObjectInfo.c
+++ b/lib/CL/clGetMemObjectInfo.c
@@ -42,7 +42,10 @@ POname(clGetMemObjectInfo)(cl_mem      memobj ,
   case CL_MEM_SIZE:
     POCL_RETURN_GETINFO (size_t, memobj->size);
   case CL_MEM_HOST_PTR:
-    POCL_RETURN_GETINFO (void *, memobj->mem_host_ptr);
+    if (memobj->flags & CL_MEM_USE_HOST_PTR)
+      POCL_RETURN_GETINFO (void *, memobj->mem_host_ptr);
+    else
+      POCL_RETURN_GETINFO (void *, NULL);
   case CL_MEM_MAP_COUNT:
     POCL_RETURN_GETINFO (cl_uint, memobj->map_count);
   case CL_MEM_REFERENCE_COUNT:
@@ -54,8 +57,8 @@ POname(clGetMemObjectInfo)(cl_mem      memobj ,
   case CL_MEM_OFFSET:
     if (memobj->parent == NULL)
       POCL_RETURN_GETINFO (size_t, 0);
-
-    POCL_ABORT_UNIMPLEMENTED("clGetMemObjectInfo: CL_MEM_OFFSET in subbuffers");
+    else
+      POCL_RETURN_GETINFO (size_t, memobj->origin);
   }
   return CL_INVALID_VALUE;
 }
diff --git a/lib/CL/clGetPlatformIDs.c b/lib/CL/clGetPlatformIDs.c
index 084fa41..a4844a1 100644
--- a/lib/CL/clGetPlatformIDs.c
+++ b/lib/CL/clGetPlatformIDs.c
@@ -103,15 +103,15 @@ struct _cl_icd_dispatch pocl_dispatch = {
   &POclEnqueueWaitForEvents,
   &POclEnqueueBarrier,
   &POclGetExtensionFunctionAddress,
-  NULL, /* &POclCreateFromGLBuffer,      */
+  &POclCreateFromGLBuffer,
   &POclCreateFromGLTexture2D,
   &POclCreateFromGLTexture3D,
-  NULL, /* &POclCreateFromGLRenderbuffer, */
-  NULL, /* &POclGetGLObjectInfo,  70       */
-  NULL, /* &POclGetGLTextureInfo,        */
-  NULL, /* &POclEnqueueAcquireGLObjects, */
-  NULL, /* &POclEnqueueReleaseGLObjects, */
-  NULL, /* &POclGetGLContextInfoKHR,     */
+  &POclCreateFromGLRenderbuffer,
+  &POclGetGLObjectInfo,
+  &POclGetGLTextureInfo,
+  &POclEnqueueAcquireGLObjects,
+  &POclEnqueueReleaseGLObjects,
+  &POclGetGLContextInfoKHR,
   NULL, /* &clUnknown75 */
   NULL, /* &clUnknown76 */
   NULL, /* &clUnknown77 */
@@ -134,18 +134,18 @@ struct _cl_icd_dispatch pocl_dispatch = {
   &POclRetainDevice,
   &POclReleaseDevice,
   &POclCreateImage,
-  NULL, /* &POclCreateProgramWithBuiltInKernels, */
-  NULL, /* &POclCompileProgram,          */
-  NULL, /* &POclLinkProgram,             */
+  &POclCreateProgramWithBuiltInKernels,
+  &POclCompileProgram,
+  &POclLinkProgram,
   &POclUnloadPlatformCompiler, 
   &POclGetKernelArgInfo,
   &POclEnqueueFillBuffer,
   &POclEnqueueFillImage,
-  NULL, /* &POclEnqueueMigrateMemObjects, */
+  &POclEnqueueMigrateMemObjects,
   &POclEnqueueMarkerWithWaitList,
-  NULL, /* &POclEnqueueBarrierWithWaitList, */
-  NULL, /* &POclGetExtensionFunctionAddressForPlatform, */
-  NULL, /* &POclCreateFromGLTexture,     */
+  &POclEnqueueBarrierWithWaitList,
+  &POclGetExtensionFunctionAddressForPlatform,
+  &POclCreateFromGLTexture,
   NULL, /* &clUnknown109 */
   NULL, /* &clUnknown110 */
   NULL, /* &clUnknown111 */
@@ -221,9 +221,10 @@ struct _cl_icd_dispatch pocl_dispatch = {
 #endif
 };
 
-struct _cl_platform_id _platforms[1]  = {{&pocl_dispatch}};
+static struct _cl_platform_id _platforms[1]  = {{&pocl_dispatch}};
 #else
-struct _cl_platform_id _platforms[1]  = {};
+
+static struct _cl_platform_id _platforms[1] = {{ 1 }};
 #endif
 
 #ifdef __GNUC__
@@ -238,7 +239,7 @@ CL_API_ENTRY cl_int CL_API_CALL
 POname(clGetPlatformIDs)(cl_uint           num_entries,
                  cl_platform_id *  platforms,
                  cl_uint *         num_platforms) CL_API_SUFFIX__VERSION_1_0
-{	
+{
   const unsigned num = 1;
   unsigned i;
   
diff --git a/lib/CL/clGetProgramBuildInfo.c b/lib/CL/clGetProgramBuildInfo.c
index 17db30b..85c9b7c 100644
--- a/lib/CL/clGetProgramBuildInfo.c
+++ b/lib/CL/clGetProgramBuildInfo.c
@@ -39,6 +39,8 @@ POname(clGetProgramBuildInfo)(cl_program            program,
 
   POCL_RETURN_ERROR_COND((program == NULL), CL_INVALID_PROGRAM);
 
+  POCL_RETURN_ERROR_COND ((device == NULL), CL_INVALID_DEVICE);
+
   int device_i = pocl_cl_device_to_index(program, device);
   POCL_RETURN_ERROR_ON((device_i < 0), CL_INVALID_DEVICE, "Program does not have "
     "this device in it's device list\n");
@@ -60,31 +62,22 @@ POname(clGetProgramBuildInfo)(cl_program            program,
       POCL_RETURN_ERROR_ON((program->build_status == CL_BUILD_NONE),
                            CL_INVALID_PROGRAM,
                            "Program was not built");
-      char *build_log;
       if (program->main_build_log[0])
-          build_log = strdup(program->main_build_log);
+        {
+          POCL_RETURN_GETINFO_STR (program->main_build_log);
+        }
       else if (program->build_log[device_i])
-          build_log = strdup(program->build_log[device_i]);
+        {
+          POCL_RETURN_GETINFO_STR (program->build_log[device_i]);
+        }
       else
-          build_log = pocl_cache_read_buildlog(program, device_i);
-      if (program->build_status == CL_BUILD_NONE)
-          build_log = "";
-      POCL_RETURN_ERROR_ON((build_log==NULL), CL_OUT_OF_HOST_MEMORY, "failed to read build log");
-
-      size_t const value_size = strlen(build_log) + 1;
-      if (param_value)
-      {
-        if (param_value_size < value_size)
         {
-            POCL_MEM_FREE(build_log);
-            return CL_INVALID_VALUE;
+          char *build_log = pocl_cache_read_buildlog (program, device_i);
+          if (build_log)
+            POCL_RETURN_GETINFO_STR_FREE (build_log);
         }
-        memcpy(param_value, build_log, value_size);
-      }
-      POCL_MEM_FREE(build_log);
-      if (param_value_size_ret)
-        *param_value_size_ret = value_size;
-      return CL_SUCCESS;
+
+      POCL_RETURN_GETINFO_STR ("");
     }
   case CL_PROGRAM_BINARY_TYPE:
     {
diff --git a/lib/CL/clGetSamplerInfo.c b/lib/CL/clGetSamplerInfo.c
index 02cb6a2..4ecbdf9 100644
--- a/lib/CL/clGetSamplerInfo.c
+++ b/lib/CL/clGetSamplerInfo.c
@@ -1,4 +1,27 @@
-#include "pocl_cl.h"
+/* OpenCL runtime library: clGetSamplerInfo()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_util.h"
 
 
 CL_API_ENTRY cl_int CL_API_CALL
@@ -8,8 +31,23 @@ POname(clGetSamplerInfo)(cl_sampler          sampler ,
                  void *              param_value ,
                  size_t *            param_value_size_ret ) CL_API_SUFFIX__VERSION_1_0
 {
-  POCL_ABORT_UNIMPLEMENTED("The entire clGetSamplerInfo call");
-  return CL_SUCCESS;
+  POCL_RETURN_ERROR_COND ((sampler == NULL), CL_INVALID_SAMPLER);
+
+  switch (param_name)
+    {
+    case CL_SAMPLER_REFERENCE_COUNT:
+      POCL_RETURN_GETINFO (cl_uint, sampler->pocl_refcount);
+    case CL_SAMPLER_CONTEXT:
+      POCL_RETURN_GETINFO (cl_context, sampler->context);
+    case CL_SAMPLER_NORMALIZED_COORDS:
+      POCL_RETURN_GETINFO (cl_bool, sampler->normalized_coords);
+    case CL_SAMPLER_ADDRESSING_MODE:
+      POCL_RETURN_GETINFO (cl_addressing_mode, sampler->addressing_mode);
+    case CL_SAMPLER_FILTER_MODE:
+      POCL_RETURN_GETINFO (cl_filter_mode, sampler->filter_mode);
+    }
+
+  return CL_INVALID_VALUE;
 }
 
 POsym(clGetSamplerInfo)
diff --git a/lib/CL/clLinkProgram.c b/lib/CL/clLinkProgram.c
new file mode 100644
index 0000000..0c9e267
--- /dev/null
+++ b/lib/CL/clLinkProgram.c
@@ -0,0 +1,121 @@
+/* OpenCL runtime library: clLinkProgram()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include "pocl_shared.h"
+#include "pocl_util.h"
+
+CL_API_ENTRY cl_program CL_API_CALL
+POname (clLinkProgram) (cl_context context,
+                        cl_uint num_devices,
+                        const cl_device_id *device_list,
+                        const char *options,
+                        cl_uint num_input_programs,
+                        const cl_program *input_programs,
+                        void (CL_CALLBACK *pfn_notify) (cl_program program, void *user_data),
+                        void *user_data,
+                        cl_int *errcode_ret)
+CL_API_SUFFIX__VERSION_1_2
+{
+  int errcode; unsigned i;
+  cl_program program = NULL;
+  cl_device_id *unique_devlist = NULL;
+
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (context == NULL), CL_INVALID_CONTEXT);
+
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_input_programs == 0),
+                        CL_INVALID_VALUE);
+
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (input_programs == NULL),
+                        CL_INVALID_VALUE);
+
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices > 0 && device_list == NULL),
+                        CL_INVALID_VALUE);
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices == 0 && device_list != NULL),
+                        CL_INVALID_VALUE);
+
+  for (i = 0; i < num_input_programs; i++)
+    {
+      cl_program p = input_programs[i];
+      POCL_GOTO_LABEL_ON (
+          PFN_NOTIFY,
+          ((p->binary_type != CL_PROGRAM_BINARY_TYPE_LIBRARY)
+           && (p->binary_type != CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT)),
+          CL_INVALID_OPERATION,
+          "clLinkProgram called for !library && !compiled_obj\n");
+    }
+
+  if (num_devices == 0)
+    {
+      num_devices = context->num_devices;
+      device_list = context->devices;
+    }
+  else
+    {
+      /* convert subdevices to devices and remove duplicates */
+      cl_uint real_num_devices = 0;
+      unique_devlist = pocl_unique_device_list (device_list,
+                                                num_devices,
+                                                &real_num_devices);
+      num_devices = real_num_devices;
+      device_list = unique_devlist;
+    }
+
+  program = create_program_skeleton (context, num_devices, device_list,
+                                     NULL, NULL, NULL, &errcode, 1);
+  if (errcode != CL_SUCCESS)
+    goto PFN_NOTIFY;
+
+  assert (num_devices == program->num_devices);
+
+  /* link the program */
+  errcode = compile_and_link_program (0, 1, program,
+                                      num_devices, device_list, options,
+                                      0, NULL, NULL,
+                                      num_input_programs, input_programs,
+                                      pfn_notify, user_data);
+
+  /* compile_and_link_program already called the callback */
+  goto ERROR;
+
+PFN_NOTIFY:
+  if (pfn_notify)
+    pfn_notify (program, user_data);
+
+ERROR:
+  POCL_MEM_FREE (unique_devlist);
+
+  if (errcode_ret)
+    *errcode_ret = errcode;
+
+  if (errcode == CL_SUCCESS)
+    {
+      return program;
+    }
+  else
+    {
+      POname (clReleaseProgram) (program);
+      return NULL;
+    }
+}
+POsym (clLinkProgram)
diff --git a/lib/CL/clReleaseCommandQueue.c b/lib/CL/clReleaseCommandQueue.c
index 321f9ee..6feaa45 100644
--- a/lib/CL/clReleaseCommandQueue.c
+++ b/lib/CL/clReleaseCommandQueue.c
@@ -23,7 +23,6 @@
 
 #include "pocl_cl.h"
 #include "pocl_util.h"
-#include "pocl_queue_util.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clReleaseCommandQueue)(cl_command_queue command_queue) CL_API_SUFFIX__VERSION_1_0
@@ -36,15 +35,21 @@ POname(clReleaseCommandQueue)(cl_command_queue command_queue) CL_API_SUFFIX__VER
 
   POname(clFlush)(command_queue);
   POCL_RELEASE_OBJECT(command_queue, new_refcount);
+  POCL_MSG_PRINT_REFCOUNTS ("Release Command Queue %p  %d\n", command_queue, new_refcount);
+
   if (new_refcount == 0)
     {
-      POCL_MSG_PRINT_INFO ("Free Command Queue %p\n", command_queue);
+      POCL_MSG_PRINT_REFCOUNTS ("Free Command Queue %p\n", command_queue);
       POname(clFinish)(command_queue);
-      pocl_queue_list_delete(command_queue);
+      if (command_queue->device->ops->free_queue)
+        command_queue->device->ops->free_queue (command_queue);
+      POCL_DESTROY_OBJECT (command_queue);
       POCL_MEM_FREE(command_queue);
 
       POname(clReleaseContext)(context);
       POname(clReleaseDevice)(device);
+
+      POCL_MSG_PRINT_REFCOUNTS ("Context refs after freeing CmdQueue: %d\n", context->pocl_refcount);
     }
   return CL_SUCCESS;
 }
diff --git a/lib/CL/clReleaseContext.c b/lib/CL/clReleaseContext.c
index 99d70d3..762686b 100644
--- a/lib/CL/clReleaseContext.c
+++ b/lib/CL/clReleaseContext.c
@@ -34,9 +34,11 @@ POname(clReleaseContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
       return CL_SUCCESS;
     }
 
+  POCL_MSG_PRINT_REFCOUNTS ("Release Context \n");
   POCL_RELEASE_OBJECT(context, new_refcount);
   if (new_refcount == 0)
     {
+      POCL_MSG_PRINT_REFCOUNTS ("Free Context %p\n", context);
       /* The context holds references to all its devices,
          memory objects, command-queues etc. Release the
          references and let the objects to get freed. */
@@ -49,6 +51,7 @@ POname(clReleaseContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
         }   
       POCL_MEM_FREE(context->devices);
       POCL_MEM_FREE(context->properties);
+      POCL_DESTROY_OBJECT (context);
       POCL_MEM_FREE(context);
     }
   return CL_SUCCESS;
diff --git a/lib/CL/clReleaseDevice.c b/lib/CL/clReleaseDevice.c
index b48b29e..f48c9e0 100644
--- a/lib/CL/clReleaseDevice.c
+++ b/lib/CL/clReleaseDevice.c
@@ -33,7 +33,15 @@ POname(clReleaseDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2
   POCL_RELEASE_OBJECT (device, new_refcount);
 
   if (new_refcount == 0)
-    POCL_MEM_FREE(device);
+    {
+      POCL_DESTROY_OBJECT (device);
+      POCL_MEM_FREE (device->partition_type);
+      POCL_MSG_PRINT_REFCOUNTS ("Free Device %p\n", device);
+      POCL_MEM_FREE (device);
+    }
+  else
+    POCL_MSG_PRINT_REFCOUNTS ("Release Device %p : %u\n", device,
+                              device->pocl_refcount);
 
   return CL_SUCCESS;
 }
diff --git a/lib/CL/clReleaseEvent.c b/lib/CL/clReleaseEvent.c
index cb4d270..d822326 100644
--- a/lib/CL/clReleaseEvent.c
+++ b/lib/CL/clReleaseEvent.c
@@ -36,7 +36,23 @@ POname(clReleaseEvent)(cl_event event) CL_API_SUFFIX__VERSION_1_0
   
   if (new_refcount == 0)
     {
-      POCL_MSG_PRINT_INFO ("Freeing event %d\n", event->id);
+      event_callback_item *cb_ptr = NULL;
+      event_callback_item *next = NULL;
+      for (cb_ptr = event->callback_list; cb_ptr; cb_ptr = next)
+        {
+          next = cb_ptr->next;
+          POCL_MEM_FREE (cb_ptr);
+        }
+
+      if (event->command_type == CL_COMMAND_USER)
+        {
+          pocl_user_event_data *p = event->data;
+          pthread_cond_destroy (&p->wakeup_cond);
+          pthread_mutex_destroy (&p->lock);
+          POCL_MEM_FREE (p);
+        }
+
+      POCL_MSG_PRINT_REFCOUNTS ("Free event %d\n", event->id);
       if (event->command_type != CL_COMMAND_USER &&
           event->queue->device->ops->free_event_data)
         event->queue->device->ops->free_event_data(event);
@@ -45,6 +61,7 @@ POname(clReleaseEvent)(cl_event event) CL_API_SUFFIX__VERSION_1_0
       if (event->queue)
         POname(clReleaseCommandQueue) (event->queue);
 
+      POCL_DESTROY_OBJECT (event);
       pocl_mem_manager_free_event (event);
     }
 
diff --git a/lib/CL/clReleaseKernel.c b/lib/CL/clReleaseKernel.c
index dcb9e72..6a98d08 100644
--- a/lib/CL/clReleaseKernel.c
+++ b/lib/CL/clReleaseKernel.c
@@ -33,17 +33,22 @@ POname(clReleaseKernel)(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0
 
   POCL_RETURN_ERROR_COND((kernel == NULL), CL_INVALID_KERNEL);
   POCL_RELEASE_OBJECT (kernel, new_refcount);
+  POCL_MSG_PRINT_REFCOUNTS ("Release kernel %p  %d\n", kernel, new_refcount);
 
   if (new_refcount == 0)
     {
-      POCL_MSG_PRINT_INFO ("Freeing kernel %p\n", kernel);
-      if (kernel->program != NULL)
+      POCL_MSG_PRINT_REFCOUNTS ("Free kernel %p\n", kernel);
+      cl_program program = kernel->program;
+      /* default kernels are not put into the program->kernels linked list */
+      if ((program != NULL)
+          && (!program->operating_on_default_kernels))
         {
           /* Find the kernel in the program's linked list of kernels */
-          POCL_LOCK_OBJ (kernel->program);
-          for (pk=&kernel->program->kernels; *pk != NULL; pk = &(*pk)->next)
+          POCL_LOCK_OBJ (program);
+          for (pk = &program->kernels; *pk != NULL; pk = &(*pk)->next)
             {
-              if (*pk == kernel) break;
+              if (*pk == kernel)
+                break;
             }
           if (*pk == NULL)
             {
@@ -51,29 +56,36 @@ POname(clReleaseKernel)(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0
                  of kernels -- something is wrong */
               return CL_INVALID_VALUE;
             }
-          
+
           /* Remove the kernel from the program's linked list of
              kernels */
           *pk = (*pk)->next;
-          POCL_UNLOCK_OBJ (kernel->program);
-          POname(clReleaseProgram) (kernel->program);
+          POCL_UNLOCK_OBJ (program);
+          POname (clReleaseProgram) (program);
+          POCL_MSG_PRINT_REFCOUNTS ("Released non-default kernel kernel %p, program %p now has refs: %d \n", kernel, kernel->program, kernel->program->pocl_refcount);
         }
-      
-      POCL_MEM_FREE(kernel->name);
 
-      for (i = 0; i < kernel->num_args; i++)
-        {
-          struct pocl_argument *p = &(kernel->dyn_arguments[i]);
-          if (p->value != NULL)
-            {
-              pocl_aligned_free (p->value);
-              p->value = NULL;
-            }
-        }
+      POCL_MEM_FREE (kernel->name);
+      POCL_MEM_FREE (kernel->attributes);
+
+      if (kernel->arg_info)
+        for (i = 0; i < kernel->num_args; i++)
+          {
+            POCL_MEM_FREE (kernel->arg_info[i].name);
+            POCL_MEM_FREE (kernel->arg_info[i].type_name);
+          }
+
+      if (kernel->dyn_arguments)
+        for (i = 0; i < (kernel->num_args + kernel->num_locals); i++)
+          {
+            pocl_aligned_free (kernel->dyn_arguments[i].value);
+          }
 
-      POCL_MEM_FREE(kernel->dyn_arguments);
-      POCL_MEM_FREE(kernel->reqd_wg_size);
-      POCL_MEM_FREE(kernel);
+      POCL_MEM_FREE (kernel->arg_info);
+      POCL_MEM_FREE (kernel->dyn_arguments);
+      POCL_MEM_FREE (kernel->reqd_wg_size);
+      POCL_DESTROY_OBJECT (kernel);
+      POCL_MEM_FREE (kernel);
     }
   
   return CL_SUCCESS;
diff --git a/lib/CL/clReleaseMemObject.c b/lib/CL/clReleaseMemObject.c
index 3238c54..5e40f6a 100644
--- a/lib/CL/clReleaseMemObject.c
+++ b/lib/CL/clReleaseMemObject.c
@@ -39,6 +39,8 @@ POname(clReleaseMemObject)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
 
   POCL_RELEASE_OBJECT(memobj, new_refcount);
 
+  POCL_MSG_PRINT_REFCOUNTS ("Release mem obj %p  %d\n", memobj, new_refcount);
+
   /* OpenCL 1.2 Page 118:
 
      After the memobj reference count becomes zero and commands queued for execution on 
@@ -49,7 +51,15 @@ POname(clReleaseMemObject)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
 
   if (new_refcount == 0)
     {
-      POCL_MSG_PRINT_INFO ("free mem obj %p\n", memobj);
+      if (memobj->is_image && (memobj->type == CL_MEM_OBJECT_IMAGE1D_BUFFER))
+        {
+          cl_mem b = memobj->buffer;
+          assert (b);
+          cl_int err = POname (clReleaseMemObject) (b);
+          POCL_MEM_FREE (memobj);
+          return err;
+        }
+      POCL_MSG_PRINT_REFCOUNTS ("Free mem obj %p\n", memobj);
       if (memobj->parent == NULL)
         {
           cl_device_id shared_mem_owner_dev =
@@ -94,6 +104,7 @@ POname(clReleaseMemObject)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
         callback = next_callback;
       }
 
+      POCL_DESTROY_OBJECT (memobj);
       POCL_MEM_FREE(memobj);
 
       if (parent)
diff --git a/lib/CL/clReleaseProgram.c b/lib/CL/clReleaseProgram.c
index 5e21ad1..9907d68 100644
--- a/lib/CL/clReleaseProgram.c
+++ b/lib/CL/clReleaseProgram.c
@@ -33,6 +33,7 @@
 #include "pocl_cl.h"
 #include "pocl_util.h"
 #include "pocl_cache.h"
+#include "pocl_llvm.h"
 #include "devices.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
@@ -45,10 +46,12 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
   POCL_RETURN_ERROR_COND((program == NULL), CL_INVALID_PROGRAM);
 
   POCL_RELEASE_OBJECT (program, new_refcount);
+  POCL_MSG_PRINT_REFCOUNTS ("Release program %p, new refcount: %d, kernel #: %zu \n", program, new_refcount, program->num_kernels);
 
   if (new_refcount == 0)
     {
       cl_context context = program->context;
+      POCL_MSG_PRINT_REFCOUNTS ("Free program %p\n", program);
 
       /* Mark all kernels as having no program.
          FIXME: this should not be needed if the kernels
@@ -59,9 +62,6 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
           k->program = NULL;
         }
 
-      if (program->buildprogram_callback)
-        POCL_MEM_FREE(program->buildprogram_callback);
-
       if (program->devices != program->context->devices)
         POCL_MEM_FREE(program->devices);
 
@@ -73,6 +73,12 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
           POCL_MEM_FREE(program->binaries[i]);
       POCL_MEM_FREE(program->binaries);
 
+      POCL_MEM_FREE(program->pocl_binary_sizes);
+      if (program->pocl_binaries)
+        for (i = 0; i < program->num_devices; ++i)
+          POCL_MEM_FREE(program->pocl_binaries[i]);
+      POCL_MEM_FREE(program->pocl_binaries);
+
       pocl_cache_cleanup_cachedir(program);
 
       if (program->build_log)
@@ -80,9 +86,10 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
           POCL_MEM_FREE(program->build_log[i]);
       POCL_MEM_FREE(program->build_log);
 
+      program->operating_on_default_kernels = 1;
       if (program->num_kernels)
         {
-          for (i=0; i < program->num_kernels; i++)
+          for (i = 0; i < program->num_kernels; i++)
             {
               if (program->kernel_names)
                 POCL_MEM_FREE(program->kernel_names[i]);
@@ -94,7 +101,16 @@ POname(clReleaseProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
         }
 
       POCL_MEM_FREE(program->build_hash);
+      POCL_MEM_FREE(program->compiler_options);
+
+#ifdef OCS_AVAILABLE
+      if (program->llvm_irs)
+        for (i = 0; i < program->num_devices; ++i)
+          pocl_free_llvm_irs (program, i);
+#endif
+
       POCL_MEM_FREE(program->llvm_irs);
+      POCL_DESTROY_OBJECT (program);
       POCL_MEM_FREE(program);
 
       POname(clReleaseContext)(context);
diff --git a/lib/CL/clReleaseSampler.c b/lib/CL/clReleaseSampler.c
index b9d1d86..368dc4f 100644
--- a/lib/CL/clReleaseSampler.c
+++ b/lib/CL/clReleaseSampler.c
@@ -1,8 +1,46 @@
-#include "pocl_cl.h"
-extern CL_API_ENTRY cl_int CL_API_CALL
+/* OpenCL runtime library: clReleaseSampler()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_util.h"
+
+CL_API_ENTRY cl_int CL_API_CALL
 POname(clReleaseSampler)(cl_sampler sampler)
 CL_API_SUFFIX__VERSION_1_0
 {
+  POCL_RETURN_ERROR_COND ((sampler == NULL), CL_INVALID_SAMPLER);
+
+  int new_refcount;
+  POCL_RELEASE_OBJECT (sampler, new_refcount);
+  POCL_MSG_PRINT_REFCOUNTS ("RELEASE Sampler %p, REFCNT: %d\n", sampler,
+                            new_refcount);
+
+  if (new_refcount == 0)
+    {
+      POname (clReleaseContext) (sampler->context);
+      POCL_DESTROY_OBJECT (sampler);
+      POCL_MEM_FREE (sampler);
+    }
+
   return CL_SUCCESS;
 }
 POsym(clReleaseSampler)
diff --git a/lib/CL/clRetainCommandQueue.c b/lib/CL/clRetainCommandQueue.c
index f7acb77..dc10710 100644
--- a/lib/CL/clRetainCommandQueue.c
+++ b/lib/CL/clRetainCommandQueue.c
@@ -28,6 +28,7 @@ POname(clRetainCommandQueue)(cl_command_queue command_queue) CL_API_SUFFIX__VERS
 {
   POCL_RETURN_ERROR_COND((command_queue == NULL), CL_INVALID_COMMAND_QUEUE);
   POCL_RETAIN_OBJECT(command_queue);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain Command Queue %p  : %d\n", command_queue, command_queue->pocl_refcount);
   return CL_SUCCESS;
 }
 POsym(clRetainCommandQueue)
diff --git a/lib/CL/clRetainContext.c b/lib/CL/clRetainContext.c
index e0b6a4b..7e12552 100644
--- a/lib/CL/clRetainContext.c
+++ b/lib/CL/clRetainContext.c
@@ -28,6 +28,7 @@ POname(clRetainContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
 {
   POCL_RETURN_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
   POCL_RETAIN_OBJECT(context);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain Context %p  : %d\n", context, context->pocl_refcount);
   return CL_SUCCESS;
 }
 POsym(clRetainContext)
diff --git a/lib/CL/clRetainDevice.c b/lib/CL/clRetainDevice.c
index 5f7e10a..4ca734d 100644
--- a/lib/CL/clRetainDevice.c
+++ b/lib/CL/clRetainDevice.c
@@ -29,6 +29,7 @@ POname(clRetainDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2
     return CL_SUCCESS;
 
   POCL_RETAIN_OBJECT (device);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain Device %p  : %d\n", device, device->pocl_refcount);
   return CL_SUCCESS;
 }
 POsym(clRetainDevice)
diff --git a/lib/CL/clRetainEvent.c b/lib/CL/clRetainEvent.c
index b73e9df..ddc6396 100644
--- a/lib/CL/clRetainEvent.c
+++ b/lib/CL/clRetainEvent.c
@@ -1,3 +1,26 @@
+/* OpenCL runtime library: clRetainEvent()
+
+   Copyright (c) 2012-2017 pocl developers
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
 #include "pocl_cl.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
@@ -6,6 +29,7 @@ POname(clRetainEvent)(cl_event  event ) CL_API_SUFFIX__VERSION_1_0
   POCL_RETURN_ERROR_COND((event == NULL), CL_INVALID_EVENT);
 
   POCL_RETAIN_OBJECT(event);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain Event %p  : %d\n", event, event->pocl_refcount);
 
   return CL_SUCCESS;
 }
diff --git a/lib/CL/clRetainKernel.c b/lib/CL/clRetainKernel.c
index 0bd033d..8e1101b 100644
--- a/lib/CL/clRetainKernel.c
+++ b/lib/CL/clRetainKernel.c
@@ -28,6 +28,7 @@ POname(clRetainKernel)(cl_kernel kernel) CL_API_SUFFIX__VERSION_1_0
 {
   POCL_RETURN_ERROR_COND((kernel == NULL), CL_INVALID_KERNEL);
   POCL_RETAIN_OBJECT (kernel);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain Kernel %p  : %d\n", kernel, kernel->pocl_refcount);
   return CL_SUCCESS;
 }
 POsym(clRetainKernel)
diff --git a/lib/CL/clRetainMemObject.c b/lib/CL/clRetainMemObject.c
index fa5df37..fe94db4 100644
--- a/lib/CL/clRetainMemObject.c
+++ b/lib/CL/clRetainMemObject.c
@@ -28,6 +28,7 @@ POname(clRetainMemObject)(cl_mem memobj) CL_API_SUFFIX__VERSION_1_0
 {
   POCL_RETURN_ERROR_COND((memobj == NULL), CL_INVALID_MEM_OBJECT);
   POCL_RETAIN_OBJECT(memobj);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain MemObj %p  : %d\n", memobj, memobj->pocl_refcount);
   return CL_SUCCESS;
 }
 POsym(clRetainMemObject)
diff --git a/lib/CL/clRetainProgram.c b/lib/CL/clRetainProgram.c
index dc8d227..8c95816 100644
--- a/lib/CL/clRetainProgram.c
+++ b/lib/CL/clRetainProgram.c
@@ -28,6 +28,7 @@ POname(clRetainProgram)(cl_program program) CL_API_SUFFIX__VERSION_1_0
 {
   POCL_RETURN_ERROR_COND((program == NULL), CL_INVALID_PROGRAM);
   POCL_RETAIN_OBJECT(program);
+  POCL_MSG_PRINT_REFCOUNTS ("Retain Program %p  : %d\n", program, program->pocl_refcount);
   return CL_SUCCESS;
 }
 POsym(clRetainProgram)
diff --git a/lib/CL/clRetainSampler.c b/lib/CL/clRetainSampler.c
index b002d82..b6676b5 100644
--- a/lib/CL/clRetainSampler.c
+++ b/lib/CL/clRetainSampler.c
@@ -1,9 +1,37 @@
-#include "pocl_cl.h"
+/* OpenCL runtime library: clRetainSampler()
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainSampler)(cl_sampler  sampler ) CL_API_SUFFIX__VERSION_1_0
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_util.h"
+
+CL_API_ENTRY cl_int CL_API_CALL POname (clRetainSampler) (cl_sampler sampler)
+    CL_API_SUFFIX__VERSION_1_0
 {
-  POCL_ABORT_UNIMPLEMENTED("The entire clRetainSampler call");
+  POCL_RETURN_ERROR_COND ((sampler == NULL), CL_INVALID_SAMPLER);
+
+  POCL_RETAIN_OBJECT (sampler);
+  POCL_MSG_PRINT_REFCOUNTS ("RETAIN Sampler %p  : %d\n", sampler,
+                            sampler->pocl_refcount);
+
   return CL_SUCCESS;
 }
 
diff --git a/lib/CL/clSetKernelArg.c b/lib/CL/clSetKernelArg.c
index 086bbc1..f849a76 100644
--- a/lib/CL/clSetKernelArg.c
+++ b/lib/CL/clSetKernelArg.c
@@ -51,20 +51,49 @@ POname(clSetKernelArg)(cl_kernel kernel,
 
   pi = &(kernel->arg_info[arg_index]);
 
-  POCL_RETURN_ERROR_ON((arg_size == 0 && pi->is_local),
-    CL_INVALID_ARG_SIZE, "arg_size == 0 and arg %u is in local address space\n",
-    arg_index);
-
-  POCL_RETURN_ERROR_ON(((pi->type == POCL_ARG_TYPE_POINTER
-    || pi->type == POCL_ARG_TYPE_IMAGE)
-    && (!pi->is_local) && (arg_size != sizeof(cl_mem))),
-    CL_INVALID_ARG_SIZE, "Arg %u is pointer/buffer/image, but arg_size is "
-    "not sizeof(cl_mem)", arg_index);
-
-  POCL_RETURN_ERROR_ON((pi->type == POCL_ARG_TYPE_SAMPLER
-    && (arg_size != sizeof(cl_sampler_t))),
-    CL_INVALID_ARG_SIZE, "Arg %u is sampler, but arg_size is "
-    "not sizeof(cl_sampler_t)", arg_index);
+  POCL_MSG_PRINT_INFO ("ARG TYPE: %s \n", pi->type_name);
+
+  POCL_RETURN_ERROR_ON (
+      ((arg_value != NULL) && pi->is_local), CL_INVALID_ARG_VALUE,
+      "arg_value != NULl and arg %u is in local address space\n", arg_index);
+
+  /* Trigger CL_INVALID_ARG_VALUE if arg_value specified is NULL
+   * for an argument that is not declared with the __local qualifier. */
+  POCL_RETURN_ERROR_ON (
+      ((arg_value == NULL) && (!pi->is_local)
+       && (pi->type != POCL_ARG_TYPE_POINTER)),
+      CL_INVALID_ARG_VALUE,
+      "arg_value == NULL and arg %u is not in local address space\n",
+      arg_index);
+
+  /* Trigger CL_INVALID_ARG_SIZE if arg_size is zero
+   * and the argument is declared with the __local qualifier. */
+  POCL_RETURN_ERROR_ON (((arg_size == 0) && pi->is_local), CL_INVALID_ARG_SIZE,
+                        "arg_size == 0 and arg %u is in local address space\n",
+                        arg_index);
+
+  POCL_RETURN_ERROR_ON (
+      ((pi->type == POCL_ARG_TYPE_SAMPLER) && (arg_value == NULL)),
+      CL_INVALID_SAMPLER, "arg_value == NULL and arg is a cl_sampler\n");
+
+  if (pi->type == POCL_ARG_TYPE_POINTER || pi->type == POCL_ARG_TYPE_IMAGE
+      || pi->type == POCL_ARG_TYPE_SAMPLER)
+    POCL_RETURN_ERROR_ON (((!pi->is_local) && (arg_size != sizeof (cl_mem))),
+                          CL_INVALID_ARG_SIZE,
+                          "Arg %u is pointer/buffer/image, but arg_size is "
+                          "not sizeof(cl_mem)\n",
+                          arg_index);
+  else if (pi->type_size)
+    {
+      size_t as = arg_size;
+      /* handle <type>3 vectors, we accept both <type>3 and <type>4 sizes */
+      if (as % 3 == 0)
+        as = (as / 3) * 4;
+      POCL_RETURN_ERROR_ON (
+          (pi->type_size != as), CL_INVALID_ARG_SIZE,
+          "Arg %u is %s, but arg_size is not sizeof(%s) == %u\n", arg_index,
+          pi->type_name, pi->type_name, pi->type_size);
+    }
 
   p = &(kernel->dyn_arguments[arg_index]); 
   POCL_LOCK_OBJ (kernel);
diff --git a/lib/CL/clSetUserEventStatus.c b/lib/CL/clSetUserEventStatus.c
index 320815d..f608499 100644
--- a/lib/CL/clSetUserEventStatus.c
+++ b/lib/CL/clSetUserEventStatus.c
@@ -6,21 +6,39 @@ POname(clSetUserEventStatus)(cl_event event ,
                              cl_int   execution_status ) 
 CL_API_SUFFIX__VERSION_1_1
 {
+  int errcode;
   /* Must be a valid user event */
-  POCL_RETURN_ERROR_COND((event == NULL), CL_INVALID_EVENT);
-  POCL_RETURN_ERROR_COND((event->command_type != CL_COMMAND_USER), CL_INVALID_EVENT);
+  POCL_RETURN_ERROR_COND ((event == NULL), CL_INVALID_EVENT);
   /* Can only be set to CL_COMPLETE (0) or negative values */
-  POCL_RETURN_ERROR_COND((execution_status > CL_COMPLETE), CL_INVALID_VALUE);
-  /* Can only be done once */
-  POCL_RETURN_ERROR_COND((event->status <= CL_COMPLETE), CL_INVALID_OPERATION);
+  POCL_RETURN_ERROR_COND ((execution_status > CL_COMPLETE), CL_INVALID_VALUE);
 
   POCL_LOCK_OBJ (event);
+
+  POCL_GOTO_ERROR_COND ((event->command_type != CL_COMMAND_USER),
+                        CL_INVALID_EVENT);
+  /* Can only be done once */
+  POCL_GOTO_ERROR_COND ((event->status <= CL_COMPLETE), CL_INVALID_OPERATION);
+
   event->status = execution_status;
-  if (execution_status == CL_COMPLETE)
+  POCL_UNLOCK_OBJ (event);
+
+  if (execution_status <= CL_COMPLETE)
     {
+      POCL_MSG_PRINT_EVENTS ("User event %u completed with status: %i\n",
+                             event->id, execution_status);
       pocl_broadcast (event);
+      pocl_event_updated (event, execution_status);
     }
-  POCL_UNLOCK_OBJ (event);
+
+  pocl_user_event_data *p = event->data;
+  POCL_LOCK (p->lock);
+  pthread_cond_broadcast (&p->wakeup_cond);
+  POCL_UNLOCK (p->lock);
+
   return CL_SUCCESS;
+
+ERROR:
+  POCL_UNLOCK_OBJ (event);
+  return errcode;
 }
 POsym(clSetUserEventStatus)
diff --git a/lib/CL/clUnloadCompiler.c b/lib/CL/clUnloadCompiler.c
index 6ec6bd8..f8785fb 100644
--- a/lib/CL/clUnloadCompiler.c
+++ b/lib/CL/clUnloadCompiler.c
@@ -1,8 +1,35 @@
+/* OpenCL runtime library: clUnloadCompiler()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
 #include "pocl_cl.h"
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clUnloadCompiler)(void)
-CL_API_SUFFIX__VERSION_1_0
+#include "pocl_llvm.h"
+
+CL_API_ENTRY cl_int CL_API_CALL POname (clUnloadCompiler) (void)
+    CL_API_SUFFIX__VERSION_1_1
 {
+#ifdef OCS_AVAILABLE
+  pocl_llvm_release ();
+#endif
   return CL_SUCCESS;
 }
 POsym(clUnloadCompiler)
diff --git a/lib/CL/clUnloadPlatformCompiler.c b/lib/CL/clUnloadPlatformCompiler.c
index 979d543..d63b1d2 100644
--- a/lib/CL/clUnloadPlatformCompiler.c
+++ b/lib/CL/clUnloadPlatformCompiler.c
@@ -22,11 +22,29 @@
  **/
 
 #include "pocl_cl.h"
+#include "pocl_llvm.h"
 
 CL_API_ENTRY cl_int CL_API_CALL
 POname(clUnloadPlatformCompiler)(cl_platform_id platform)
 CL_API_SUFFIX__VERSION_1_2
 {
+#if defined(OCS_AVAILABLE)
+  cl_platform_id pocl_id;
+  POname (clGetPlatformIDs) (1, &pocl_id, NULL);
+  if (platform == pocl_id)
+    {
+      pocl_llvm_release ();
+    }
+  else
+    {
+      POCL_MSG_WARN (
+          "clUnloadPlatformCompiler called with non-pocl platform! \n");
+      return CL_INVALID_PLATFORM;
+    }
+#else
+  POCL_MSG_WARN (
+      "clUnloadPlatformCompiler called with LLVM-less build of pocl! \n");
+#endif
   return CL_SUCCESS;
 }
 POsym(clUnloadPlatformCompiler)
diff --git a/lib/CL/clWaitForEvents.c b/lib/CL/clWaitForEvents.c
index 353fdb8..b403699 100644
--- a/lib/CL/clWaitForEvents.c
+++ b/lib/CL/clWaitForEvents.c
@@ -29,6 +29,7 @@ POname(clWaitForEvents)(cl_uint              num_events ,
 {
   unsigned event_i;
   cl_device_id dev;
+  cl_int ret = CL_SUCCESS;
   POCL_RETURN_ERROR_COND((num_events == 0 || event_list == NULL), CL_INVALID_VALUE);
 
   for (event_i = 0; event_i < num_events; ++event_i)
@@ -51,12 +52,26 @@ POname(clWaitForEvents)(cl_uint              num_events ,
         dev->ops->wait_event(dev, event_list[event_i]);
       else
         POname(clFinish)(event_list[event_i]->queue);
+      if (event_list[event_i]->status < 0)
+        ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
     }
   /* brute force wait for user events */
+  struct timespec time_to_wait = { 0, 0 };
   for (event_i = 0; event_i < num_events; ++event_i)
     if (event_list[event_i]->command_type == CL_COMMAND_USER)
-      while (event_list[event_i]->status != CL_COMPLETE){}
+      {
+        while (event_list[event_i]->status > CL_COMPLETE)
+          {
+            pocl_user_event_data *p = event_list[event_i]->data;
+            POCL_LOCK (p->lock);
+            time_to_wait.tv_sec = time (NULL) + 1;
+            pthread_cond_timedwait (&p->wakeup_cond, &p->lock, &time_to_wait);
+            POCL_UNLOCK (p->lock);
+          }
+        if (event_list[event_i]->status < 0)
+          ret = CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST;
+      }
 
-  return CL_SUCCESS;
+  return ret;
 }
 POsym(clWaitForEvents)
diff --git a/lib/CL/devices/CMakeLists.txt b/lib/CL/devices/CMakeLists.txt
index 9243a4b..b558a1b 100644
--- a/lib/CL/devices/CMakeLists.txt
+++ b/lib/CL/devices/CMakeLists.txt
@@ -49,7 +49,12 @@ if(ENABLE_HSA)
   list(APPEND POCL_DEVICES_LINK_LIST ${HSALIB})
 endif()
 
-set(POCL_DEVICES_SOURCES 
+if(ENABLE_CUDA)
+  add_subdirectory("cuda")
+  list(APPEND POCL_DEVICES_LINK_LIST cuda)
+endif()
+
+set(POCL_DEVICES_SOURCES
   devices.h  devices.c
   bufalloc.c  dev_image.h
   common.h common.c
diff --git a/lib/CL/devices/basic/basic.c b/lib/CL/devices/basic/basic.c
index d6ad663..433ce94 100644
--- a/lib/CL/devices/basic/basic.c
+++ b/lib/CL/devices/basic/basic.c
@@ -47,9 +47,6 @@
 
 #define max(a,b) (((a) > (b)) ? (a) : (b))
 
-#define COMMAND_LENGTH 2048
-#define WORKGROUP_STRING_LENGTH 1024
-
 struct data {
   /* Currently loaded kernel. */
   cl_kernel current_kernel;
@@ -64,103 +61,21 @@ struct data {
 };
 
 static const cl_image_format supported_image_formats[] = {
-    { CL_R, CL_SNORM_INT8 },
-    { CL_R, CL_SNORM_INT16 },
-    { CL_R, CL_UNORM_INT8 },
-    { CL_R, CL_UNORM_INT16 },
-    { CL_R, CL_UNORM_SHORT_565 }, 
-    { CL_R, CL_UNORM_SHORT_555 },
-    { CL_R, CL_UNORM_INT_101010 }, 
-    { CL_R, CL_SIGNED_INT8 },
-    { CL_R, CL_SIGNED_INT16 }, 
-    { CL_R, CL_SIGNED_INT32 },
-    { CL_R, CL_UNSIGNED_INT8 }, 
-    { CL_R, CL_UNSIGNED_INT16 },
-    { CL_R, CL_UNSIGNED_INT32 }, 
-    { CL_R, CL_HALF_FLOAT },
-    { CL_R, CL_FLOAT },
-    { CL_Rx, CL_SNORM_INT8 },
-    { CL_Rx, CL_SNORM_INT16 },
-    { CL_Rx, CL_UNORM_INT8 },
-    { CL_Rx, CL_UNORM_INT16 },
-    { CL_Rx, CL_UNORM_SHORT_565 }, 
-    { CL_Rx, CL_UNORM_SHORT_555 },
-    { CL_Rx, CL_UNORM_INT_101010 }, 
-    { CL_Rx, CL_SIGNED_INT8 },
-    { CL_Rx, CL_SIGNED_INT16 }, 
-    { CL_Rx, CL_SIGNED_INT32 },
-    { CL_Rx, CL_UNSIGNED_INT8 }, 
-    { CL_Rx, CL_UNSIGNED_INT16 },
-    { CL_Rx, CL_UNSIGNED_INT32 }, 
-    { CL_Rx, CL_HALF_FLOAT },
-    { CL_Rx, CL_FLOAT },
     { CL_A, CL_SNORM_INT8 },
     { CL_A, CL_SNORM_INT16 },
     { CL_A, CL_UNORM_INT8 },
     { CL_A, CL_UNORM_INT16 },
-    { CL_A, CL_UNORM_SHORT_565 }, 
-    { CL_A, CL_UNORM_SHORT_555 },
-    { CL_A, CL_UNORM_INT_101010 }, 
     { CL_A, CL_SIGNED_INT8 },
     { CL_A, CL_SIGNED_INT16 }, 
     { CL_A, CL_SIGNED_INT32 },
     { CL_A, CL_UNSIGNED_INT8 }, 
     { CL_A, CL_UNSIGNED_INT16 },
     { CL_A, CL_UNSIGNED_INT32 }, 
-    { CL_A, CL_HALF_FLOAT },
     { CL_A, CL_FLOAT },
-    { CL_RG, CL_SNORM_INT8 },
-    { CL_RG, CL_SNORM_INT16 },
-    { CL_RG, CL_UNORM_INT8 },
-    { CL_RG, CL_UNORM_INT16 },
-    { CL_RG, CL_UNORM_SHORT_565 }, 
-    { CL_RG, CL_UNORM_SHORT_555 },
-    { CL_RG, CL_UNORM_INT_101010 }, 
-    { CL_RG, CL_SIGNED_INT8 },
-    { CL_RG, CL_SIGNED_INT16 }, 
-    { CL_RG, CL_SIGNED_INT32 },
-    { CL_RG, CL_UNSIGNED_INT8 }, 
-    { CL_RG, CL_UNSIGNED_INT16 },
-    { CL_RG, CL_UNSIGNED_INT32 }, 
-    { CL_RG, CL_HALF_FLOAT },
-    { CL_RG, CL_FLOAT },
-    { CL_RGx, CL_SNORM_INT8 },
-    { CL_RGx, CL_SNORM_INT16 },
-    { CL_RGx, CL_UNORM_INT8 },
-    { CL_RGx, CL_UNORM_INT16 },
-    { CL_RGx, CL_UNORM_SHORT_565 }, 
-    { CL_RGx, CL_UNORM_SHORT_555 },
-    { CL_RGx, CL_UNORM_INT_101010 }, 
-    { CL_RGx, CL_SIGNED_INT8 },
-    { CL_RGx, CL_SIGNED_INT16 }, 
-    { CL_RGx, CL_SIGNED_INT32 },
-    { CL_RGx, CL_UNSIGNED_INT8 }, 
-    { CL_RGx, CL_UNSIGNED_INT16 },
-    { CL_RGx, CL_UNSIGNED_INT32 }, 
-    { CL_RGx, CL_HALF_FLOAT },
-    { CL_RGx, CL_FLOAT },
-    { CL_RA, CL_SNORM_INT8 },
-    { CL_RA, CL_SNORM_INT16 },
-    { CL_RA, CL_UNORM_INT8 },
-    { CL_RA, CL_UNORM_INT16 },
-    { CL_RA, CL_UNORM_SHORT_565 }, 
-    { CL_RA, CL_UNORM_SHORT_555 },
-    { CL_RA, CL_UNORM_INT_101010 }, 
-    { CL_RA, CL_SIGNED_INT8 },
-    { CL_RA, CL_SIGNED_INT16 }, 
-    { CL_RA, CL_SIGNED_INT32 },
-    { CL_RA, CL_UNSIGNED_INT8 }, 
-    { CL_RA, CL_UNSIGNED_INT16 },
-    { CL_RA, CL_UNSIGNED_INT32 }, 
-    { CL_RA, CL_HALF_FLOAT },
-    { CL_RA, CL_FLOAT },
     { CL_RGBA, CL_SNORM_INT8 },
     { CL_RGBA, CL_SNORM_INT16 },
     { CL_RGBA, CL_UNORM_INT8 },
     { CL_RGBA, CL_UNORM_INT16 },
-    { CL_RGBA, CL_UNORM_SHORT_565 }, 
-    { CL_RGBA, CL_UNORM_SHORT_555 },
-    { CL_RGBA, CL_UNORM_INT_101010 }, 
     { CL_RGBA, CL_SIGNED_INT8 },
     { CL_RGBA, CL_SIGNED_INT16 }, 
     { CL_RGBA, CL_SIGNED_INT32 },
@@ -169,24 +84,6 @@ static const cl_image_format supported_image_formats[] = {
     { CL_RGBA, CL_UNSIGNED_INT32 }, 
     { CL_RGBA, CL_HALF_FLOAT },
     { CL_RGBA, CL_FLOAT },
-    { CL_INTENSITY, CL_UNORM_INT8 }, 
-    { CL_INTENSITY, CL_UNORM_INT16 }, 
-    { CL_INTENSITY, CL_SNORM_INT8 }, 
-    { CL_INTENSITY, CL_SNORM_INT16 }, 
-    { CL_INTENSITY, CL_HALF_FLOAT }, 
-    { CL_INTENSITY, CL_FLOAT },
-    { CL_LUMINANCE, CL_UNORM_INT8 }, 
-    { CL_LUMINANCE, CL_UNORM_INT16 }, 
-    { CL_LUMINANCE, CL_SNORM_INT8 }, 
-    { CL_LUMINANCE, CL_SNORM_INT16 }, 
-    { CL_LUMINANCE, CL_HALF_FLOAT }, 
-    { CL_LUMINANCE, CL_FLOAT },
-    { CL_RGB, CL_UNORM_SHORT_565 }, 
-    { CL_RGB, CL_UNORM_SHORT_555 },
-    { CL_RGB, CL_UNORM_INT_101010 }, 
-    { CL_RGBx, CL_UNORM_SHORT_565 }, 
-    { CL_RGBx, CL_UNORM_SHORT_555 },
-    { CL_RGBx, CL_UNORM_INT_101010 }, 
     { CL_ARGB, CL_SNORM_INT8 },
     { CL_ARGB, CL_UNORM_INT8 },
     { CL_ARGB, CL_SIGNED_INT8 },
@@ -241,8 +138,10 @@ pocl_basic_build_hash (cl_device_id device)
   return res;
 }
 
+static cl_device_partition_property basic_partition_properties[1] = { 0 };
+
 void
-pocl_basic_init_device_infos(struct _cl_device_id* dev)
+pocl_basic_init_device_infos(unsigned j, struct _cl_device_id* dev)
 {
   dev->type = CL_DEVICE_TYPE_CPU;
   dev->vendor_id = 0;
@@ -257,10 +156,13 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
     the SIMD lanes times the vector units, but not more than
     that to avoid stack overflow and cache trashing.
   */
-  dev->max_work_item_sizes[0] = dev->max_work_item_sizes[1] =
-	  dev->max_work_item_sizes[2] = dev->max_work_group_size = 1024*4;
+  dev->max_work_item_sizes[0] = dev->max_work_item_sizes[1]
+      = dev->max_work_item_sizes[2] = dev->max_work_group_size = 1024 * 4;
 
   dev->preferred_wg_size_multiple = 8;
+#ifdef OCS_AVAILABLE
+  cpu_setup_vector_widths (dev);
+#else
   dev->preferred_vector_width_char = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR;
   dev->preferred_vector_width_short = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT;
   dev->preferred_vector_width_int = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT;
@@ -269,13 +171,15 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
   dev->preferred_vector_width_double = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE;
   dev->preferred_vector_width_half = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF;
   /* TODO: figure out what the difference between preferred and native widths are */
-  dev->native_vector_width_char = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR;
-  dev->native_vector_width_short = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT;
-  dev->native_vector_width_int = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT;
-  dev->native_vector_width_long = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG;
-  dev->native_vector_width_float = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT;
-  dev->native_vector_width_double = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE;
-  dev->native_vector_width_half = POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF;
+  dev->native_vector_width_char = POCL_DEVICES_NATIVE_VECTOR_WIDTH_CHAR;
+  dev->native_vector_width_short = POCL_DEVICES_NATIVE_VECTOR_WIDTH_SHORT;
+  dev->native_vector_width_int = POCL_DEVICES_NATIVE_VECTOR_WIDTH_INT;
+  dev->native_vector_width_long = POCL_DEVICES_NATIVE_VECTOR_WIDTH_LONG;
+  dev->native_vector_width_float = POCL_DEVICES_NATIVE_VECTOR_WIDTH_FLOAT;
+  dev->native_vector_width_double = POCL_DEVICES_NATIVE_VECTOR_WIDTH_DOUBLE;
+  dev->native_vector_width_half = POCL_DEVICES_NATIVE_VECTOR_WIDTH_HALF;
+#endif
+
   dev->max_clock_frequency = 0;
   dev->address_bits = POCL_DEVICE_ADDRESS_BITS;
   dev->image_support = CL_TRUE;
@@ -295,7 +199,29 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
   dev->mem_base_addr_align = MAX_EXTENDED_ALIGNMENT*8; // this is in bits
   dev->half_fp_config = 0;
   dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN;
-  dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_INF_NAN;
+#ifdef __x86_64__
+  dev->single_fp_config |= (CL_FP_DENORM | CL_FP_ROUND_TO_INF
+                            | CL_FP_ROUND_TO_ZERO
+                            | CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT);
+#ifdef OCS_AVAILABLE
+  if (cpu_has_fma())
+    dev->single_fp_config |= CL_FP_FMA;
+#endif
+#endif
+
+#ifdef _CL_DISABLE_DOUBLE
+  dev->double_fp_config = 0;
+#else
+  /* TODO: all of these are the minimum mandated, but not all CPUs may actually
+   * support all of them. */
+  dev->double_fp_config = CL_FP_FMA | CL_FP_ROUND_TO_NEAREST
+                          | CL_FP_ROUND_TO_ZERO | CL_FP_ROUND_TO_INF
+                          | CL_FP_INF_NAN | CL_FP_DENORM;
+  /* this is a workaround for issue 28 in https://github.com/Oblomov/clinfo
+   * https://github.com/Oblomov/clinfo/issues/28 */
+  dev->double_fp_config |= CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT;
+#endif
+
   dev->global_mem_cache_type = CL_NONE;
   dev->global_mem_cacheline_size = 0;
   dev->global_mem_cache_size = 0;
@@ -313,15 +239,23 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
   dev->available = CL_TRUE;
   dev->compiler_available = CL_TRUE;
   dev->spmd = CL_FALSE;
+  dev->workgroup_pass = CL_TRUE;
   dev->execution_capabilities = CL_EXEC_KERNEL | CL_EXEC_NATIVE_KERNEL;
   dev->platform = 0;
 
   dev->parent_device = NULL;
-  // basic does not support partitioning
+  /* These two are only used for subdevices.
+   * Each subdevice has these two setup when created.
+   * The subdevice will then use these CUs:
+   *  [start, start+1, ..., start+count-1]
+   * this may not work with more complicated partitioning schemes,
+   * but is good enough for now. */
+  dev->core_start = 0;
+  dev->core_count = 0;
+  /* basic does not support partitioning */
   dev->max_sub_devices = 1;
   dev->num_partition_properties = 1;
-  dev->partition_properties = calloc(dev->num_partition_properties,
-    sizeof(cl_device_partition_property));
+  dev->partition_properties = basic_partition_properties;
   dev->num_partition_types = 0;
   dev->partition_type = NULL;
 
@@ -362,12 +296,7 @@ pocl_basic_init_device_infos(struct _cl_device_id* dev)
 #ifdef OCS_AVAILABLE
 
   dev->llvm_target_triplet = OCL_KERNEL_TARGET;
-
-#ifdef POCL_BUILT_WITH_CMAKE
   dev->llvm_cpu = get_cpu_name();
-#else
-  dev->llvm_cpu = OCL_KERNEL_TARGET_CPU;
-#endif
 
 #else
   dev->llvm_cpu = NULL;
@@ -392,13 +321,14 @@ pocl_basic_probe(struct pocl_device_ops *ops)
 
 
 
-void
-pocl_basic_init (cl_device_id device, const char* parameters)
+cl_int
+pocl_basic_init (unsigned j, cl_device_id device, const char* parameters)
 {
   struct data *d;
+  cl_int ret = CL_SUCCESS;
+  int err;
   static int first_basic_init = 1;
-  static int device_number = 0;
-  
+
   if (first_basic_init)
     {
       pocl_init_dlhandle_cache();
@@ -407,7 +337,9 @@ pocl_basic_init (cl_device_id device, const char* parameters)
   device->global_mem_id = 0;
 
   d = (struct data *) calloc (1, sizeof (struct data));
-  
+  if (d == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
+
   d->current_kernel = NULL;
   d->current_dlhandle = 0;
   device->data = d;
@@ -417,7 +349,10 @@ pocl_basic_init (cl_device_id device, const char* parameters)
      initialize global_mem_size which it is not yet. Just put 
      a nonzero there for now. */
   device->global_mem_size = 1;
-  pocl_topology_detect_device_info(device);
+  err = pocl_topology_detect_device_info(device);
+  if (err)
+    ret = CL_INVALID_DEVICE;
+
   POCL_INIT_LOCK (d->cq_lock);
   pocl_cpuinfo_detect_device_info(device);
   pocl_set_buffer_image_limits(device);
@@ -429,8 +364,7 @@ pocl_basic_init (cl_device_id device, const char* parameters)
     device->vendor_id =
       magic[0] | magic[1] << 8 | magic[2] << 16 | magic[3] << 24;
 
-  device->vendor_id += device_number;
-  device_number++;
+  device->vendor_id += j;
 
   /* The basic driver represents only one "compute unit" as
      it doesn't exploit multiple hardware threads. Multiple
@@ -446,6 +380,7 @@ pocl_basic_init (cl_device_id device, const char* parameters)
   device->has_64bit_long=0;
   #endif
 
+  return ret;
 }
 
 cl_int
@@ -454,8 +389,7 @@ pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void* host_ptr)
   void *b = NULL;
   cl_mem_flags flags = mem_obj->flags;
   unsigned i;
-  POCL_MSG_PRINT_INFO("BASIC: alloc_mem_obj, mem %p, dev %d\n", 
-                      mem_obj, device->dev_id);
+  POCL_MSG_PRINT_MEMORY (" mem %p, dev %d\n", mem_obj, device->dev_id);
   /* check if some driver has already allocated memory for this mem_obj 
      in our global address space, and use that*/
   for (i = 0; i < mem_obj->context->num_devices; ++i)
@@ -468,7 +402,9 @@ pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void* host_ptr)
         {
           mem_obj->device_ptrs[device->dev_id].mem_ptr =
             mem_obj->device_ptrs[i].mem_ptr;
-          POCL_MSG_PRINT_INFO("BASIC: alloc_mem_obj %p dev %d, using already allocated mem\n", mem_obj, device->dev_id);
+          POCL_MSG_PRINT_MEMORY (
+              "mem %p dev %d, using already allocated mem\n", mem_obj,
+              device->dev_id);
           return CL_SUCCESS;
         }
     }
@@ -482,6 +418,7 @@ pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void* host_ptr)
     }
   else
     {
+      POCL_MSG_PRINT_MEMORY ("!USE_HOST_PTR\n");
       b = pocl_memalign_alloc_global_mem (device, MAX_EXTENDED_ALIGNMENT,
                                           mem_obj->size);
       if (b==NULL)
@@ -491,11 +428,12 @@ pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void* host_ptr)
     }
 
   /* use this dev mem allocation as host ptr */
-  if (flags & CL_MEM_ALLOC_HOST_PTR && (mem_obj->mem_host_ptr == NULL))
+  if ((flags & CL_MEM_ALLOC_HOST_PTR) && (mem_obj->mem_host_ptr == NULL))
     mem_obj->mem_host_ptr = b;
 
   if (flags & CL_MEM_COPY_HOST_PTR)
     {
+      POCL_MSG_PRINT_MEMORY ("COPY_HOST_PTR\n");
       // mem_host_ptr must be non-NULL
       assert(host_ptr != NULL);
       memcpy (b, host_ptr, mem_obj->size);
@@ -636,7 +574,12 @@ pocl_basic_run
   pc->local_size[0] = cmd->command.run.local_x;
   pc->local_size[1] = cmd->command.run.local_y;
   pc->local_size[2] = cmd->command.run.local_z;
-  
+
+  unsigned rm = pocl_save_rm ();
+  pocl_set_default_rm ();
+  unsigned ftz = pocl_save_ftz ();
+  pocl_set_ftz (kernel->program->flush_denorms);
+
   for (z = 0; z < pc->num_groups[2]; ++z)
     {
       for (y = 0; y < pc->num_groups[1]; ++y)
@@ -652,6 +595,10 @@ pocl_basic_run
             }
         }
     }
+
+  pocl_restore_rm (rm);
+  pocl_restore_ftz (ftz);
+
   for (i = 0; i < kernel->num_args; ++i)
     {
       if (kernel->arg_info[i].is_local)
@@ -860,7 +807,7 @@ void pocl_basic_memfill(void *ptr,
       break;
     case 16:
       {
-      uint64_t * p = (uint64_t*)ptr + offset;
+      uint64_t * p = (uint64_t*)ptr + (offset << 1);
       for (i = 0; i < size; i++)
         for (j = 0; j < 2; j++)
           p[(i<<1) + j] = *((uint64_t*)pattern + j);
@@ -868,7 +815,7 @@ void pocl_basic_memfill(void *ptr,
       break;
     case 32:
       {
-      uint64_t * p = (uint64_t*)ptr + offset;
+      uint64_t * p = (uint64_t*)ptr + (offset << 2);
       for (i = 0; i < size; i++)
         for (j = 0; j < 4; j++)
           p[(i<<2) + j] = *((uint64_t*)pattern + j);
@@ -876,7 +823,7 @@ void pocl_basic_memfill(void *ptr,
       break;
     case 64:
       {
-      uint64_t * p = (uint64_t*)ptr + offset;
+      uint64_t * p = (uint64_t*)ptr + (offset << 3);
       for (i = 0; i < size; i++)
         for (j = 0; j < 8; j++)
           p[(i<<3) + j] = *((uint64_t*)pattern + j);
@@ -884,7 +831,7 @@ void pocl_basic_memfill(void *ptr,
       break;
     case 128:
       {
-      uint64_t * p = (uint64_t*)ptr + offset;
+      uint64_t * p = (uint64_t*)ptr + (offset << 4);
       for (i = 0; i < size; i++)
         for (j = 0; j < 16; j++)
           p[(i<<4) + j] = *((uint64_t*)pattern + j);
@@ -901,9 +848,17 @@ pocl_basic_map_mem (void *data, void *buf_ptr,
                       size_t offset, size_t size,
                       void *host_ptr) 
 {
-  /* All global pointers of the pthread/CPU device are in 
-     the host address space already, and up to date. */
-  if (host_ptr != NULL) return host_ptr;
+  /* If the buffer was allocated with CL_MEM_ALLOC_HOST_PTR |
+   * CL_MEM_COPY_HOST_PTR,
+   * the host_ptr is not the same memory as pocl's device_ptrs[], and we need
+   * to copy pocl's buffer content back to host_ptr. */
+  if ((host_ptr != NULL) && (host_ptr != (buf_ptr + offset)))
+    {
+      POCL_MSG_PRINT_MEMORY ("device: MAP memcpy() "
+                             "buf_ptr %p + offset %zu to host_ptr %p\n",
+                             buf_ptr, offset, host_ptr);
+      memcpy ((char *)host_ptr, (char *)buf_ptr + offset, size);
+    }
   return (char*)buf_ptr + offset;
 }
 
@@ -911,7 +866,18 @@ void* pocl_basic_unmap_mem(void *data, void *host_ptr,
                            void *device_start_ptr,
                            size_t offset, size_t size)
 {
-  return host_ptr;
+  /* If the buffer was allocated with CL_MEM_ALLOC_HOST_PTR |
+   * CL_MEM_COPY_HOST_PTR,
+   * the host_ptr is not the same memory as pocl's device_ptrs[], and we need
+   * to copy host_ptr content back to  pocl's device_ptrs[]. */
+  if ((host_ptr != NULL) && (host_ptr != (device_start_ptr + offset)))
+    {
+      POCL_MSG_PRINT_MEMORY ("device: UNMAP memcpy() "
+                             "host_ptr %p to buf_ptr %p + offset %zu\n",
+                             host_ptr, device_start_ptr, offset);
+      memcpy ((char *)device_start_ptr + offset, (char *)host_ptr, size);
+    }
+  return (char *)host_ptr;
 }
 
 
@@ -951,15 +917,13 @@ static void basic_command_scheduler (struct data *d)
   while ((node = d->ready_list))
     {
       assert (pocl_command_is_ready(node->event));
-      CDL_DELETE (d->ready_list, node);
-
-      
-      pthread_mutex_unlock (&d->cq_lock);
       assert (node->event->status == CL_SUBMITTED);
+      CDL_DELETE (d->ready_list, node);
+      POCL_UNLOCK (d->cq_lock);
       pocl_exec_command(node);
-      pthread_mutex_lock (&d->cq_lock);
+      POCL_LOCK (d->cq_lock);
     }
-    
+
   return;
 }
 
@@ -967,15 +931,16 @@ void
 pocl_basic_submit (_cl_command_node *node, cl_command_queue cq)
 {
   struct data *d = node->device->data;
-  cl_event *event = &(node->event);
   
   node->device->ops->compile_kernel (node, NULL, NULL);
+
+  POCL_LOCK_OBJ (node->event);
+  node->ready = 1;
   POCL_LOCK (d->cq_lock);
-  POCL_UPDATE_EVENT_SUBMITTED(event);
   pocl_command_push(node, &d->ready_list, &d->command_list);
-  
-  basic_command_scheduler (d);
+  POCL_UNLOCK_OBJ (node->event);
 
+  basic_command_scheduler (d);
   POCL_UNLOCK (d->cq_lock);
 
   return;
@@ -989,16 +954,7 @@ void pocl_basic_flush (cl_device_id device, cl_command_queue cq)
   basic_command_scheduler (d);
   POCL_UNLOCK (d->cq_lock);
 }
-/*
-static void
-pocl_basic_push_command (_cl_command_node *node)
-{
-  struct data *d = (struct data*)node->device->data;
-
-  pocl_command_push(node, &d->ready_list, &d->command_list);
 
-}
-*/
 void
 pocl_basic_join(cl_device_id device, cl_command_queue cq)
 {
@@ -1012,18 +968,25 @@ pocl_basic_join(cl_device_id device, cl_command_queue cq)
 }
 
 void
-pocl_basic_notify (cl_device_id device, cl_event event)
+pocl_basic_notify (cl_device_id device, cl_event event, cl_event finished)
 {
   struct data *d = (struct data*)device->data;
   _cl_command_node * volatile node = event->command;
-  
-  POCL_LOCK_OBJ (event);
-  if (!(node->ready) && pocl_command_is_ready(node->event))
+
+  if (finished->status < CL_COMPLETE)
+    {
+      POCL_UPDATE_EVENT_FAILED (event);
+      return;
+    }
+
+  if (!node->ready)
+    return;
+
+  if (pocl_command_is_ready (event))
     {
-      node->ready = 1;
-      POCL_UNLOCK_OBJ (event);
-      if (node->event->status == CL_SUBMITTED)
+      if (event->status == CL_QUEUED)
         {
+          POCL_UPDATE_EVENT_SUBMITTED (event);
           POCL_LOCK (d->cq_lock);
           CDL_DELETE (d->command_list, node);
           CDL_PREPEND (d->ready_list, node);
@@ -1032,7 +995,6 @@ pocl_basic_notify (cl_device_id device, cl_event event)
         }
       return;
     }
-  POCL_UNLOCK_OBJ (event);
 }
 
 void
diff --git a/lib/CL/devices/common.c b/lib/CL/devices/common.c
index 6708f0b..8c01191 100644
--- a/lib/CL/devices/common.c
+++ b/lib/CL/devices/common.c
@@ -22,14 +22,17 @@
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.
 */
+
+#define _GNU_SOURCE
+
 #include "common.h"
 #include "pocl_shared.h"
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <utlist.h>
-#include <assert.h>
 
 #ifndef _MSC_VER
 #  include <sys/time.h>
@@ -40,14 +43,15 @@
 #endif
 
 #include "config.h"
-#include "pocl_image_util.h"
-#include "pocl_file_util.h"
-#include "pocl_util.h"
-#include "pocl_cache.h"
+#include "config2.h"
 #include "devices.h"
+#include "pocl_cache.h"
+#include "pocl_debug.h"
+#include "pocl_file_util.h"
+#include "pocl_image_util.h"
 #include "pocl_mem_management.h"
 #include "pocl_runtime_config.h"
-#include "pocl_debug.h"
+#include "pocl_util.h"
 
 #ifdef OCS_AVAILABLE
 #include "pocl_llvm.h"
@@ -55,7 +59,8 @@
 
 #include "_kernel_constants.h"
 
-#define COMMAND_LENGTH 2048
+
+#define WORKGROUP_STRING_LENGTH 1024
 
 /**
  * Generate code from the final bitcode using the LLVM
@@ -69,78 +74,159 @@
 
 #ifdef OCS_AVAILABLE
 char*
-llvm_codegen (const char* tmpdir, cl_kernel kernel, cl_device_id device) {
+llvm_codegen (const char* tmpdir, cl_kernel kernel, cl_device_id device,
+              size_t local_x, size_t local_y, size_t local_z)
+{
+  int error = 0;
+  void *write_lock = NULL;
+  void *llvm_module = NULL;
+
+  char tmp_module[POCL_FILENAME_LENGTH];
+
+  char objfile_path[POCL_FILENAME_LENGTH];
+  char *objfile = NULL;
+  size_t objfile_size = 0;
+
+  cl_program program = kernel->program;
 
-  char command[COMMAND_LENGTH];
-  char bytecode[POCL_FILENAME_LENGTH];
-  char objfile[POCL_FILENAME_LENGTH];
-  /* strlen of / .so 4+1 */
-  int file_name_alloc_size = 
-    min(POCL_FILENAME_LENGTH, strlen(tmpdir) + strlen(kernel->name) + 5);
-  char* module = (char*) malloc(file_name_alloc_size); 
-  /* To avoid corrupted .so files, create a tmp file first
-     and then rename it. */
-  char tmp_module[file_name_alloc_size + 4]; /* .tmp postfix */
-  int error;
+  int device_i = pocl_cl_device_to_index (program, device);
+  assert (device_i >= 0);
 
-  error = snprintf(module, POCL_FILENAME_LENGTH,
-                   "%s/%s.so", tmpdir, kernel->name);
+  /* $/parallel.bc */
+  char parallel_bc_path[POCL_FILENAME_LENGTH];
+  pocl_cache_work_group_function_path (parallel_bc_path, program, device_i,
+                                       kernel, local_x, local_y, local_z);
 
-  assert (error >= 0);
+  /* $/kernel.so */
+  char final_binary_path[POCL_FILENAME_LENGTH];
+  pocl_cache_final_binary_path (final_binary_path, program, device_i, kernel,
+                                local_x, local_y, local_z);
 
-  error = snprintf(objfile, POCL_FILENAME_LENGTH,
-                   "%s/%s.so.o", tmpdir, kernel->name);
-  assert (error >= 0);
+  if (pocl_exists (final_binary_path))
+    goto FINISH;
 
-  if (pocl_exists(module))
-    return module;
+  /* $/kernel.so.o */
+  assert (strlen (final_binary_path) < (POCL_FILENAME_LENGTH - 3));
+  strcpy (objfile_path, final_binary_path);
+  strcat (objfile_path, ".o");
 
-  memcpy (tmp_module, module, file_name_alloc_size);
-  strcat (tmp_module, ".tmp");
+  error = pocl_llvm_generate_workgroup_function_nowrite (
+      device, kernel, local_x, local_y, local_z, &llvm_module);
+  if (error)
+    {
+      POCL_MSG_PRINT_GENERAL ("pocl_llvm_generate_workgroup_function() failed"
+                              " for kernel %s\n",
+                              kernel->name);
+      goto FINISH;
+    }
+  assert (llvm_module != NULL);
 
-  void* write_lock = pocl_cache_acquire_writer_lock(kernel->program, device);
-  assert(write_lock);
+  /* may happen if another thread is building the same program & wins
+   * the llvm lock. */
+  if (pocl_exists (final_binary_path))
+    goto FINISH;
+
+  error = pocl_llvm_codegen (kernel, device, llvm_module, &objfile,
+                             &objfile_size);
+  if (error)
+    {
+      POCL_MSG_PRINT_GENERAL ("pocl_llvm_codegen() failed"
+                              " for kernel %s\n",
+                              kernel->name);
+      goto FINISH;
+    }
+
+  if (pocl_exists (final_binary_path))
+    goto FINISH;
+
+  /**************************************************************************/
+  write_lock = pocl_cache_acquire_writer_lock (kernel->program, device);
+  assert (write_lock);
+
+  /* write parallel.bc only if we want to leave compiler files*/
+  if (pocl_get_bool_option ("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES", 0))
+    {
+      POCL_MSG_PRINT_LLVM ("Writing parallel.bc to %s.\n", parallel_bc_path);
+      error = pocl_cache_write_kernel_parallel_bc (
+          llvm_module, program, device_i, kernel, local_x, local_y, local_z);
+    }
+  else
+    {
+      char kernel_parallel_path[POCL_FILENAME_LENGTH];
+      pocl_cache_kernel_cachedir_path (kernel_parallel_path, program, device_i,
+                                       kernel, "", local_x, local_y, local_z);
+      error = pocl_mkdir_p (kernel_parallel_path);
+    }
+  if (error)
+    {
+      POCL_MSG_PRINT_GENERAL ("writing parallel.bc failed"
+                              " for kernel %s\n",
+                              kernel->name);
+      goto FINISH;
+    }
 
-  error = snprintf (bytecode, POCL_FILENAME_LENGTH,
-		    "%s%s", tmpdir, POCL_PARALLEL_BC_FILENAME);
-  assert (error >= 0);
+  /* write krenel.so.o always, required for linking step. */
+  POCL_MSG_PRINT_LLVM ("Writing code gen output to %s.\n", objfile_path);
+  error = pocl_write_file (objfile_path, objfile, objfile_size, 0, 0);
+  if (error)
+    {
+      POCL_MSG_PRINT_GENERAL ("writing kernel.so.o failed"
+                              " for kernel %s\n",
+                              kernel->name);
+      goto FINISH;
+    }
+  else
+    {
+      POCL_MSG_PRINT_GENERAL ("written kernel.so.o size %zu\n", objfile_size);
+    }
 
-  error = pocl_llvm_codegen( kernel, device, bytecode, objfile);
-  assert (error == 0);
+  /* create a temporary filename */
+  pocl_cache_tempname (tmp_module, ".so", NULL);
+  assert (pocl_exists (tmp_module) > 0);
 
-  /* clang is used as the linker driver in LINK_CMD */
-  error = snprintf (command, COMMAND_LENGTH,
+  /* clang is used as the linker driver for ANDROID, otherwise GNU ld is used
+   */
+  POCL_MSG_PRINT_INFO ("Linking final module\n");
+  char *const args1[]
 #ifndef POCL_ANDROID
-#ifdef OCS_AVAILABLE
-                    CLANGXX " " HOST_CLANG_FLAGS " " HOST_LD_FLAGS " -o %s %s",
+      = { LINK_COMMAND,
+          "-o",
+          tmp_module,
+          objfile_path,
+          HOST_LD_FLAGS_ARRAY,
+          NULL };
 #else
-                    LINK_COMMAND " " HOST_LD_FLAGS " -o %s %s",
+      = { POCL_ANDROID_PREFIX "/bin/ld",
+          "-o",
+          tmp_module,
+          objfile_path,
+          HOST_LD_FLAGS_ARRAY,
+          NULL };
 #endif
-#else
-                    POCL_ANDROID_PREFIX"/bin/ld " HOST_LD_FLAGS " -o %s %s ",
-#endif
-                    tmp_module, objfile);
-  assert (error >= 0);
-
-  POCL_MSG_PRINT_INFO ("executing [%s]\n", command);
-  error = system (command);
-  assert (error == 0);
+  error = pocl_run_command (args1);
+  if (error)
+    goto FINISH;
 
-  error = snprintf (command, COMMAND_LENGTH, "mv %s %s", tmp_module, module);
-  assert (error >= 0);
-  error = system (command);
-  assert (error == 0);
+  error = pocl_rename (tmp_module, final_binary_path);
+  if (error)
+    goto FINISH;
 
   /* Save space in kernel cache */
   if (!pocl_get_bool_option("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES", 0))
     {
-      pocl_remove(objfile);
-      pocl_remove(bytecode);
+      pocl_remove (objfile_path);
     }
 
-  pocl_cache_release_lock(write_lock);
 
-  return module;
+FINISH:
+  pocl_cache_release_lock (write_lock);
+  pocl_destroy_llvm_module (llvm_module);
+  POCL_MEM_FREE (objfile);
+
+  if (error)
+    return NULL;
+  else
+    return strdup (final_binary_path);
 }
 #endif
 
@@ -154,17 +240,20 @@ fill_dev_image_t (dev_image_t* di, struct pocl_argument* parg,
                   cl_device_id device)
 {
   cl_mem mem = *(cl_mem *)parg->value;
-  di->data = (mem->device_ptrs[device->dev_id].mem_ptr);
   di->width = mem->image_width;
   di->height = mem->image_height;
   di->depth = mem->image_depth;
   di->row_pitch = mem->image_row_pitch;
   di->slice_pitch = mem->image_slice_pitch;
   di->order = mem->image_channel_order;
+  di->image_array_size = mem->image_array_size;
   di->data_type = mem->image_channel_data_type;
   pocl_get_image_information (mem->image_channel_order,
                               mem->image_channel_data_type, &(di->num_channels),
                               &(di->elem_size));
+
+  HANDLE_IMAGE1D_BUFFER (mem);
+  di->data = (mem->device_ptrs[device->dev_id].mem_ptr);
 }
 
 void
@@ -268,6 +357,7 @@ pocl_mem_objs_cleanup (cl_event event)
   event->mem_objs = NULL;
 }
 
+static const size_t zero_origin[] = { 0, 0, 0 };
 /**
  * executes given command.
  */
@@ -276,7 +366,7 @@ pocl_exec_command (_cl_command_node * volatile node)
 {
   unsigned i;
   /* because of POCL_UPDATE_EVENT_ */
-  cl_event *event = &(node->event);
+  cl_event event = node->event;
   switch (node->type)
     {
     case CL_COMMAND_READ_BUFFER:
@@ -286,9 +376,8 @@ pocl_exec_command (_cl_command_node * volatile node)
          node->command.read.host_ptr, 
          node->command.read.device_ptr,
          node->command.read.offset,
-         node->command.read.cb); 
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Read Buffer           ");
+         node->command.read.cb);
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Read Buffer           ");
       break;
     case CL_COMMAND_WRITE_BUFFER:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -298,8 +387,7 @@ pocl_exec_command (_cl_command_node * volatile node)
          node->command.write.device_ptr,
          node->command.write.offset, 
          node->command.write.cb);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Write Buffer          ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Write Buffer          ");
       break;
     case CL_COMMAND_COPY_BUFFER:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -310,37 +398,43 @@ pocl_exec_command (_cl_command_node * volatile node)
                             node->command.copy.src_buffer,
                             node->command.copy.src_offset, 
                             node->command.copy.cb);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Copy Buffer           ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Copy Buffer           ");
       break;
     case CL_COMMAND_MIGRATE_MEM_OBJECTS:
       POCL_UPDATE_EVENT_RUNNING(event);
       pocl_migrate_mem_objects (node);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Migrate Buffer        ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Migrate Buffer        ");
       break;
     case CL_COMMAND_MAP_IMAGE:
     case CL_COMMAND_MAP_BUFFER: 
       POCL_UPDATE_EVENT_RUNNING(event);
-      pocl_map_mem_cmd (node->device, node->command.map.buffer, 
-                        node->command.map.mapping);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Map Image/Buffer      ");
+      POCL_LOCK_OBJ (node->command.map.buffer);
+      if (node->device->ops->map_mem != NULL)
+        node->device->ops->map_mem (node->device->data,
+                                    (node->command.map.buffer)
+                                        ->device_ptrs[node->device->dev_id]
+                                        .mem_ptr,
+                                    (node->command.map.mapping)->offset,
+                                    (node->command.map.mapping)->size,
+                                    (node->command.map.mapping)->host_ptr);
+      (node->command.map.buffer)->map_count++;
+      POCL_UNLOCK_OBJ (node->command.map.buffer);
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Map Image/Buffer      ");
       break;
     case CL_COMMAND_WRITE_IMAGE:
       POCL_UPDATE_EVENT_RUNNING(event);
-      node->device->ops->write_rect
-        (node->device->data,
+      node->device->ops->write_rect (
+         node->device->data,
          node->command.write_image.host_ptr,
          node->command.write_image.device_ptr,
          node->command.write_image.origin,
-         node->command.write_image.origin,
+         zero_origin,
          node->command.write_image.region,
          node->command.write_image.b_rowpitch,
          node->command.write_image.b_slicepitch,
-         node->command.write_image.b_rowpitch,
-         node->command.write_image.b_slicepitch);
-      POCL_UPDATE_EVENT_COMPLETE(event);
+         node->command.write_image.h_rowpitch,
+         node->command.write_image.h_slicepitch);
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Write Image           ");
       break;
     case CL_COMMAND_WRITE_BUFFER_RECT:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -355,23 +449,22 @@ pocl_exec_command (_cl_command_node * volatile node)
          node->command.write_image.b_slicepitch,
          node->command.write_image.h_rowpitch,
          node->command.write_image.h_slicepitch);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Write Image           ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Write Image           ");
       break;
     case CL_COMMAND_READ_IMAGE:
       POCL_UPDATE_EVENT_RUNNING(event);
-      node->device->ops->read_rect
-        (node->device->data, node->command.read_image.host_ptr,
+      node->device->ops->read_rect (
+         node->device->data,
+         node->command.read_image.host_ptr,
          node->command.read_image.device_ptr,
          node->command.read_image.origin,
-         node->command.read_image.origin,
+         zero_origin,
          node->command.read_image.region,
          node->command.read_image.b_rowpitch,
          node->command.read_image.b_slicepitch,
-         node->command.read_image.b_rowpitch,
-         node->command.read_image.b_slicepitch);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Read Image            ");
+         node->command.read_image.h_rowpitch,
+         node->command.read_image.h_slicepitch);
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Read Image            ");
       break;
     case CL_COMMAND_READ_BUFFER_RECT:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -385,8 +478,7 @@ pocl_exec_command (_cl_command_node * volatile node)
          node->command.read_image.b_slicepitch,
          node->command.read_image.h_rowpitch,
          node->command.read_image.h_slicepitch);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Read Buffer Rect      ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Read Buffer Rect      ");
       break;
     case CL_COMMAND_COPY_BUFFER_RECT:
     case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
@@ -456,52 +548,44 @@ pocl_exec_command (_cl_command_node * volatile node)
              tmp_slicepitch);
           free (tmp);
         }
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Copy Buffer Rect      ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Copy Buffer Rect      ");
       break;
     case CL_COMMAND_UNMAP_MEM_OBJECT:
       POCL_UPDATE_EVENT_RUNNING(event);
-      if ((node->command.unmap.memobj)->flags & 
-          (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR))
-        {
-          /* TODO: should we ensure the device global region is updated from
-             the host memory? How does the specs define it,
-             can the host_ptr be assumed to point to the host and the
-             device accessible memory or just point there until the
-             kernel(s) get executed or similar? */
-          /* Assume the region is automatically up to date. */
-        } else 
-        {
-          if (node->device->ops->unmap_mem != NULL)        
-            node->device->ops->unmap_mem
-              (node->device->data, 
-               (node->command.unmap.mapping)->host_ptr, 
-               (node->command.unmap.memobj)->device_ptrs[node->device->dev_id].mem_ptr, 
-               (node->command.unmap.mapping)->offset,
-               (node->command.unmap.mapping)->size);
-        }
+      /* TODO: should we ensure the device global region is updated from
+         the host memory? How does the specs define it,
+         can the host_ptr be assumed to point to the host and the
+         device accessible memory or just point there until the
+         kernel(s) get executed or similar? */
+      /* Assume the region is automatically up to date. */
       POCL_LOCK_OBJ (node->command.unmap.memobj);
+      if (node->device->ops->unmap_mem != NULL)
+        node->device->ops->unmap_mem (node->device->data,
+                                      (node->command.unmap.mapping)->host_ptr,
+                                      (node->command.unmap.memobj)
+                                          ->device_ptrs[node->device->dev_id]
+                                          .mem_ptr,
+                                      (node->command.unmap.mapping)->offset,
+                                      (node->command.unmap.mapping)->size);
       DL_DELETE((node->command.unmap.memobj)->mappings, 
                 node->command.unmap.mapping);
       (node->command.unmap.memobj)->map_count--;
+      POCL_MEM_FREE (node->command.unmap.mapping);
       POCL_UNLOCK_OBJ (node->command.unmap.memobj);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Unmap Mem obj         ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Unmap Mem obj         ");
       break;
     case CL_COMMAND_NDRANGE_KERNEL:
       POCL_UPDATE_EVENT_RUNNING(event);
-      assert (*event == node->event);
+      assert (event == node->event);
       node->device->ops->run(node->command.run.data, node);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Enqueue NDRange       ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Enqueue NDRange       ");
       pocl_ndrange_node_cleanup(node);
       break;
     case CL_COMMAND_NATIVE_KERNEL:
       POCL_UPDATE_EVENT_RUNNING(event);
       node->device->ops->run_native(node->command.native.data, node);
       pocl_native_kernel_cleanup(node);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Native Kernel         ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Native Kernel         ");
       break;
     case CL_COMMAND_FILL_IMAGE:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -514,8 +598,7 @@ pocl_exec_command (_cl_command_node * volatile node)
          node->command.fill_image.slicepitch,
          node->command.fill_image.fill_pixel,
          node->command.fill_image.pixel_size);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Fill Image            ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Fill Image            ");
       free(node->command.fill_image.fill_pixel);
       break;
     case CL_COMMAND_FILL_BUFFER:
@@ -526,8 +609,7 @@ pocl_exec_command (_cl_command_node * volatile node)
          node->command.memfill.offset,
          node->command.memfill.pattern,
          node->command.memfill.pattern_size);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "Fill Buffer           ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "Fill Buffer           ");
       pocl_aligned_free(node->command.memfill.pattern);
       break;
     case CL_COMMAND_MARKER:
@@ -550,8 +632,7 @@ pocl_exec_command (_cl_command_node * volatile node)
         for (i=0; i < node->command.svm_free.num_svm_pointers; i++)
           node->device->ops->free_ptr(node->device,
                                       node->command.svm_free.svm_pointers[i]);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "SVM Free              ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "SVM Free              ");
       break;
     case CL_COMMAND_SVM_MAP:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -561,8 +642,7 @@ pocl_exec_command (_cl_command_node * volatile node)
         node->device->ops->map_mem
           (node->device->data, node->command.svm_map.svm_ptr,
            0, node->command.svm_map.size, NULL);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "SVM Map              ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "SVM Map              ");
       break;
     case CL_COMMAND_SVM_UNMAP:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -572,8 +652,7 @@ pocl_exec_command (_cl_command_node * volatile node)
         node->device->ops->unmap_mem
           (node->device->data, NULL,
            node->command.svm_unmap.svm_ptr, 0, 0);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "SVM Unmap             ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "SVM Unmap             ");
       break;
     case CL_COMMAND_SVM_MEMCPY:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -581,8 +660,7 @@ pocl_exec_command (_cl_command_node * volatile node)
                               node->command.svm_memcpy.src, 0,
                               node->command.svm_memcpy.dst, 0,
                               node->command.svm_memcpy.size);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "SVM Memcpy            ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "SVM Memcpy            ");
       break;
     case CL_COMMAND_SVM_MEMFILL:
       POCL_UPDATE_EVENT_RUNNING(event);
@@ -591,8 +669,7 @@ pocl_exec_command (_cl_command_node * volatile node)
                                  node->command.memfill.size, 0,
                                  node->command.memfill.pattern,
                                  node->command.memfill.pattern_size);
-      POCL_UPDATE_EVENT_COMPLETE(event);
-      POCL_DEBUG_EVENT_TIME(event, "SVM MemFill           ");
+      POCL_UPDATE_EVENT_COMPLETE_MSG (event, "SVM MemFill           ");
       pocl_aligned_free(node->command.memfill.pattern);
       break;
     default:
@@ -607,9 +684,13 @@ pocl_broadcast (cl_event brc_event)
 {
   event_node *target;
   event_node *tmp;
+
   while ((target = brc_event->notify_list))
     {
-      POCL_LOCK_OBJ (target->event);
+      if (brc_event->command_type == CL_COMMAND_USER)
+        POCL_LOCK_OBJ (target->event);
+      else
+        pocl_lock_events_inorder (brc_event, target->event);
       /* remove event from wait list */
       LL_FOREACH (target->event->wait_list, tmp)
         {
@@ -620,17 +701,20 @@ pocl_broadcast (cl_event brc_event)
               break;
             }
         }
-      if (target->event->status == CL_SUBMITTED)
-        {
+        if (brc_event->command_type == CL_COMMAND_USER)
           POCL_UNLOCK_OBJ (target->event);
-          target->event->command->device->ops->notify 
-            (target->event->command->device, target->event);
-        }
-      else 
-        POCL_UNLOCK_OBJ (target->event);
-      
-      LL_DELETE (brc_event->notify_list, target);
-      pocl_mem_manager_free_event_node (target);
+
+        if ((target->event->status == CL_SUBMITTED)
+            || (target->event->status == CL_QUEUED))
+          {
+            target->event->command->device->ops->notify (
+                target->event->command->device, target->event, brc_event);
+          }
+
+        LL_DELETE (brc_event->notify_list, target);
+        if (brc_event->command_type != CL_COMMAND_USER)
+          pocl_unlock_events_inorder (brc_event, target->event);
+        pocl_mem_manager_free_event_node (target);
     }
 }
 
@@ -641,13 +725,13 @@ pocl_broadcast (cl_event brc_event)
 void
 fill_dev_sampler_t (dev_sampler_t *ds, struct pocl_argument *parg)
 {
-  cl_sampler_t sampler = *(cl_sampler_t *)parg->value;
+  cl_sampler sampler = *(cl_sampler *)parg->value;
 
-  *ds = 0;
-  *ds |= sampler.normalized_coords == CL_TRUE ? CLK_NORMALIZED_COORDS_TRUE :
-      CLK_NORMALIZED_COORDS_FALSE;
+  *ds = (sampler->normalized_coords == CL_TRUE) ? CLK_NORMALIZED_COORDS_TRUE
+                                                : CLK_NORMALIZED_COORDS_FALSE;
 
-  switch (sampler.addressing_mode) {
+  switch (sampler->addressing_mode)
+    {
     case CL_ADDRESS_NONE:
       *ds |= CLK_ADDRESS_NONE; break;
     case CL_ADDRESS_CLAMP_TO_EDGE:
@@ -660,7 +744,8 @@ fill_dev_sampler_t (dev_sampler_t *ds, struct pocl_argument *parg)
       *ds |= CLK_ADDRESS_MIRRORED_REPEAT; break;
   }
 
-  switch (sampler.filter_mode) {
+  switch (sampler->filter_mode)
+    {
     case CL_FILTER_NEAREST:
       *ds |= CLK_FILTER_NEAREST; break;
     case CL_FILTER_LINEAR :
@@ -719,7 +804,7 @@ static int handle_count = 0;
 void
 pocl_check_dlhandle_cache (_cl_command_node *cmd)
 {
-  char workgroup_string[256];
+  char workgroup_string[WORKGROUP_STRING_LENGTH];
   pocl_dlhandle_cache_item *ci = NULL;
 
   POCL_LOCK (pocl_dlhandle_cache_lock);
@@ -770,7 +855,10 @@ pocl_check_dlhandle_cache (_cl_command_node *cmd)
       POCL_LOCK (pocl_llvm_codegen_lock);
       module_fn = (char *)llvm_codegen (cmd->command.run.tmp_dir,
                                         cmd->command.run.kernel,
-                                        cmd->device);
+                                        cmd->device,
+                                        cmd->command.run.local_x,
+                                        cmd->command.run.local_y,
+                                        cmd->command.run.local_z);
       POCL_UNLOCK (pocl_llvm_codegen_lock);
       POCL_MSG_PRINT_INFO("Using static WG size binary: %s\n", module_fn);
 #else
@@ -813,7 +901,8 @@ pocl_check_dlhandle_cache (_cl_command_node *cmd)
     }
   free(module_fn);
 
-  snprintf (workgroup_string, 256, "_pocl_launcher_%s_workgroup", 
+  snprintf (workgroup_string, WORKGROUP_STRING_LENGTH,
+            "_pocl_launcher_%s_workgroup",
             cmd->command.run.kernel->name);
 
   POCL_LOCK (pocl_dlhandle_lock);
@@ -829,44 +918,26 @@ pocl_check_dlhandle_cache (_cl_command_node *cmd)
   POCL_UNLOCK (pocl_dlhandle_cache_lock);
 }
 
-/*
-static void
-pocl_free_dlhandle (_cl_command_node *cmd)
-{
-  pocl_dlhandle_cache_item *ci = NULL;
-  POCL_LOCK (pocl_dlhandle_cache_lock);
-  DL_FOREACH (pocl_dlhandle_cache, ci)
-    {
-      if (strcmp (ci->tmp_dir, cmd->command.run.tmp_dir) == 0 &&
-          strcmp (ci->function_name, 
-                  cmd->command.run.kernel->name) == 0)
-        {
-          if ((--ci->ref_count))
-            break;
-          --handle_count;
-          DL_DELETE (pocl_dlhandle_cache, ci);
-          POCL_UNLOCK (pocl_dlhandle_cache_lock);
-          free (ci->tmp_dir);
-          free (ci->function_name);
-          POCL_LOCK (pocl_llvm_codegen_lock);
-          assert(!lt_dlclose (ci->dlhandle));
-          POCL_UNLOCK (pocl_llvm_codegen_lock);
-          free (ci);
-          return;
-        }
-    }
-  POCL_UNLOCK (pocl_dlhandle_cache_lock);
-}
-*/
 
 #define MIN_MAX_MEM_ALLOC_SIZE (128*1024*1024)
 
 /* accounting object for the main memory */
 static pocl_global_mem_t system_memory;
 
+static size_t
+next_larger_pow2 (size_t in)
+{
+  size_t out = 1;
+  while (in > out)
+    out <<= 1;
+  return out;
+}
+
 void
 pocl_setup_device_for_system_memory(cl_device_id device)
 {
+  int limit_memory_gb = pocl_get_int_option ("POCL_MEMORY_LIMIT", 0);
+
   /* set up system memory limits, if required */
   if (system_memory.total_alloc_limit == 0)
   {
@@ -877,7 +948,7 @@ pocl_setup_device_for_system_memory(cl_device_id device)
        */
       size_t alloc_limit = device->global_mem_size;
       if ((alloc_limit >> 20) > (7 << 10))
-        system_memory.total_alloc_limit = alloc_limit - (size_t)(1 << 31);
+        system_memory.total_alloc_limit = alloc_limit - (size_t)(1UL << 31);
       else
         {
           size_t temp = (alloc_limit >> 2);
@@ -889,6 +960,18 @@ pocl_setup_device_for_system_memory(cl_device_id device)
   }
 
   device->global_mem_size = system_memory.total_alloc_limit;
+
+  if (limit_memory_gb > 0)
+    {
+      size_t limited_memory = (size_t)limit_memory_gb << 30;
+      if (device->global_mem_size > limited_memory)
+        device->global_mem_size = limited_memory;
+      else
+        POCL_MSG_WARN ("requested POCL_MEMORY_LIMIT %i GBs is larger than "
+                       "physical memory size (%zu) GBs, ignoring\n",
+                       limit_memory_gb, (device->global_mem_size >> 30));
+    }
+
   if (device->global_mem_size < MIN_MAX_MEM_ALLOC_SIZE)
     POCL_ABORT("Not enough memory to run on this device.\n");
 
@@ -909,7 +992,9 @@ pocl_setup_device_for_system_memory(cl_device_id device)
     alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;
 
   if (alloc_limit > device->global_mem_size)
-    alloc_limit = device->global_mem_size;
+    alloc_limit = next_larger_pow2 (device->global_mem_size / 4);
+  if (alloc_limit > (device->global_mem_size / 2))
+    alloc_limit >>= 1;
 
   if (alloc_limit < MIN_MAX_MEM_ALLOC_SIZE)
     alloc_limit = MIN_MAX_MEM_ALLOC_SIZE;
@@ -929,9 +1014,13 @@ pocl_set_buffer_image_limits(cl_device_id device)
 {
   pocl_setup_device_for_system_memory(device);
   /* these aren't set up in pocl_setup_device_for_system_memory,
-   * because some devices (HSA) set them up themselves */
-  device->local_mem_size = device->max_constant_buffer_size =
-      device->max_mem_alloc_size;
+   * because some devices (HSA) set them up themselves
+   *
+   * it's max mem alloc / 4 because some programs (conformance test)
+   * try to allocate max size constant objects and run out of memory
+   * while trying to fill them. */
+  device->local_mem_size = device->max_constant_buffer_size
+      = next_larger_pow2 (device->global_mem_size / 256);
 
   /* We don't have hardware limitations on the buffer-backed image sizes,
    * so we set the maximum size in terms of the maximum amount of pixels
@@ -981,11 +1070,13 @@ pocl_memalign_alloc_global_mem(cl_device_id device, size_t align, size_t size)
   if (!ptr)
     return NULL;
 
+  POCL_LOCK_OBJ (mem);
   mem->currently_allocated += size;
   if (mem->max_ever_allocated < mem->currently_allocated)
     mem->max_ever_allocated = mem->currently_allocated;
-
   assert(mem->currently_allocated <= mem->total_alloc_limit);
+  POCL_UNLOCK_OBJ (mem);
+
   return ptr;
 }
 
@@ -994,8 +1085,10 @@ pocl_free_global_mem(cl_device_id device, void* ptr, size_t size)
 {
   pocl_global_mem_t *mem = device->global_memory;
 
+  POCL_LOCK_OBJ (mem);
   assert(mem->currently_allocated >= size);
   mem->currently_allocated -= size;
+  POCL_UNLOCK_OBJ (mem);
 
   POCL_MEM_FREE(ptr);
 }
@@ -1003,7 +1096,7 @@ pocl_free_global_mem(cl_device_id device, void* ptr, size_t size)
 void
 pocl_print_system_memory_stats()
 {
-  POCL_MSG_PRINT("MEM STATS:\n", "",
+  POCL_MSG_PRINT_F (MEMORY, INFO, "",
   "____ Total available system memory  : %10zu KB\n"
   " ____ Currently used system memory   : %10zu KB\n"
   " ____ Max used system memory         : %10zu KB\n",
diff --git a/lib/CL/devices/common.h b/lib/CL/devices/common.h
index 469fcef..03b5dd1 100644
--- a/lib/CL/devices/common.h
+++ b/lib/CL/devices/common.h
@@ -38,48 +38,19 @@
 
 #define SETUP_DEVICE_CL_VERSION(a, b) XSETUP_DEVICE_CL_VERSION(a, b)
 
+#define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR    1
+#define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT   1
+#define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT     1
+#define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG    1
+#define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT   1
+#define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE  1
+#define POCL_DEVICES_NATIVE_VECTOR_WIDTH_CHAR       1
+#define POCL_DEVICES_NATIVE_VECTOR_WIDTH_SHORT      1
+#define POCL_DEVICES_NATIVE_VECTOR_WIDTH_INT        1
+#define POCL_DEVICES_NATIVE_VECTOR_WIDTH_LONG       1
+#define POCL_DEVICES_NATIVE_VECTOR_WIDTH_FLOAT      1
+#define POCL_DEVICES_NATIVE_VECTOR_WIDTH_DOUBLE     1
 
-/* Determine preferred vector sizes */
-#if defined(__AVX__)
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR   16
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT   8
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT     4
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG    2
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT   4
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE  2
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_CHAR      16
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_SHORT      8
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_INT        4
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_LONG       2
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_FLOAT      8
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_DOUBLE     4
-#elif defined(__SSE2__)
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR   16
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT   8
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT     4
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG    2
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT   4
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE  2
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_CHAR      16
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_SHORT      8
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_INT        4
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_LONG       2
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_FLOAT      4
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_DOUBLE     2
-#else
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_CHAR    1
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT   1
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_INT     1
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_LONG    1
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_FLOAT   1
-#  define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_DOUBLE  1
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_CHAR       1
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_SHORT      1
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_INT        1
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_LONG       1
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_FLOAT      1
-#  define POCL_DEVICES_NATIVE_VECTOR_WIDTH_DOUBLE     1
-#endif
 /* Half is internally represented as short */
 #define POCL_DEVICES_PREFERRED_VECTOR_WIDTH_HALF POCL_DEVICES_PREFERRED_VECTOR_WIDTH_SHORT
 #define POCL_DEVICES_NATIVE_VECTOR_WIDTH_HALF POCL_DEVICES_NATIVE_VECTOR_WIDTH_SHORT
@@ -92,9 +63,8 @@ extern "C" {
 #pragma GCC visibility push(hidden)
 #endif
 
-char* llvm_codegen (const char* tmpdir,
-                    cl_kernel kernel,
-                    cl_device_id device);
+char *llvm_codegen (const char *tmpdir, cl_kernel kernel, cl_device_id device,
+                    size_t local_x, size_t local_y, size_t local_z);
 
 void fill_dev_image_t (dev_image_t* di, struct pocl_argument* parg, 
                        cl_device_id device);
diff --git a/lib/CL/devices/cpuinfo.c b/lib/CL/devices/cpuinfo.c
index 17e25c6..822aa8b 100644
--- a/lib/CL/devices/cpuinfo.c
+++ b/lib/CL/devices/cpuinfo.c
@@ -48,9 +48,9 @@ static const char* cpufreq_file="/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_ma
  #define DEFAULTVENDOR "AIM" // Apple-IBM-Motorola
  #define DEFAULTVENDORID 0x1014 // IBM
  #define VENDORSTRING "vendor"
-#elif defined __arm__
+#elif defined __arm__ || __aarch64__
  #define FREQSTRING " "
- #define MODELSTRING "Processor"
+ #define MODELSTRING "CPU part"
  #define DEFAULTVENDOR "ARM"
  #define DEFAULTVENDORID 0x13b5 // ARM
  #define VENDORSTRING "CPU implementer"
@@ -263,6 +263,62 @@ pocl_sysfs_detect_compute_unit_count()
 }
 #endif
 
+#if __arm__ || __aarch64__
+enum
+{
+  JEP106_ARM    = 0x41,
+  JEP106_BRDCOM = 0x42,
+  JEP106_CAVIUM = 0x43,
+  JEP106_APM    = 0x50,
+  JEP106_QCOM   = 0x51
+};
+
+static const struct
+{
+  unsigned id; /* JEDEC JEP106 code; /proc/cpuinfo, field "CPU implementer" */
+  char const *name;
+}
+vendor_list[] =
+{
+  { JEP106_ARM,    "ARM" },
+  { JEP106_BRDCOM, "Broadcom" },
+  { JEP106_CAVIUM, "Cavium" },
+  { JEP106_APM,    "Applied Micro" },
+  { JEP106_QCOM,   "Qualcomm" }
+};
+
+typedef struct
+{
+  unsigned id; /* part code; /proc/cpuinfo, field "CPU part" */
+  char const *name;
+} part_tuple_t;
+
+static const part_tuple_t part_list_arm[] =
+{
+  { 0xd0a, "cortex-a75" },
+  { 0xd09, "cortex-a73" },
+  { 0xd08, "cortex-a72" },
+  { 0xd07, "cortex-a57" },
+  { 0xd05, "cortex-a55" },
+  { 0xd04, "cortex-a35" },
+  { 0xd03, "cortex-a53" },
+  { 0xd01, "cortex-a32" },
+  { 0xc0f, "cortex-a15" },
+  { 0xc0e, "cortex-a17" },
+  { 0xc0d, "cortex-a12" }, /* Rockchip RK3288 */
+  { 0xc0c, "cortex-a12" },
+  { 0xc09, "cortex-a9" },
+  { 0xc08, "cortex-a8" },
+  { 0xc07, "cortex-a7" },
+  { 0xc05, "cortex-a5" }
+};
+
+static const part_tuple_t part_list_apm[] =
+{
+  { 0x0, "x-gene-1" }
+};
+#endif
+
 static void
 pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device)
 {
@@ -284,8 +340,12 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device)
   fclose(f);
   contents[num_read]='\0';
 
-  char *start, *end;
+  char const *start, *end;
   /* find the vendor_id string an put */
+
+#if __arm__ || __aarch64__
+  unsigned vendor_id = -1;
+#endif
 #ifdef VENDORSTRING
   do {
     start = strstr(contents, VENDORSTRING"\t: ");
@@ -295,6 +355,22 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device)
     end = strchr(start, '\n');
     if (!end)
       break;
+
+#if __arm__ || __aarch64__
+    if (1 == sscanf (start, "%x", &vendor_id))
+      {
+        for (size_t i = 0; i < sizeof (vendor_list) / sizeof (vendor_list[0]); ++i)
+          {
+            if (vendor_id == vendor_list[i].id)
+              {
+                start = vendor_list[i].name;
+                end = start + strlen (vendor_list[i].name);
+                break;
+              }
+          }
+      }
+#endif
+
     char *_vendor = malloc(end-start + 1);
     if (!_vendor)
       break;
@@ -313,12 +389,42 @@ pocl_cpuinfo_get_cpu_name_and_vendor(cl_device_id device)
   if (end == NULL)
     return;
 
+#if __arm__ || __aarch64__
+  unsigned part_id;
+  if (1 == sscanf (start, "%x", &part_id))
+    {
+      part_tuple_t const *part_list = NULL;
+      size_t part_count = 0;
+
+      switch (vendor_id)
+      {
+        case JEP106_ARM:
+          part_list = part_list_arm;
+          part_count = sizeof (part_list_arm) / sizeof (part_list_arm[0]);
+          break;
+        case JEP106_APM:
+          part_list = part_list_apm;
+          part_count = sizeof (part_list_apm) / sizeof (part_list_apm[0]);
+          break;
+      }
+
+      for (size_t i = 0; i < part_count; ++i)
+        {
+          if (part_id == part_list[i].id)
+            {
+              start = part_list[i].name;
+              end = start + strlen (part_list[i].name);
+              break;
+            }
+        }
+    }
+#endif
+
   /* create the descriptive long_name for device */
   int len = strlen (device->short_name) + (end-start) + 2;
   char *new_name = (char*)malloc (len);
   snprintf (new_name, len, "%s-%s", device->short_name, start);
   device->long_name = new_name;
-
 }
 
 void
diff --git a/include/CMakeLists.txt b/lib/CL/devices/cuda/CMakeLists.txt
similarity index 67%
copy from include/CMakeLists.txt
copy to lib/CL/devices/cuda/CMakeLists.txt
index e8462c5..1c41190 100644
--- a/include/CMakeLists.txt
+++ b/lib/CL/devices/cuda/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2016 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,12 +23,12 @@
 #
 #=============================================================================
 
-add_subdirectory("CL")
+find_package(CUDA REQUIRED)
+message(STATUS "CUDA_TOOLKIT_ROOT_DIR = ${CUDA_TOOLKIT_ROOT_DIR}")
 
-set(PRIVATE_HEADERS _enable_all_exts.h _kernel.h _kernel_c.h _kernel_constants.h pocl_types.h pocl_device.h pocl.h)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${LLVM_CXXFLAGS}")
+include_directories(${LLVM_INCLUDE_DIRS} ${CUDA_INCLUDE_DIRS} ${CMAKE_CURRENT_SOURCE_DIR}/../../../llvmopencl)
 
-install(FILES ${PRIVATE_HEADERS}
-        DESTINATION ${POCL_INSTALL_PRIVATE_HEADER_DIR})
-
-install(FILES "poclu.h"
-        DESTINATION ${POCL_INSTALL_PUBLIC_HEADER_DIR})
+add_library("pocl-devices-cuda" OBJECT pocl-cuda.c pocl-cuda.h pocl-ptx-gen.cc pocl-ptx-gen.h)
+target_compile_definitions("pocl-devices-cuda" PRIVATE "-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"")
+set(POCL_DEVICES_OBJS "${POCL_DEVICES_OBJS};$<TARGET_OBJECTS:pocl-devices-cuda>" PARENT_SCOPE)
diff --git a/lib/CL/devices/cuda/pocl-cuda.c b/lib/CL/devices/cuda/pocl-cuda.c
new file mode 100644
index 0000000..bf27c2c
--- /dev/null
+++ b/lib/CL/devices/cuda/pocl-cuda.c
@@ -0,0 +1,1623 @@
+/* pocl-cuda.c - driver for CUDA devices
+
+   Copyright (c) 2016-2017 James Price / University of Bristol
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to
+   deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "config.h"
+
+#include "common.h"
+#include "devices.h"
+#include "pocl-cuda.h"
+#include "pocl-ptx-gen.h"
+#include "pocl_cache.h"
+#include "pocl_file_util.h"
+#include "pocl_llvm.h"
+#include "pocl_mem_management.h"
+#include "pocl_runtime_config.h"
+#include "pocl_util.h"
+
+#include <string.h>
+
+#include <cuda.h>
+
+typedef struct pocl_cuda_device_data_s
+{
+  CUdevice device;
+  CUcontext context;
+  CUevent epoch_event;
+  cl_ulong epoch;
+  char libdevice[PATH_MAX];
+  pocl_lock_t compile_lock;
+} pocl_cuda_device_data_t;
+
+typedef struct pocl_cuda_queue_data_s
+{
+  CUstream stream;
+  int use_threads;
+  pthread_t submit_thread;
+  pthread_t finalize_thread;
+  pthread_mutex_t lock;
+  pthread_cond_t pending_cond;
+  pthread_cond_t running_cond;
+  _cl_command_node *volatile pending_queue;
+  _cl_command_node *volatile running_queue;
+  cl_command_queue queue;
+} pocl_cuda_queue_data_t;
+
+typedef struct pocl_cuda_kernel_data_s
+{
+  CUmodule module;
+  CUmodule module_offsets;
+  CUfunction kernel;
+  CUfunction kernel_offsets;
+  size_t *alignments;
+} pocl_cuda_kernel_data_t;
+
+typedef struct pocl_cuda_event_data_s
+{
+  CUevent start;
+  CUevent end;
+  volatile int events_ready;
+  cl_int *ext_event_flag;
+  volatile unsigned num_ext_events;
+} pocl_cuda_event_data_t;
+
+extern unsigned int pocl_num_devices;
+
+void *pocl_cuda_submit_thread (void *);
+void *pocl_cuda_finalize_thread (void *);
+
+static void
+pocl_cuda_abort_on_error (CUresult result, unsigned line, const char *func,
+                          const char *code, const char *api)
+{
+  if (result != CUDA_SUCCESS)
+    {
+      const char *err_name;
+      const char *err_string;
+      cuGetErrorName (result, &err_name);
+      cuGetErrorString (result, &err_string);
+      POCL_MSG_PRINT2 (CUDA, func, line, "Error during %s\n", api);
+      POCL_ABORT ("%s: %s\n", err_name, err_string);
+    }
+}
+
+static int
+pocl_cuda_error (CUresult result, unsigned line, const char *func,
+                          const char *code, const char *api)
+{
+  int err = (result != CUDA_SUCCESS);
+  if (err)
+    {
+      const char *err_name;
+      const char *err_string;
+      cuGetErrorName (result, &err_name);
+      cuGetErrorString (result, &err_string);
+      POCL_MSG_ERR ("CUDA error during %s. %s: %s\n", api, err_name, err_string);
+    }
+  return err;
+}
+
+#define CUDA_CHECK(result, api)                                               \
+  pocl_cuda_abort_on_error (result, __LINE__, __FUNCTION__, #result, api)
+
+#define CUDA_CHECK_ERROR(result, api)                                         \
+  pocl_cuda_error (result, __LINE__, __FUNCTION__, #result, api)
+
+void
+pocl_cuda_init_device_ops (struct pocl_device_ops *ops)
+{
+  pocl_basic_init_device_ops (ops);
+
+  ops->device_name = "CUDA";
+  ops->init_device_infos = pocl_cuda_init_device_infos;
+  ops->build_hash = pocl_cuda_build_hash;
+  ops->probe = pocl_cuda_probe;
+  ops->uninit = pocl_cuda_uninit;
+  ops->init = pocl_cuda_init;
+  ops->init_queue = pocl_cuda_init_queue;
+  ops->free_queue = pocl_cuda_free_queue;
+  ops->alloc_mem_obj = pocl_cuda_alloc_mem_obj;
+  ops->free = pocl_cuda_free;
+  ops->free_ptr = pocl_cuda_free_ptr;
+  ops->compile_kernel = pocl_cuda_compile_kernel;
+  ops->map_mem = pocl_cuda_map_mem;
+  ops->submit = pocl_cuda_submit;
+  ops->notify = pocl_cuda_notify;
+  ops->wait_event = pocl_cuda_wait_event;
+  ops->update_event = pocl_cuda_update_event;
+  ops->free_event_data = pocl_cuda_free_event_data;
+  ops->join = pocl_cuda_join;
+  ops->flush = pocl_cuda_flush;
+
+  ops->read = NULL;
+  ops->read_rect = NULL;
+  ops->write = NULL;
+  ops->write_rect = NULL;
+  ops->copy = NULL;
+  ops->copy_rect = NULL;
+  ops->unmap_mem = NULL;
+  ops->run = NULL;
+
+  /* TODO: implement remaining ops functions if needed: */
+  /* broadcast */
+  /* get_timer_value */
+}
+
+cl_int
+pocl_cuda_init (unsigned j, cl_device_id dev, const char *parameters)
+{
+  CUresult result;
+  int ret = CL_SUCCESS;
+
+  if (dev->data)
+    return ret;
+
+  pocl_cuda_device_data_t *data = calloc (1, sizeof (pocl_cuda_device_data_t));
+  result = cuDeviceGet (&data->device, j);
+  if (CUDA_CHECK_ERROR (result, "cuDeviceGet"))
+    ret = CL_INVALID_DEVICE;
+
+  /* Get specific device name */
+  dev->long_name = dev->short_name = calloc (256, sizeof (char));
+
+  if (ret != CL_INVALID_DEVICE)
+    cuDeviceGetName (dev->long_name, 256, data->device);
+  else
+    snprintf (dev->long_name, 255, "Unavailable CUDA device #%d", j);
+
+  SETUP_DEVICE_CL_VERSION (CUDA_DEVICE_CL_VERSION_MAJOR,
+                           CUDA_DEVICE_CL_VERSION_MINOR);
+
+  /* Get other device properties */
+  if (ret != CL_INVALID_DEVICE)
+    {
+      /* CUDA device attributes (as fetched by cuDeviceGetAttribute) are always (unsigned)
+       * integers, where the OpenCL counterparts are of a variety of (other) integer types.
+       * Fetch the values in an unsigned int and copy it over.
+       * We also OR all return values of cuDeviceGetAttribute, and at the end we will check
+       * if it's not CL_SUCCESS. We miss the exact line that failed this way, but it's
+       * faster than checking after each attribute fetch.
+       */
+      unsigned int value = 0;
+#define GET_CU_PROP(key, target) do { \
+  result |= cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_##key, data->device); \
+  target = value; \
+} while (0)
+
+      GET_CU_PROP (MAX_THREADS_PER_BLOCK, dev->max_work_group_size);
+      GET_CU_PROP (MAX_BLOCK_DIM_X, dev->max_work_item_sizes[0]);
+      GET_CU_PROP (MAX_BLOCK_DIM_Y, dev->max_work_item_sizes[1]);
+      GET_CU_PROP (MAX_BLOCK_DIM_Z, dev->max_work_item_sizes[2]);
+      GET_CU_PROP (MAX_SHARED_MEMORY_PER_BLOCK, dev->local_mem_size);
+      GET_CU_PROP (MULTIPROCESSOR_COUNT, dev->max_compute_units);
+      GET_CU_PROP (ECC_ENABLED, dev->error_correction_support);
+      GET_CU_PROP (INTEGRATED, dev->host_unified_memory);
+      GET_CU_PROP (TOTAL_CONSTANT_MEMORY, dev->max_constant_buffer_size);
+      GET_CU_PROP (CLOCK_RATE, dev->max_clock_frequency);
+      dev->max_clock_frequency /= 1000;
+      GET_CU_PROP (TEXTURE_ALIGNMENT, dev->mem_base_addr_align);
+      dev->mem_base_addr_align *= 8;
+      GET_CU_PROP (INTEGRATED, dev->host_unified_memory);
+    }
+  if (CUDA_CHECK_ERROR (result, "cuDeviceGetAttribute"))
+    ret = CL_INVALID_DEVICE;
+
+  dev->preferred_wg_size_multiple = 32;
+  dev->preferred_vector_width_char = 1;
+  dev->preferred_vector_width_short = 1;
+  dev->preferred_vector_width_int = 1;
+  dev->preferred_vector_width_long = 1;
+  dev->preferred_vector_width_float = 1;
+  dev->preferred_vector_width_double = 1;
+  dev->preferred_vector_width_half = 0;
+  dev->native_vector_width_char = 1;
+  dev->native_vector_width_short = 1;
+  dev->native_vector_width_int = 1;
+  dev->native_vector_width_long = 1;
+  dev->native_vector_width_float = 1;
+  dev->native_vector_width_double = 1;
+  dev->native_vector_width_half = 0;
+
+  dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
+                          | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
+                          | CL_FP_DENORM;
+  dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
+                          | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
+                          | CL_FP_DENORM;
+
+  dev->local_mem_type = CL_LOCAL;
+
+  /* Get GPU architecture name */
+  int sm_maj = 0, sm_min = 0;
+  if (ret != CL_INVALID_DEVICE)
+    {
+      cuDeviceGetAttribute (&sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                            data->device);
+      cuDeviceGetAttribute (&sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                            data->device);
+    }
+  char *gpu_arch = calloc (16, sizeof (char));
+  snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min);
+  dev->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch);
+  POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", dev->llvm_cpu);
+
+  /* Find libdevice library */
+  if (findLibDevice (data->libdevice, dev->llvm_cpu))
+    {
+      if (ret != CL_INVALID_DEVICE)
+        {
+          POCL_MSG_ERR ("[CUDA] failed to find libdevice library\n");
+          dev->compiler_available = 0;
+        }
+    }
+
+  /* Create context */
+  if (ret != CL_INVALID_DEVICE)
+    {
+      result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device);
+      if (CUDA_CHECK_ERROR (result, "cuCtxCreate"))
+        ret = CL_INVALID_DEVICE;
+    }
+
+  /* Create epoch event for timing info */
+  if (ret != CL_INVALID_DEVICE)
+    {
+      result = cuEventCreate (&data->epoch_event, CU_EVENT_DEFAULT);
+      CUDA_CHECK_ERROR (result, "cuEventCreate");
+
+      data->epoch = dev->ops->get_timer_value (dev->data);
+
+      result = cuEventRecord (data->epoch_event, 0);
+      result = cuEventSynchronize (data->epoch_event);
+      if (CUDA_CHECK_ERROR (result, "cuEventSynchronize"))
+        ret = CL_INVALID_DEVICE;
+    }
+
+  /* Get global memory size */
+  size_t memfree = 0, memtotal = 0;
+  if (ret != CL_INVALID_DEVICE)
+    result = cuMemGetInfo (&memfree, &memtotal);
+  dev->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024);
+  dev->global_mem_size = memtotal;
+
+  dev->data = data;
+
+  POCL_INIT_LOCK (data->compile_lock);
+  return ret;
+}
+
+cl_int
+pocl_cuda_init_queue (cl_command_queue queue)
+{
+  cuCtxSetCurrent (((pocl_cuda_device_data_t *)queue->device->data)->context);
+
+  pocl_cuda_queue_data_t *queue_data
+      = calloc (1, sizeof (pocl_cuda_queue_data_t));
+  queue->data = queue_data;
+  queue_data->queue = queue;
+
+  CUresult result
+      = cuStreamCreate (&queue_data->stream, CU_STREAM_NON_BLOCKING);
+  if (CUDA_CHECK_ERROR (result, "cuStreamCreate"))
+    return CL_OUT_OF_RESOURCES;
+
+  queue_data->use_threads
+      = !pocl_get_bool_option ("POCL_CUDA_DISABLE_QUEUE_THREADS", 0);
+
+  if (queue_data->use_threads)
+    {
+      pthread_mutex_init (&queue_data->lock, NULL);
+      pthread_cond_init (&queue_data->pending_cond, NULL);
+      pthread_cond_init (&queue_data->running_cond, NULL);
+      int err = pthread_create (&queue_data->submit_thread, NULL,
+                                pocl_cuda_submit_thread, queue_data);
+      if (err)
+        {
+          POCL_MSG_ERR ("[CUDA] Error creating submit thread: %d\n", err);
+          return CL_OUT_OF_RESOURCES;
+        }
+
+      err = pthread_create (&queue_data->finalize_thread, NULL,
+                            pocl_cuda_finalize_thread, queue_data);
+      if (err)
+        {
+          POCL_MSG_ERR ("[CUDA] Error creating finalize thread: %d\n", err);
+          return CL_OUT_OF_RESOURCES;
+        }
+    }
+
+  return CL_SUCCESS;
+}
+
+void
+pocl_cuda_free_queue (cl_command_queue queue)
+{
+  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)queue->data;
+
+  cuCtxSetCurrent (((pocl_cuda_device_data_t *)queue->device->data)->context);
+  cuStreamDestroy (queue_data->stream);
+
+  assert (queue_data->pending_queue == NULL);
+  assert (queue_data->running_queue == NULL);
+
+  /* Kill queue threads */
+  if (queue_data->use_threads)
+    {
+      pthread_mutex_lock (&queue_data->lock);
+      queue_data->queue = NULL;
+      pthread_cond_signal (&queue_data->pending_cond);
+      pthread_cond_signal (&queue_data->running_cond);
+      pthread_mutex_unlock (&queue_data->lock);
+      pthread_join (queue_data->submit_thread, NULL);
+      pthread_join (queue_data->finalize_thread, NULL);
+    }
+}
+
+char *
+pocl_cuda_build_hash (cl_device_id device)
+{
+  char *res = calloc (1000, sizeof (char));
+  snprintf (res, 1000, "CUDA-%s", device->llvm_cpu);
+  return res;
+}
+
+void
+pocl_cuda_init_device_infos (unsigned j, struct _cl_device_id *dev)
+{
+  pocl_basic_init_device_infos (j, dev);
+
+  dev->vendor = "NVIDIA Corporation";
+  dev->vendor_id = 0x10de; /* the PCIID for NVIDIA */
+
+  dev->type = CL_DEVICE_TYPE_GPU;
+  dev->address_bits = (sizeof (void *) * 8);
+  dev->llvm_target_triplet = (sizeof (void *) == 8) ? "nvptx64" : "nvptx";
+  dev->spmd = CL_TRUE;
+  dev->workgroup_pass = CL_FALSE;
+  dev->execution_capabilities = CL_EXEC_KERNEL;
+
+  dev->global_as_id = 1;
+  dev->local_as_id = 3;
+  dev->constant_as_id = 1;
+
+  /* TODO: Get images working */
+  dev->image_support = CL_FALSE;
+
+  dev->has_64bit_long = 1;
+}
+
+unsigned int
+pocl_cuda_probe (struct pocl_device_ops *ops)
+{
+  int env_count = pocl_device_get_env_count (ops->device_name);
+
+  int probe_count = 0;
+  CUresult ret = cuInit (0);
+  if (ret == CUDA_SUCCESS)
+    {
+      ret = cuDeviceGetCount (&probe_count);
+      if (ret != CUDA_SUCCESS)
+        probe_count = 0;
+    }
+
+  /* If the user requested a specific number of CUDA devices,
+   * pretend we only have that many, if we can. If they requested
+   * more than there are, abort informing the user of the issue.
+   */
+  if (env_count >= 0)
+    {
+      if (env_count > probe_count)
+        POCL_ABORT ("[CUDA] %d devices requested, but only %d are available\n",
+          env_count, probe_count);
+      probe_count = env_count;
+    }
+
+  return probe_count;
+}
+
+void
+pocl_cuda_uninit (cl_device_id device)
+{
+  pocl_cuda_device_data_t *data = device->data;
+
+  if (device->available)
+      cuCtxDestroy (data->context);
+
+  POCL_MEM_FREE (data);
+  device->data = NULL;
+
+  POCL_MEM_FREE (device->long_name);
+}
+
+cl_int
+pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, void *host_ptr)
+{
+  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
+
+  CUresult result;
+  void *b = NULL;
+
+  /* If memory for this global memory is not yet allocated -> do it */
+  if (mem_obj->device_ptrs[device->global_mem_id].mem_ptr == NULL)
+    {
+      cl_mem_flags flags = mem_obj->flags;
+
+      if (flags & CL_MEM_USE_HOST_PTR)
+        {
+#if defined __arm__
+          /* cuMemHostRegister is not supported on ARM.
+           * Allocate device memory and perform explicit copies
+           * before and after running a kernel */
+          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
+          CUDA_CHECK (result, "cuMemAlloc");
+#else
+          result = cuMemHostRegister (host_ptr, mem_obj->size,
+                                      CU_MEMHOSTREGISTER_DEVICEMAP);
+          if (result != CUDA_SUCCESS
+              && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
+            CUDA_CHECK (result, "cuMemHostRegister");
+          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, host_ptr, 0);
+          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
+#endif
+        }
+      else if (flags & CL_MEM_ALLOC_HOST_PTR)
+        {
+          result = cuMemHostAlloc (&mem_obj->mem_host_ptr, mem_obj->size,
+                                   CU_MEMHOSTREGISTER_DEVICEMAP);
+          CUDA_CHECK (result, "cuMemHostAlloc");
+          result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b,
+                                              mem_obj->mem_host_ptr, 0);
+          CUDA_CHECK (result, "cuMemHostGetDevicePointer");
+        }
+      else
+        {
+          result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
+          if (result != CUDA_SUCCESS)
+            {
+              const char *err;
+              cuGetErrorName (result, &err);
+              POCL_MSG_PRINT2 (CUDA, __FUNCTION__, __LINE__,
+                               "-> Failed to allocate memory: %s\n", err);
+              return CL_MEM_OBJECT_ALLOCATION_FAILURE;
+            }
+        }
+
+      if (flags & CL_MEM_COPY_HOST_PTR)
+        {
+          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem_obj->size);
+          CUDA_CHECK (result, "cuMemcpyHtoD");
+
+          result = cuStreamSynchronize (0);
+          CUDA_CHECK (result, "cuStreamSynchronize");
+        }
+
+      mem_obj->device_ptrs[device->global_mem_id].mem_ptr = b;
+      mem_obj->device_ptrs[device->global_mem_id].global_mem_id
+          = device->global_mem_id;
+    }
+
+  /* Copy allocated global mem info to devices own slot */
+  mem_obj->device_ptrs[device->dev_id]
+      = mem_obj->device_ptrs[device->global_mem_id];
+
+  return CL_SUCCESS;
+}
+
+void
+pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
+{
+  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
+
+  if (mem_obj->flags & CL_MEM_ALLOC_HOST_PTR)
+    {
+      cuMemFreeHost (mem_obj->mem_host_ptr);
+      mem_obj->mem_host_ptr = NULL;
+    }
+  else if (mem_obj->flags & CL_MEM_USE_HOST_PTR)
+    {
+      cuMemHostUnregister (mem_obj->mem_host_ptr);
+      mem_obj->mem_host_ptr = NULL;
+    }
+  else
+    {
+      void *ptr = mem_obj->device_ptrs[device->dev_id].mem_ptr;
+      cuMemFree ((CUdeviceptr)ptr);
+    }
+}
+
+void
+pocl_cuda_free_ptr (cl_device_id device, void *mem_ptr)
+{
+  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
+
+  cuMemFreeHost (mem_ptr);
+}
+
+void
+pocl_cuda_submit_read (CUstream stream, void *host_ptr, const void *device_ptr,
+                       size_t offset, size_t cb)
+{
+  CUresult result = cuMemcpyDtoHAsync (
+      host_ptr, (CUdeviceptr) (device_ptr + offset), cb, stream);
+  CUDA_CHECK (result, "cuMemcpyDtoHAsync");
+}
+
+void
+pocl_cuda_submit_write (CUstream stream, const void *host_ptr,
+                        void *device_ptr, size_t offset, size_t cb)
+{
+  CUresult result = cuMemcpyHtoDAsync ((CUdeviceptr) (device_ptr + offset),
+                                       host_ptr, cb, stream);
+  CUDA_CHECK (result, "cuMemcpyHtoDAsync");
+}
+
+void
+pocl_cuda_submit_copy (CUstream stream, cl_device_id src_dev, cl_mem src_buf,
+                       size_t src_offset, cl_device_id dst_dev, cl_mem dst_buf,
+                       size_t dst_offset, size_t cb)
+{
+  void *src_ptr = src_buf->device_ptrs[src_dev->dev_id].mem_ptr + src_offset;
+  void *dst_ptr = dst_buf->device_ptrs[dst_dev->dev_id].mem_ptr + dst_offset;
+
+  int src_is_cuda = !strcmp (src_dev->ops->device_name, "CUDA");
+  int dst_is_cuda = !strcmp (dst_dev->ops->device_name, "CUDA");
+  if (!src_is_cuda && src_dev->global_mem_id)
+    POCL_ABORT_UNIMPLEMENTED ("[CUDA] copy from non-host memory");
+  if (!dst_is_cuda && dst_dev->global_mem_id)
+    POCL_ABORT_UNIMPLEMENTED ("[CUDA] copy to non-host memory");
+
+  if (src_ptr == dst_ptr)
+    return;
+
+  CUresult result;
+  if (src_is_cuda && dst_is_cuda)
+    {
+      result = cuMemcpyDtoDAsync ((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr,
+                                  cb, stream);
+      CUDA_CHECK (result, "cuMemcpyDtoDAsync");
+    }
+  else if (src_is_cuda)
+    {
+      result = cuMemcpyDtoHAsync (dst_ptr, (CUdeviceptr)src_ptr, cb, stream);
+      CUDA_CHECK (result, "cuMemcpyDtoHAsync");
+    }
+  else if (dst_is_cuda)
+    {
+      result = cuMemcpyHtoDAsync ((CUdeviceptr)dst_ptr, src_ptr, cb, stream);
+      CUDA_CHECK (result, "cuMemcpyHtoDAsync");
+    }
+  else
+    {
+      /* This should infer a host->host copy via UVA */
+      result = cuMemcpyAsync ((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr, cb,
+                              stream);
+      CUDA_CHECK (result, "cuMemcpyAsync");
+    }
+}
+
+void
+pocl_cuda_submit_read_rect (CUstream stream, void *__restrict__ const host_ptr,
+                            void *__restrict__ const device_ptr,
+                            const size_t *__restrict__ const buffer_origin,
+                            const size_t *__restrict__ const host_origin,
+                            const size_t *__restrict__ const region,
+                            size_t const buffer_row_pitch,
+                            size_t const buffer_slice_pitch,
+                            size_t const host_row_pitch,
+                            size_t const host_slice_pitch)
+{
+  CUDA_MEMCPY3D params = { 0 };
+
+  params.WidthInBytes = region[0];
+  params.Height = region[1];
+  params.Depth = region[2];
+
+  params.dstMemoryType = CU_MEMORYTYPE_HOST;
+  params.dstHost = host_ptr;
+  params.dstXInBytes = host_origin[0];
+  params.dstY = host_origin[1];
+  params.dstZ = host_origin[2];
+  params.dstPitch = host_row_pitch;
+  params.dstHeight = host_slice_pitch / host_row_pitch;
+
+  params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+  params.srcDevice = (CUdeviceptr)device_ptr;
+  params.srcXInBytes = buffer_origin[0];
+  params.srcY = buffer_origin[1];
+  params.srcZ = buffer_origin[2];
+  params.srcPitch = buffer_row_pitch;
+  params.srcHeight = buffer_slice_pitch / buffer_row_pitch;
+
+  CUresult result = cuMemcpy3DAsync (&params, stream);
+  CUDA_CHECK (result, "cuMemcpy3DAsync");
+}
+
+void
+pocl_cuda_submit_write_rect (CUstream stream,
+                             const void *__restrict__ const host_ptr,
+                             void *__restrict__ const device_ptr,
+                             const size_t *__restrict__ const buffer_origin,
+                             const size_t *__restrict__ const host_origin,
+                             const size_t *__restrict__ const region,
+                             size_t const buffer_row_pitch,
+                             size_t const buffer_slice_pitch,
+                             size_t const host_row_pitch,
+                             size_t const host_slice_pitch)
+{
+  CUDA_MEMCPY3D params = { 0 };
+
+  params.WidthInBytes = region[0];
+  params.Height = region[1];
+  params.Depth = region[2];
+
+  params.srcMemoryType = CU_MEMORYTYPE_HOST;
+  params.srcHost = host_ptr;
+  params.srcXInBytes = host_origin[0];
+  params.srcY = host_origin[1];
+  params.srcZ = host_origin[2];
+  params.srcPitch = host_row_pitch;
+  params.srcHeight = host_slice_pitch / host_row_pitch;
+
+  params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+  params.dstDevice = (CUdeviceptr)device_ptr;
+  params.dstXInBytes = buffer_origin[0];
+  params.dstY = buffer_origin[1];
+  params.dstZ = buffer_origin[2];
+  params.dstPitch = buffer_row_pitch;
+  params.dstHeight = buffer_slice_pitch / buffer_row_pitch;
+
+  CUresult result = cuMemcpy3DAsync (&params, stream);
+  CUDA_CHECK (result, "cuMemcpy3DAsync");
+}
+
+void
+pocl_cuda_submit_copy_rect (CUstream stream, cl_device_id src_dev,
+                            cl_mem src_buf, cl_device_id dst_dev,
+                            cl_mem dst_buf,
+                            const size_t *__restrict__ const src_origin,
+                            const size_t *__restrict__ const dst_origin,
+                            const size_t *__restrict__ const region,
+                            size_t const src_row_pitch,
+                            size_t const src_slice_pitch,
+                            size_t const dst_row_pitch,
+                            size_t const dst_slice_pitch)
+{
+  void *src_ptr = src_buf->device_ptrs[src_dev->dev_id].mem_ptr;
+  void *dst_ptr = dst_buf->device_ptrs[dst_dev->dev_id].mem_ptr;
+
+  CUDA_MEMCPY3D params = { 0 };
+
+  params.WidthInBytes = region[0];
+  params.Height = region[1];
+  params.Depth = region[2];
+
+  params.srcDevice = (CUdeviceptr)src_ptr;
+  params.srcXInBytes = src_origin[0];
+  params.srcY = src_origin[1];
+  params.srcZ = src_origin[2];
+  params.srcPitch = src_row_pitch;
+  params.srcHeight = src_slice_pitch / src_row_pitch;
+
+  params.dstDevice = (CUdeviceptr)dst_ptr;
+  params.dstXInBytes = dst_origin[0];
+  params.dstY = dst_origin[1];
+  params.dstZ = dst_origin[2];
+  params.dstPitch = dst_row_pitch;
+  params.dstHeight = dst_slice_pitch / dst_row_pitch;
+
+  int src_is_cuda = !strcmp (src_dev->ops->device_name, "CUDA");
+  int dst_is_cuda = !strcmp (dst_dev->ops->device_name, "CUDA");
+  if (!src_is_cuda && src_dev->global_mem_id)
+    POCL_ABORT_UNIMPLEMENTED ("[CUDA] copy from non-host memory");
+  if (!dst_is_cuda && dst_dev->global_mem_id)
+    POCL_ABORT_UNIMPLEMENTED ("[CUDA] copy to non-host memory");
+
+  params.srcMemoryType
+      = src_is_cuda ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
+  params.dstMemoryType
+      = dst_is_cuda ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
+
+  CUresult result = cuMemcpy3DAsync (&params, stream);
+  CUDA_CHECK (result, "cuMemcpy3DAsync");
+}
+
+void *
+pocl_cuda_map_mem (void *data, void *buf_ptr, size_t offset, size_t size,
+                   void *host_ptr)
+{
+  assert (host_ptr == NULL);
+
+  return malloc (size);
+}
+
+void
+pocl_cuda_submit_map_mem (CUstream stream, void *buf_ptr, size_t offset,
+                          size_t size, void *host_ptr)
+{
+  assert (host_ptr != NULL);
+
+  /* TODO: Map instead of copy? */
+  /* TODO: don't copy if mapped as CL_MAP_WRITE_INVALIDATE_REGION */
+  CUresult result = cuMemcpyDtoHAsync (
+      host_ptr, (CUdeviceptr) (buf_ptr + offset), size, stream);
+  CUDA_CHECK (result, "cuMemcpyDtoHAsync");
+}
+
+void *
+pocl_cuda_submit_unmap_mem (CUstream stream, void *host_ptr,
+                            void *device_start_ptr, size_t offset, size_t size)
+{
+  if (host_ptr)
+    {
+      /* TODO: Only copy back if mapped for writing */
+      CUresult result = cuMemcpyHtoDAsync (
+          (CUdeviceptr) (device_start_ptr + offset), host_ptr, size, stream);
+      CUDA_CHECK (result, "cuMemcpyHtoDAsync");
+    }
+  return NULL;
+}
+
+static pocl_cuda_kernel_data_t *
+load_or_generate_kernel (cl_kernel kernel, cl_device_id device,
+                         int has_offsets)
+{
+  CUresult result;
+
+  /* Check if we already have a compiled kernel function */
+  pocl_cuda_kernel_data_t *kdata = (pocl_cuda_kernel_data_t *)kernel->data;
+  if (kdata)
+    {
+      kdata += device->dev_id;
+      if ((has_offsets && kdata->kernel_offsets)
+          || (!has_offsets && kdata->kernel))
+        return kdata;
+    }
+  else
+    {
+      /* TODO: when can we release this? */
+      kernel->data
+          = calloc (pocl_num_devices, sizeof (pocl_cuda_kernel_data_t));
+      kdata = kernel->data + device->dev_id;
+    }
+
+  pocl_cuda_device_data_t *ddata = (pocl_cuda_device_data_t *)device->data;
+  cuCtxSetCurrent (ddata->context);
+
+  POCL_LOCK(ddata->compile_lock);
+
+  /* Generate the parallel bitcode file linked with the kernel library */
+  int error = pocl_llvm_generate_workgroup_function (device, kernel, 0, 0, 0);
+  if (error)
+    {
+      POCL_MSG_PRINT_GENERAL ("pocl_llvm_generate_workgroup_function() failed"
+                              " for kernel %s\n", kernel->name);
+      assert (error == 0);
+    }
+
+  char bc_filename[POCL_FILENAME_LENGTH];
+  unsigned device_i = pocl_cl_device_to_index (kernel->program, device);
+  pocl_cache_work_group_function_path (bc_filename, kernel->program, device_i,
+                                       kernel, 0, 0, 0);
+
+  char ptx_filename[POCL_FILENAME_LENGTH];
+  strcpy (ptx_filename, bc_filename);
+  if (has_offsets)
+    strncat (ptx_filename, ".offsets", POCL_FILENAME_LENGTH - 1);
+  strncat (ptx_filename, ".ptx", POCL_FILENAME_LENGTH - 1);
+
+  if (!pocl_exists (ptx_filename))
+    {
+      /* Generate PTX from LLVM bitcode */
+      if (pocl_ptx_gen (bc_filename, ptx_filename, kernel->name,
+                        device->llvm_cpu,
+                        ((pocl_cuda_device_data_t *)device->data)->libdevice,
+                        has_offsets))
+        POCL_ABORT ("pocl-cuda: failed to generate PTX\n");
+    }
+
+  /* Load PTX module */
+  /* TODO: When can we unload the module? */
+  CUmodule module;
+  result = cuModuleLoad (&module, ptx_filename);
+  CUDA_CHECK (result, "cuModuleLoad");
+
+  /* Get kernel function */
+  CUfunction function;
+  result = cuModuleGetFunction (&function, module, kernel->name);
+  CUDA_CHECK (result, "cuModuleGetFunction");
+
+  /* Get pointer aligment */
+  if (!kdata->alignments)
+    {
+      kdata->alignments = calloc (kernel->num_args + kernel->num_locals + 4,
+                                  sizeof (size_t));
+      pocl_cuda_get_ptr_arg_alignment (bc_filename, kernel->name,
+                                       kdata->alignments);
+    }
+
+  if (has_offsets)
+    {
+      kdata->module_offsets = module;
+      kdata->kernel_offsets = function;
+    }
+  else
+    {
+      kdata->module = module;
+      kdata->kernel = function;
+    }
+
+  POCL_UNLOCK (ddata->compile_lock);
+
+  return kdata;
+}
+
+void
+pocl_cuda_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
+                          cl_device_id device)
+{
+  load_or_generate_kernel (kernel, device, 0);
+}
+
+void
+pocl_cuda_submit_kernel (CUstream stream, _cl_command_run run,
+                         cl_device_id device, cl_event event)
+{
+  cl_kernel kernel = run.kernel;
+  pocl_argument *arguments = run.arguments;
+  struct pocl_context pc = run.pc;
+
+  /* Check if we need to handle global work offsets */
+  int has_offsets = 0;
+  if (pc.global_offset[0] || pc.global_offset[1] || pc.global_offset[2])
+    has_offsets = 1;
+
+  /* Get kernel function */
+  pocl_cuda_kernel_data_t *kdata
+      = load_or_generate_kernel (kernel, device, has_offsets);
+  CUmodule module = has_offsets ? kdata->module_offsets : kdata->module;
+  CUfunction function = has_offsets ? kdata->kernel_offsets : kdata->kernel;
+
+  /* Prepare kernel arguments */
+  void *null = NULL;
+  unsigned sharedMemBytes = 0;
+  void *params[kernel->num_args + kernel->num_locals + 4];
+  unsigned sharedMemOffsets[kernel->num_args + kernel->num_locals];
+  unsigned constantMemBytes = 0;
+  unsigned constantMemOffsets[kernel->num_args];
+  unsigned globalOffsets[3];
+
+  /* Get handle to constant memory buffer */
+  size_t constant_mem_size;
+  CUdeviceptr constant_mem_base = 0;
+  cuModuleGetGlobal (&constant_mem_base, &constant_mem_size, module,
+                     "_constant_memory_region_");
+
+  CUresult result;
+  unsigned i;
+  for (i = 0; i < kernel->num_args; i++)
+    {
+      pocl_argument_type type = kernel->arg_info[i].type;
+      switch (type)
+        {
+        case POCL_ARG_TYPE_NONE:
+          params[i] = arguments[i].value;
+          break;
+        case POCL_ARG_TYPE_POINTER:
+          {
+            if (kernel->arg_info[i].is_local)
+              {
+                size_t size = arguments[i].size;
+                size_t align = kdata->alignments[i];
+
+                /* Pad offset to align memory */
+                if (sharedMemBytes % align)
+                  sharedMemBytes += align - (sharedMemBytes % align);
+
+                sharedMemOffsets[i] = sharedMemBytes;
+                params[i] = sharedMemOffsets + i;
+
+                sharedMemBytes += size;
+              }
+            else if (kernel->arg_info[i].address_qualifier
+                     == CL_KERNEL_ARG_ADDRESS_CONSTANT)
+              {
+                assert (constant_mem_base);
+
+                /* Get device pointer */
+                cl_mem mem = *(void **)arguments[i].value;
+                CUdeviceptr src
+                    = (CUdeviceptr)mem->device_ptrs[device->dev_id].mem_ptr;
+
+                size_t align = kdata->alignments[i];
+                if (constantMemBytes % align)
+                  {
+                    constantMemBytes += align - (constantMemBytes % align);
+                  }
+
+                /* Copy to constant buffer at current offset */
+                result
+                    = cuMemcpyDtoDAsync (constant_mem_base + constantMemBytes,
+                                         src, mem->size, stream);
+                CUDA_CHECK (result, "cuMemcpyDtoDAsync");
+
+                constantMemOffsets[i] = constantMemBytes;
+                params[i] = constantMemOffsets + i;
+
+                constantMemBytes += mem->size;
+              }
+            else
+              {
+                if (arguments[i].value)
+                  {
+                    cl_mem mem = *(void **)arguments[i].value;
+                    params[i] = &mem->device_ptrs[device->dev_id].mem_ptr;
+
+#if defined __arm__
+                    /* On ARM with USE_HOST_PTR, perform explicit copy to
+                     * device */
+                    if (mem->flags & CL_MEM_USE_HOST_PTR)
+                      {
+                        cuMemcpyHtoD (*(CUdeviceptr *)(params[i]),
+                                      mem->mem_host_ptr, mem->size);
+                        cuStreamSynchronize (0);
+                      }
+#endif
+                  }
+                else
+                  {
+                    params[i] = &null;
+                  }
+              }
+            break;
+          }
+        case POCL_ARG_TYPE_IMAGE:
+        case POCL_ARG_TYPE_SAMPLER:
+          POCL_ABORT ("Unhandled argument type for CUDA\n");
+          break;
+        }
+    }
+
+  if (constantMemBytes > constant_mem_size)
+    POCL_ABORT ("[CUDA] Total constant buffer size %u exceeds %lu allocated\n",
+                constantMemBytes, constant_mem_size);
+
+  unsigned arg_index = kernel->num_args;
+
+  /* Deal with automatic local allocations */
+  /* TODO: Would be better to remove arguments and make these static GEPs */
+  for (i = 0; i < kernel->num_locals; ++i, ++arg_index)
+    {
+      size_t size = arguments[arg_index].size;
+      size_t align = kdata->alignments[arg_index];
+
+      /* Pad offset to align memory */
+      if (sharedMemBytes % align)
+        sharedMemBytes += align - (sharedMemBytes % align);
+
+      sharedMemOffsets[arg_index] = sharedMemBytes;
+      sharedMemBytes += size;
+      params[arg_index] = sharedMemOffsets + arg_index;
+    }
+
+  /* Add global work dimensionality */
+  params[arg_index++] = &pc.work_dim;
+
+  /* Add global offsets if necessary */
+  if (has_offsets)
+    {
+      globalOffsets[0] = pc.global_offset[0];
+      globalOffsets[1] = pc.global_offset[1];
+      globalOffsets[2] = pc.global_offset[2];
+      params[arg_index++] = globalOffsets + 0;
+      params[arg_index++] = globalOffsets + 1;
+      params[arg_index++] = globalOffsets + 2;
+    }
+
+  /* Launch kernel */
+  result = cuLaunchKernel (function, pc.num_groups[0], pc.num_groups[1],
+                           pc.num_groups[2], run.local_x, run.local_y,
+                           run.local_z, sharedMemBytes, stream, params, NULL);
+  CUDA_CHECK (result, "cuLaunchKernel");
+}
+
+void
+pocl_cuda_submit_node (_cl_command_node *node, cl_command_queue cq)
+{
+  CUresult result;
+  CUstream stream = ((pocl_cuda_queue_data_t *)cq->data)->stream;
+
+  POCL_LOCK_OBJ (node->event);
+
+  pocl_cuda_event_data_t *event_data
+      = (pocl_cuda_event_data_t *)node->event->data;
+
+  /* Process event dependencies */
+  event_node *dep = NULL;
+  LL_FOREACH (node->event->wait_list, dep)
+    {
+      /* If it is in the process of completing, just skip it */
+      if (dep->event->status <= CL_COMPLETE)
+        continue;
+
+      /* Add CUDA event dependency */
+      if (dep->event->command_type != CL_COMMAND_USER
+          && dep->event->queue->device->ops == cq->device->ops)
+        {
+          /* Block stream on event, but only for different queues */
+          if (dep->event->queue != node->event->queue)
+            {
+              pocl_cuda_event_data_t *dep_data
+                  = (pocl_cuda_event_data_t *)dep->event->data;
+
+              /* Wait until dependency has finished being submitted */
+              while (!dep_data->events_ready)
+                ;
+
+              result = cuStreamWaitEvent (stream, dep_data->end, 0);
+              CUDA_CHECK (result, "cuStreamWaitEvent");
+            }
+        }
+      else
+        {
+          if (!((pocl_cuda_queue_data_t *)cq->data)->use_threads)
+            POCL_ABORT (
+                "Can't handle non-CUDA dependencies without queue threads\n");
+
+          event_data->num_ext_events++;
+        }
+    }
+
+  /* Wait on flag for external events */
+  if (event_data->num_ext_events)
+    {
+      CUdeviceptr dev_ext_event_flag;
+      result = cuMemHostAlloc ((void **)&event_data->ext_event_flag, 4,
+                               CU_MEMHOSTALLOC_DEVICEMAP);
+      CUDA_CHECK (result, "cuMemAllocHost");
+
+      *event_data->ext_event_flag = 0;
+
+      result = cuMemHostGetDevicePointer (&dev_ext_event_flag,
+                                          event_data->ext_event_flag, 0);
+      CUDA_CHECK (result, "cuMemHostGetDevicePointer");
+      result = cuStreamWaitValue32 (stream, dev_ext_event_flag, 1,
+                                    CU_STREAM_WAIT_VALUE_GEQ);
+      CUDA_CHECK (result, "cuStreamWaitValue32");
+    }
+
+  /* Create and record event for command start if profiling enabled */
+  if (cq->properties & CL_QUEUE_PROFILING_ENABLE)
+    {
+      result = cuEventCreate (&event_data->start, CU_EVENT_DEFAULT);
+      CUDA_CHECK (result, "cuEventCreate");
+      result = cuEventRecord (event_data->start, stream);
+      CUDA_CHECK (result, "cuEventRecord");
+    }
+
+  POCL_UPDATE_EVENT_SUBMITTED (node->event);
+
+  POCL_UNLOCK_OBJ (node->event);
+
+  switch (node->type)
+    {
+    case CL_COMMAND_READ_BUFFER:
+      pocl_cuda_submit_read (stream, node->command.read.host_ptr,
+                             node->command.read.device_ptr,
+                             node->command.read.offset, node->command.read.cb);
+      break;
+    case CL_COMMAND_WRITE_BUFFER:
+      pocl_cuda_submit_write (
+          stream, node->command.write.host_ptr, node->command.write.device_ptr,
+          node->command.write.offset, node->command.write.cb);
+      break;
+    case CL_COMMAND_COPY_BUFFER:
+      {
+        cl_device_id src_dev = node->command.copy.src_dev;
+        cl_mem src_buf = node->command.copy.src_buffer;
+        cl_device_id dst_dev = node->command.copy.dst_dev;
+        cl_mem dst_buf = node->command.copy.dst_buffer;
+        if (!src_dev)
+          src_dev = dst_dev;
+        pocl_cuda_submit_copy (
+            stream, src_dev, src_buf, node->command.copy.src_offset, dst_dev,
+            dst_buf, node->command.copy.dst_offset, node->command.copy.cb);
+        break;
+      }
+    case CL_COMMAND_READ_BUFFER_RECT:
+      pocl_cuda_submit_read_rect (
+          stream, node->command.read_image.host_ptr,
+          node->command.read_image.device_ptr, node->command.read_image.origin,
+          node->command.read_image.h_origin, node->command.read_image.region,
+          node->command.read_image.b_rowpitch,
+          node->command.read_image.b_slicepitch,
+          node->command.read_image.h_rowpitch,
+          node->command.read_image.h_slicepitch);
+      break;
+    case CL_COMMAND_WRITE_BUFFER_RECT:
+      pocl_cuda_submit_write_rect (stream, node->command.write_image.host_ptr,
+                                   node->command.write_image.device_ptr,
+                                   node->command.write_image.origin,
+                                   node->command.write_image.h_origin,
+                                   node->command.write_image.region,
+                                   node->command.write_image.b_rowpitch,
+                                   node->command.write_image.b_slicepitch,
+                                   node->command.write_image.h_rowpitch,
+                                   node->command.write_image.h_slicepitch);
+      break;
+    case CL_COMMAND_COPY_BUFFER_RECT:
+      {
+        cl_device_id src_dev = node->command.copy_image.src_device;
+        cl_mem src_buf = node->command.copy_image.src_buffer;
+        cl_device_id dst_dev = node->command.copy_image.dst_device;
+        cl_mem dst_buf = node->command.copy_image.dst_buffer;
+        if (!src_dev)
+          src_dev = dst_dev;
+        pocl_cuda_submit_copy_rect (stream, src_dev, src_buf, dst_dev, dst_buf,
+                                    node->command.copy_image.src_origin,
+                                    node->command.copy_image.dst_origin,
+                                    node->command.copy_image.region,
+                                    node->command.copy_image.src_rowpitch,
+                                    node->command.copy_image.src_slicepitch,
+                                    node->command.copy_image.dst_rowpitch,
+                                    node->command.copy_image.dst_slicepitch);
+        break;
+      }
+    case CL_COMMAND_MIGRATE_MEM_OBJECTS:
+      {
+        int i;
+        for (i = 0; i < node->command.migrate.num_mem_objects; i++)
+          {
+            cl_device_id src_dev = node->command.migrate.source_devices[i];
+            cl_device_id dst_dev = cq->device;
+            cl_mem buf = node->command.migrate.mem_objects[i];
+            if (!src_dev)
+              src_dev = dst_dev;
+            pocl_cuda_submit_copy (stream, src_dev, buf, 0, dst_dev, buf, 0,
+                                   buf->size);
+          }
+        break;
+      }
+    case CL_COMMAND_MAP_BUFFER:
+      {
+        cl_device_id device = node->device;
+        cl_mem buffer = node->command.map.buffer;
+        if (!(buffer->flags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)))
+          pocl_cuda_submit_map_mem (
+              stream, buffer->device_ptrs[device->dev_id].mem_ptr,
+              node->command.map.mapping->offset,
+              node->command.map.mapping->size,
+              node->command.map.mapping->host_ptr);
+        POCL_LOCK_OBJ (buffer);
+        buffer->map_count++;
+        POCL_UNLOCK_OBJ (buffer);
+        break;
+      }
+    case CL_COMMAND_UNMAP_MEM_OBJECT:
+      {
+        cl_device_id device = node->device;
+        cl_mem buffer = node->command.unmap.memobj;
+        pocl_cuda_submit_unmap_mem (
+            stream, node->command.unmap.mapping->host_ptr,
+            buffer->device_ptrs[device->dev_id].mem_ptr,
+            node->command.unmap.mapping->offset,
+            node->command.unmap.mapping->size);
+        break;
+      }
+    case CL_COMMAND_NDRANGE_KERNEL:
+      pocl_cuda_submit_kernel (stream, node->command.run, node->device,
+                               node->event);
+      break;
+
+    case CL_COMMAND_MARKER:
+    case CL_COMMAND_BARRIER:
+      break;
+
+    case CL_COMMAND_FILL_BUFFER:
+    case CL_COMMAND_READ_IMAGE:
+    case CL_COMMAND_WRITE_IMAGE:
+    case CL_COMMAND_COPY_IMAGE:
+    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
+    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
+    case CL_COMMAND_FILL_IMAGE:
+    case CL_COMMAND_MAP_IMAGE:
+    case CL_COMMAND_NATIVE_KERNEL:
+    case CL_COMMAND_SVM_FREE:
+    case CL_COMMAND_SVM_MAP:
+    case CL_COMMAND_SVM_UNMAP:
+    case CL_COMMAND_SVM_MEMCPY:
+    case CL_COMMAND_SVM_MEMFILL:
+    default:
+      POCL_ABORT_UNIMPLEMENTED (pocl_command_to_str (node->type));
+      break;
+    }
+
+  /* Create and record event for command end */
+  if (cq->properties & CL_QUEUE_PROFILING_ENABLE)
+    result = cuEventCreate (&event_data->end, CU_EVENT_DEFAULT);
+  else
+    result = cuEventCreate (&event_data->end, CU_EVENT_DISABLE_TIMING);
+  CUDA_CHECK (result, "cuEventCreate");
+  result = cuEventRecord (event_data->end, stream);
+  CUDA_CHECK (result, "cuEventRecord");
+
+  event_data->events_ready = 1;
+}
+
+void
+pocl_cuda_submit (_cl_command_node *node, cl_command_queue cq)
+{
+  /* Allocate CUDA event data */
+  node->event->data
+      = (pocl_cuda_event_data_t *)calloc (1, sizeof (pocl_cuda_event_data_t));
+
+  if (((pocl_cuda_queue_data_t *)cq->data)->use_threads)
+    {
+      /* Add command to work queue */
+      pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)cq->data;
+      pthread_mutex_lock (&queue_data->lock);
+      DL_APPEND (queue_data->pending_queue, node);
+      pthread_cond_signal (&queue_data->pending_cond);
+      pthread_mutex_unlock (&queue_data->lock);
+    }
+  else
+    {
+      /* Submit command in this thread */
+      cuCtxSetCurrent (((pocl_cuda_device_data_t *)cq->device->data)->context);
+      pocl_cuda_submit_node (node, cq);
+    }
+}
+
+void
+pocl_cuda_notify (cl_device_id device, cl_event event, cl_event finished)
+{
+  /* Ignore CUDA device events, we've already handled these dependencies */
+  if (finished->queue && finished->queue->device->ops == device->ops)
+    return;
+
+  if (event->status == CL_QUEUED)
+    return;
+
+  pocl_cuda_event_data_t *event_data = (pocl_cuda_event_data_t *)event->data;
+
+  assert (event_data);
+  assert (event_data->num_ext_events > 0);
+  assert (event_data->ext_event_flag);
+
+  /* If dependency failed, so should we */
+  /* TODO: This isn't true if this is an implicit dependency */
+  if (finished->status < 0)
+    event->status = -1;
+
+  /* Decrement external event counter */
+  /* Trigger flag if none left */
+  if (!--event_data->num_ext_events)
+    *event_data->ext_event_flag = 1;
+}
+
+void
+pocl_cuda_flush (cl_device_id device, cl_command_queue cq)
+{
+  /* TODO: Something here? */
+}
+
+void
+pocl_cuda_finalize_command (cl_device_id device, cl_event event)
+{
+  CUresult result;
+  pocl_cuda_event_data_t *event_data = (pocl_cuda_event_data_t *)event->data;
+
+  /* Wait for command to finish */
+  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
+  result = cuEventSynchronize (event_data->end);
+  CUDA_CHECK (result, "cuEventSynchronize");
+
+  /* Clean up mapped memory allocations */
+  if (event->command_type == CL_COMMAND_UNMAP_MEM_OBJECT)
+    {
+      cl_mem buffer = event->command->command.unmap.memobj;
+      mem_mapping_t *mapping = event->command->command.unmap.mapping;
+      if (mapping->host_ptr
+          && !(buffer->flags
+               & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)))
+        free (mapping->host_ptr);
+
+      POCL_LOCK_OBJ (buffer);
+      DL_DELETE (buffer->mappings, mapping);
+      buffer->map_count--;
+      POCL_UNLOCK_OBJ (buffer);
+    }
+
+  if (event->command_type == CL_COMMAND_NDRANGE_KERNEL
+      || event->command_type == CL_COMMAND_TASK)
+    {
+#if defined __arm__
+      /* On ARM with USE_HOST_PTR, perform explict copies back from device */
+      cl_kernel kernel = event->command.run.kernel;
+      pocl_argument *arguments = event->command.run.arguments;
+      unsigned i;
+      for (i = 0; i < kernel->num_args; i++)
+        {
+          if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
+            {
+              if (!kernel->arg_info[i].is_local && arguments[i].value)
+                {
+                  cl_mem mem = *(void **)arguments[i].value;
+                  if (mem->flags & CL_MEM_USE_HOST_PTR)
+                    {
+                      CUdeviceptr ptr
+                          = (CUdeviceptr)mem->device_ptrs[device->dev_id]
+                                .mem_ptr;
+                      cuMemcpyDtoH (mem->mem_host_ptr, ptr, mem->size);
+                      cuStreamSynchronize (0);
+                    }
+                }
+            }
+        }
+#endif
+
+      pocl_ndrange_node_cleanup (event->command);
+    }
+  else
+    {
+      pocl_mem_manager_free_command (event->command);
+    }
+
+  /* Handle failed events */
+  if (event->status < 0)
+    {
+      pocl_broadcast (event);
+      pocl_update_command_queue (event);
+      POname (clReleaseEvent) (event);
+      return;
+    }
+
+  POCL_UPDATE_EVENT_RUNNING (event);
+  POCL_UPDATE_EVENT_COMPLETE (event);
+}
+
+void
+pocl_cuda_update_event (cl_device_id device, cl_event event, cl_int status)
+{
+  switch (status)
+    {
+    case CL_QUEUED:
+      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
+        event->time_queue = device->ops->get_timer_value (device->data);
+      event->status = status;
+      break;
+    case CL_SUBMITTED:
+      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
+        event->time_submit = device->ops->get_timer_value (device->data);
+      event->status = status;
+      break;
+    case CL_RUNNING:
+      /* Wait until complete to get the timing from the CUDA events */
+      event->status = status;
+      break;
+    case CL_COMPLETE:
+      pocl_mem_objs_cleanup (event);
+
+      /* Update timing info with CUDA event timers if profiling enabled */
+      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
+        {
+          /* CUDA doesn't provide a way to get event timestamps directly,
+           * only the elapsed time between two events. We use the elapsed
+           * time from the epoch event enqueued on device creation to get
+           * the actual timestamps.
+           *
+           * Since the CUDA timer resolution is lower than the host timer,
+           * this can sometimes result in the start time being before the
+           * submit time, so we use max() to ensure the timestamps are
+           * sane. */
+
+          float diff;
+          CUresult result;
+          pocl_cuda_event_data_t *event_data
+              = (pocl_cuda_event_data_t *)event->data;
+          cl_ulong epoch = ((pocl_cuda_device_data_t *)device->data)->epoch;
+
+          result = cuEventElapsedTime (
+              &diff, ((pocl_cuda_device_data_t *)device->data)->epoch_event,
+              event_data->start);
+          CUDA_CHECK (result, "cuEventElapsedTime");
+          event->time_start = (cl_ulong) (epoch + diff * 1e6);
+          event->time_start = max (event->time_start, event->time_submit + 1);
+
+          result = cuEventElapsedTime (
+              &diff, ((pocl_cuda_device_data_t *)device->data)->epoch_event,
+              event_data->end);
+          CUDA_CHECK (result, "cuEventElapsedTime");
+          event->time_end = (cl_ulong) (epoch + diff * 1e6);
+          event->time_end = max (event->time_end, event->time_start + 1);
+        }
+
+      POCL_LOCK_OBJ (event);
+      event->status = CL_COMPLETE;
+      POCL_UNLOCK_OBJ (event);
+      device->ops->broadcast (event);
+
+      pocl_update_command_queue (event);
+
+      break;
+    default:
+      assert ("Invalid event status\n");
+      break;
+    }
+}
+
+void
+pocl_cuda_wait_event_recurse (cl_device_id device, cl_event event)
+{
+  while (event->wait_list)
+    pocl_cuda_wait_event_recurse (device, event->wait_list->event);
+
+  pocl_cuda_finalize_command (device, event);
+}
+
+void
+pocl_cuda_wait_event (cl_device_id device, cl_event event)
+{
+  if (((pocl_cuda_queue_data_t *)event->queue->data)->use_threads)
+    {
+      /* Wait until background thread marks command as complete */
+      while (event->status > CL_COMPLETE)
+        ;
+    }
+  else
+    {
+      /* Recursively finalize commands in this thread */
+      pocl_cuda_wait_event_recurse (device, event);
+    }
+}
+
+void
+pocl_cuda_free_event_data (cl_event event)
+{
+  if (event->data)
+    {
+      pocl_cuda_event_data_t *event_data
+          = (pocl_cuda_event_data_t *)event->data;
+
+      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
+        cuEventDestroy (event_data->start);
+      cuEventDestroy (event_data->end);
+      if (event_data->ext_event_flag)
+        {
+          CUresult result = cuMemFreeHost (event_data->ext_event_flag);
+          CUDA_CHECK (result, "cuMemFreeHost");
+        }
+      free (event->data);
+    }
+}
+
+void
+pocl_cuda_join (cl_device_id device, cl_command_queue cq)
+{
+  /* Grab event at end of queue */
+  POCL_LOCK_OBJ (cq);
+  cl_event event = cq->last_event.event;
+  if (!event)
+    {
+      POCL_UNLOCK_OBJ (cq);
+      return;
+    }
+  POname (clRetainEvent) (event);
+  POCL_UNLOCK_OBJ (cq);
+
+  pocl_cuda_wait_event (device, event);
+
+  POname (clReleaseEvent) (event);
+}
+
+void *
+pocl_cuda_submit_thread (void *data)
+{
+  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)data;
+
+  cl_command_queue queue = queue_data->queue;
+  if (queue)
+    cuCtxSetCurrent (
+        ((pocl_cuda_device_data_t *)queue->device->data)->context);
+  else
+    /* This queue has already been released */
+    return NULL;
+
+  while (1)
+    {
+      /* Attempt to get next command from work queue */
+      _cl_command_node *node = NULL;
+      pthread_mutex_lock (&queue_data->lock);
+      if (!queue_data->queue)
+        {
+          pthread_mutex_unlock (&queue_data->lock);
+          break;
+        }
+      if (!queue_data->pending_queue)
+        {
+          pthread_cond_wait (&queue_data->pending_cond, &queue_data->lock);
+        }
+      if (queue_data->pending_queue)
+        {
+          node = queue_data->pending_queue;
+          DL_DELETE (queue_data->pending_queue, node);
+        }
+      pthread_mutex_unlock (&queue_data->lock);
+
+      /* Submit command, if we found one */
+      if (node)
+        {
+          pocl_cuda_submit_node (node, queue_data->queue);
+
+          /* Add command to running queue */
+          pthread_mutex_lock (&queue_data->lock);
+          DL_APPEND (queue_data->running_queue, node);
+          pthread_cond_signal (&queue_data->running_cond);
+          pthread_mutex_unlock (&queue_data->lock);
+        }
+    }
+
+  return NULL;
+}
+
+void *
+pocl_cuda_finalize_thread (void *data)
+{
+  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)data;
+
+  cl_command_queue queue = queue_data->queue;
+  if (queue)
+    cuCtxSetCurrent (
+        ((pocl_cuda_device_data_t *)queue->device->data)->context);
+  else
+    /* This queue has already been released */
+    return NULL;
+
+  while (1)
+    {
+      /* Attempt to get next node from running queue */
+      _cl_command_node *node = NULL;
+      pthread_mutex_lock (&queue_data->lock);
+      if (!queue_data->queue)
+        {
+          pthread_mutex_unlock (&queue_data->lock);
+          break;
+        }
+      if (!queue_data->running_queue)
+        {
+          pthread_cond_wait (&queue_data->running_cond, &queue_data->lock);
+        }
+      if (queue_data->running_queue)
+        {
+          node = queue_data->running_queue;
+          DL_DELETE (queue_data->running_queue, node);
+        }
+      pthread_mutex_unlock (&queue_data->lock);
+
+      /* Wait for command to finish, if we found one */
+      if (node)
+        pocl_cuda_finalize_command (queue->device, node->event);
+    }
+
+  return NULL;
+}
diff --git a/lib/kernel/rsqrt.cl b/lib/CL/devices/cuda/pocl-cuda.h
similarity index 77%
copy from lib/kernel/rsqrt.cl
copy to lib/CL/devices/cuda/pocl-cuda.h
index 3c75ca1..7161539 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/CL/devices/cuda/pocl-cuda.h
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* pocl-cuda.h - declarations for CUDA driver
+
+   Copyright (c) 2016-2017 James Price / University of Bristol
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,15 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
+#ifndef POCL_CUDA_H
+#define POCL_CUDA_H
+
+#include "pocl_cl.h"
+#include "pocl_icd.h"
+#include "config.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+#include "prototypes.inc"
+GEN_PROTOTYPES (cuda)
+GEN_PROTOTYPES (basic)
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+#endif /* POCL_CUDA_H */
diff --git a/lib/CL/devices/cuda/pocl-ptx-gen.cc b/lib/CL/devices/cuda/pocl-ptx-gen.cc
new file mode 100644
index 0000000..ab8f885
--- /dev/null
+++ b/lib/CL/devices/cuda/pocl-ptx-gen.cc
@@ -0,0 +1,925 @@
+/* pocl-ptx-gen.cc - PTX code generation functions
+
+   Copyright (c) 2016-2017 James Price / University of Bristol
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "config.h"
+
+#include "LLVMUtils.h"
+#include "common.h"
+#include "pocl-ptx-gen.h"
+#include "pocl.h"
+#include "pocl_file_util.h"
+#include "pocl_runtime_config.h"
+
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Linker/Linker.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include <set>
+
+namespace llvm {
+extern ModulePass *createNVVMReflectPass(const StringMap<int> &Mapping);
+}
+
+static void addKernelAnnotations(llvm::Module *Module, const char *KernelName);
+static void fixConstantMemArgs(llvm::Module *Module, const char *KernelName);
+static void fixLocalMemArgs(llvm::Module *Module, const char *KernelName);
+static void fixPrintF(llvm::Module *Module);
+static void handleGetWorkDim(llvm::Module *Module, const char *KernelName);
+static void handleGlobalOffsets(llvm::Module *Module, const char *KernelName,
+                                bool HasOffsets);
+static void linkLibDevice(llvm::Module *Module, const char *KernelName,
+                          const char *LibDevicePath);
+static void mapLibDeviceCalls(llvm::Module *Module);
+
+int pocl_ptx_gen(const char *BitcodeFilename, const char *PTXFilename,
+                 const char *KernelName, const char *Arch,
+                 const char *LibDevicePath, int HasOffsets) {
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Buffer =
+      llvm::MemoryBuffer::getFile(BitcodeFilename);
+  if (!Buffer) {
+    POCL_MSG_ERR("[CUDA] ptx-gen: failed to open bitcode file\n");
+    return 1;
+  }
+
+  // Load the LLVM bitcode module.
+  llvm::LLVMContext Context;
+  llvm::Expected<std::unique_ptr<llvm::Module>> Module =
+      parseBitcodeFile(Buffer->get()->getMemBufferRef(), Context);
+  if (!Module) {
+    POCL_MSG_ERR("[CUDA] ptx-gen: failed to load bitcode\n");
+    return 1;
+  }
+
+  // Apply transforms to prepare for lowering to PTX.
+  fixPrintF(Module->get());
+  fixConstantMemArgs(Module->get(), KernelName);
+  fixLocalMemArgs(Module->get(), KernelName);
+  handleGetWorkDim(Module->get(), KernelName);
+  handleGlobalOffsets(Module->get(), KernelName, HasOffsets);
+  addKernelAnnotations(Module->get(), KernelName);
+  mapLibDeviceCalls(Module->get());
+  linkLibDevice(Module->get(), KernelName, LibDevicePath);
+  if (pocl_get_bool_option("POCL_CUDA_DUMP_NVVM", 0)) {
+    std::string ModuleString;
+    llvm::raw_string_ostream ModuleStringStream(ModuleString);
+    (*Module)->print(ModuleStringStream, NULL);
+    POCL_MSG_PRINT_INFO("NVVM module:\n%s\n", ModuleString.c_str());
+  }
+
+  std::string Error;
+
+  // Verify module.
+  if (pocl_get_bool_option("POCL_CUDA_VERIFY_MODULE", 0)) {
+    llvm::raw_string_ostream Errs(Error);
+    if (llvm::verifyModule(*Module->get(), &Errs)) {
+      POCL_MSG_ERR("\n%s\n", Error.c_str());
+      POCL_ABORT("[CUDA] ptx-gen: module verification failed\n");
+    }
+  }
+
+  llvm::StringRef Triple =
+      (sizeof(void *) == 8) ? "nvptx64-nvidia-cuda" : "nvptx-nvidia-cuda";
+
+  // Get NVPTX target.
+  const llvm::Target *Target =
+      llvm::TargetRegistry::lookupTarget(Triple, Error);
+  if (!Target) {
+    POCL_MSG_ERR("[CUDA] ptx-gen: failed to get target\n");
+    POCL_MSG_ERR("%s\n", Error.c_str());
+    return 1;
+  }
+
+  // TODO: Set options?
+  llvm::TargetOptions Options;
+
+  // TODO: CPU and features?
+  std::unique_ptr<llvm::TargetMachine> Machine(
+      Target->createTargetMachine(Triple, Arch, "+ptx40", Options, llvm::None));
+
+  llvm::legacy::PassManager Passes;
+
+  // Add pass to emit PTX.
+  llvm::SmallVector<char, 4096> Data;
+  llvm::raw_svector_ostream PTXStream(Data);
+  if (Machine->addPassesToEmitFile(Passes, PTXStream,
+                                   llvm::TargetMachine::CGFT_AssemblyFile)) {
+    POCL_MSG_ERR("[CUDA] ptx-gen: failed to add passes\n");
+    return 1;
+  }
+
+  // Run passes.
+  Passes.run(**Module);
+
+  std::string PTX = PTXStream.str();
+  return pocl_write_file(PTXFilename, PTX.c_str(), PTX.size(), 0, 0);
+}
+
+// Add the metadata needed to mark a function as a kernel in PTX.
+void addKernelAnnotations(llvm::Module *Module, const char *KernelName) {
+  llvm::LLVMContext &Context = Module->getContext();
+
+  // Remove existing nvvm.annotations metadata since it is sometimes corrupt.
+  auto *Annotations = Module->getNamedMetadata("nvvm.annotations");
+  if (Annotations)
+    Annotations->eraseFromParent();
+
+  // Add nvvm.annotations metadata to mark kernel entry point.
+  Annotations = Module->getOrInsertNamedMetadata("nvvm.annotations");
+
+  // Get handle to function.
+  auto *Function = Module->getFunction(KernelName);
+  if (!Function)
+    POCL_ABORT("[CUDA] ptx-gen: kernel function not found in module\n");
+
+  // Create metadata.
+  llvm::Constant *One =
+      llvm::ConstantInt::getSigned(llvm::Type::getInt32Ty(Context), 1);
+  llvm::Metadata *FuncMD = llvm::ValueAsMetadata::get(Function);
+  llvm::Metadata *NameMD = llvm::MDString::get(Context, "kernel");
+  llvm::Metadata *OneMD = llvm::ConstantAsMetadata::get(One);
+
+  llvm::MDNode *Node = llvm::MDNode::get(Context, {FuncMD, NameMD, OneMD});
+  Annotations->addOperand(Node);
+}
+
+// PTX doesn't support variadic functions, so we need to modify the IR to
+// support printf. The vprintf system call that is provided is described here:
+// http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
+// Essentially, the variadic list of arguments is replaced with a single array
+// instead.
+//
+// This function changes the prototype of __cl_printf to take an array instead
+// of a variadic argument list. It updates the function body to read from
+// this array to retrieve each argument instead of using the dummy __cl_va_arg
+// function. We then visit each __cl_printf callsite and generate the argument
+// array to pass instead of the variadic list.
+void fixPrintF(llvm::Module *Module) {
+  llvm::Function *OldPrintF = Module->getFunction("__cl_printf");
+  if (!OldPrintF)
+    return;
+
+  llvm::LLVMContext &Context = Module->getContext();
+  llvm::Type *I32 = llvm::Type::getInt32Ty(Context);
+  llvm::Type *I64 = llvm::Type::getInt64Ty(Context);
+  llvm::Type *I64Ptr = llvm::PointerType::get(I64, 0);
+  llvm::Type *FormatType = OldPrintF->getFunctionType()->getParamType(0);
+
+  // Remove calls to va_start and va_end.
+  pocl::eraseFunctionAndCallers(Module->getFunction("llvm.va_start"));
+  pocl::eraseFunctionAndCallers(Module->getFunction("llvm.va_end"));
+
+  // Create new non-variadic __cl_printf function.
+  llvm::Type *ReturnType = OldPrintF->getReturnType();
+  llvm::FunctionType *NewPrintfType =
+      llvm::FunctionType::get(ReturnType, {FormatType, I64Ptr}, false);
+  llvm::Function *NewPrintF = llvm::Function::Create(
+      NewPrintfType, OldPrintF->getLinkage(), "", Module);
+  NewPrintF->takeName(OldPrintF);
+
+  // Take function body from old function.
+  NewPrintF->getBasicBlockList().splice(NewPrintF->begin(),
+                                        OldPrintF->getBasicBlockList());
+
+  // Create i32 to hold current argument index.
+  llvm::AllocaInst *ArgIndexPtr =
+#if LLVM_OLDER_THAN_5_0
+      new llvm::AllocaInst(I32, llvm::ConstantInt::get(I32, 1));
+#else
+      new llvm::AllocaInst(I32, 0, llvm::ConstantInt::get(I32, 1));
+#endif
+  ArgIndexPtr->insertBefore(&*NewPrintF->begin()->begin());
+  llvm::StoreInst *ArgIndexInit =
+      new llvm::StoreInst(llvm::ConstantInt::get(I32, 0), ArgIndexPtr);
+  ArgIndexInit->insertAfter(ArgIndexPtr);
+
+  // Replace calls to _cl_va_arg with reads from new i64 array argument.
+  llvm::Function *VaArgFunc = Module->getFunction("__cl_va_arg");
+  if (VaArgFunc) {
+#if LLVM_OLDER_THAN_5_0
+    llvm::Argument *ArgsIn = &*++NewPrintF->arg_begin();
+#else
+    auto args = NewPrintF->arg_begin();
+    args++;
+    llvm::Argument *ArgsIn = args;
+#endif
+    std::vector<llvm::Value *> VaArgCalls(VaArgFunc->user_begin(),
+                                          VaArgFunc->user_end());
+    for (auto &U : VaArgCalls) {
+      llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(U);
+      if (!Call)
+        continue;
+
+      // Get current argument index.
+      llvm::LoadInst *ArgIndex = new llvm::LoadInst(ArgIndexPtr);
+      ArgIndex->insertBefore(Call);
+
+      // Get pointer to argument data.
+      llvm::Value *ArgOut = Call->getArgOperand(1);
+      llvm::GetElementPtrInst *ArgIn =
+          llvm::GetElementPtrInst::Create(I64, ArgsIn, {ArgIndex});
+      ArgIn->insertAfter(ArgIndex);
+
+      // Cast ArgOut pointer to i64*.
+      llvm::BitCastInst *ArgOutBC = new llvm::BitCastInst(ArgOut, I64Ptr);
+      ArgOutBC->insertAfter(ArgIn);
+      ArgOut = ArgOutBC;
+
+      // Load argument.
+      llvm::LoadInst *ArgValue = new llvm::LoadInst(ArgIn);
+      ArgValue->insertAfter(ArgIn);
+      llvm::StoreInst *ArgStore = new llvm::StoreInst(ArgValue, ArgOut);
+      ArgStore->insertAfter(ArgOutBC);
+
+      // Increment argument index.
+      llvm::BinaryOperator *Inc = llvm::BinaryOperator::Create(
+          llvm::BinaryOperator::Add, ArgIndex, llvm::ConstantInt::get(I32, 1));
+      Inc->insertAfter(ArgIndex);
+      llvm::StoreInst *StoreInc = new llvm::StoreInst(Inc, ArgIndexPtr);
+      StoreInc->insertAfter(Inc);
+
+      // Remove call to _cl_va_arg.
+      Call->eraseFromParent();
+    }
+
+    // Remove function from module.
+    VaArgFunc->eraseFromParent();
+  }
+
+  // Loop over function callers.
+  // Generate array of i64 arguments to replace variadic arguments/
+  std::vector<llvm::Value *> Callers(OldPrintF->user_begin(),
+                                     OldPrintF->user_end());
+  for (auto &U : Callers) {
+    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(U);
+    if (!Call)
+      continue;
+
+    unsigned NumArgs = Call->getNumArgOperands() - 1;
+    llvm::Value *Format = Call->getArgOperand(0);
+
+    // Allocate array for arguments.
+    // TODO: Deal with vector arguments.
+    llvm::AllocaInst *Args =
+#if LLVM_OLDER_THAN_5_0
+        new llvm::AllocaInst(I64, llvm::ConstantInt::get(I32, NumArgs));
+#else
+        new llvm::AllocaInst(I64, 0, llvm::ConstantInt::get(I32, NumArgs));
+#endif
+    Args->insertBefore(Call);
+
+    // Loop over arguments (skipping format).
+    for (unsigned A = 0; A < NumArgs; A++) {
+      llvm::Value *Arg = Call->getArgOperand(A + 1);
+      llvm::Type *ArgType = Arg->getType();
+
+      // Get pointer to argument in i64 array.
+      // TODO: promote arguments that are shorter than 32 bits.
+      llvm::Constant *ArgIndex = llvm::ConstantInt::get(I32, A);
+      llvm::Instruction *ArgPtr =
+          llvm::GetElementPtrInst::Create(I64, Args, {ArgIndex});
+      ArgPtr->insertBefore(Call);
+
+      // Cast pointer to correct type if necessary.
+      if (ArgPtr->getType()->getPointerElementType() != ArgType) {
+        llvm::BitCastInst *ArgPtrBC =
+            new llvm::BitCastInst(ArgPtr, ArgType->getPointerTo(0));
+        ArgPtrBC->insertAfter(ArgPtr);
+        ArgPtr = ArgPtrBC;
+      }
+
+      // Store argument to i64 array.
+      llvm::StoreInst *Store = new llvm::StoreInst(Arg, ArgPtr);
+      Store->insertBefore(Call);
+    }
+
+    // Fix address space of undef format values.
+    if (Format->getValueID() == llvm::Value::UndefValueVal) {
+      Format = llvm::UndefValue::get(FormatType);
+    }
+
+    // Replace call with new non-variadic function.
+    llvm::CallInst *NewCall = llvm::CallInst::Create(NewPrintF, {Format, Args});
+    NewCall->insertBefore(Call);
+    Call->replaceAllUsesWith(NewCall);
+    Call->eraseFromParent();
+  }
+
+  // Update arguments.
+  llvm::Function::arg_iterator OldArg = OldPrintF->arg_begin();
+  llvm::Function::arg_iterator NewArg = NewPrintF->arg_begin();
+  NewArg->takeName(&*OldArg);
+  OldArg->replaceAllUsesWith(&*NewArg);
+
+  // Remove old function.
+  OldPrintF->eraseFromParent();
+
+  // Get handle to vprintf function.
+  llvm::Function *VPrintF = Module->getFunction("vprintf");
+  if (!VPrintF)
+    return;
+
+  // If vprintf format address space is already generic, then we're done.
+  auto *VPrintFFormatType = VPrintF->getFunctionType()->getParamType(0);
+  if (VPrintFFormatType->getPointerAddressSpace() == 0)
+    return;
+
+  // Change address space of vprintf format argument to generic.
+  auto *I8Ptr = llvm::PointerType::get(llvm::Type::getInt8Ty(Context), 0);
+  auto *NewVPrintFType =
+      llvm::FunctionType::get(VPrintF->getReturnType(), {I8Ptr, I8Ptr}, false);
+  auto *NewVPrintF =
+      llvm::Function::Create(NewVPrintFType, VPrintF->getLinkage(), "", Module);
+  NewVPrintF->takeName(VPrintF);
+
+  // Update vprintf callers to pass format arguments in generic address space.
+  Callers.assign(VPrintF->user_begin(), VPrintF->user_end());
+  for (auto &U : Callers) {
+    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(U);
+    if (!Call)
+      continue;
+
+    llvm::Value *Format = Call->getArgOperand(0);
+    llvm::Type *FormatType = Format->getType();
+    if (FormatType->getPointerAddressSpace() != 0) {
+      // Cast address space to generic.
+      llvm::Type *NewFormatType =
+          FormatType->getPointerElementType()->getPointerTo(0);
+      llvm::AddrSpaceCastInst *FormatASC =
+          new llvm::AddrSpaceCastInst(Format, NewFormatType);
+      FormatASC->insertBefore(Call);
+      Call->setArgOperand(0, FormatASC);
+    }
+    Call->setCalledFunction(NewVPrintF);
+  }
+
+  VPrintF->eraseFromParent();
+}
+
+// Replace all load users of a scalar global variable with new value.
+static void replaceScalarGlobalVar(llvm::Module *Module, const char *Name,
+                                   llvm::Value *NewValue) {
+  auto GlobalVar = Module->getGlobalVariable(Name);
+  if (!GlobalVar)
+    return;
+
+  std::vector<llvm::Value *> Users(GlobalVar->user_begin(),
+                                   GlobalVar->user_end());
+  for (auto *U : Users) {
+    auto Load = llvm::dyn_cast<llvm::LoadInst>(U);
+    assert(Load && "Use of a scalar global variable is not a load");
+    Load->replaceAllUsesWith(NewValue);
+    Load->eraseFromParent();
+  }
+  GlobalVar->eraseFromParent();
+}
+
+// Add an extra kernel argument for the dimensionality.
+void handleGetWorkDim(llvm::Module *Module, const char *KernelName) {
+  llvm::Function *Function = Module->getFunction(KernelName);
+  if (!Function)
+    POCL_ABORT("[CUDA] ptx-gen: kernel function not found in module\n");
+
+  // Add additional argument for the work item dimensionality.
+  llvm::FunctionType *FunctionType = Function->getFunctionType();
+  std::vector<llvm::Type *> ArgumentTypes(FunctionType->param_begin(),
+                                          FunctionType->param_end());
+  ArgumentTypes.push_back(llvm::Type::getInt32Ty(Module->getContext()));
+
+  // Create new function.
+  llvm::FunctionType *NewFunctionType =
+      llvm::FunctionType::get(Function->getReturnType(), ArgumentTypes, false);
+  llvm::Function *NewFunction = llvm::Function::Create(
+      NewFunctionType, Function->getLinkage(), Function->getName(), Module);
+  NewFunction->takeName(Function);
+
+  // Map function arguments.
+  llvm::ValueToValueMapTy VV;
+  llvm::Function::arg_iterator OldArg;
+  llvm::Function::arg_iterator NewArg;
+  for (OldArg = Function->arg_begin(), NewArg = NewFunction->arg_begin();
+       OldArg != Function->arg_end(); NewArg++, OldArg++) {
+    NewArg->takeName(&*OldArg);
+    VV[&*OldArg] = &*NewArg;
+  }
+
+  // Clone function.
+  llvm::SmallVector<llvm::ReturnInst *, 1> RI;
+  llvm::CloneFunctionInto(NewFunction, Function, VV, true, RI);
+
+  Function->eraseFromParent();
+
+  auto WorkDimVar = Module->getGlobalVariable("_work_dim");
+  if (!WorkDimVar)
+    return;
+
+  // Replace uses of the global offset variables with the new arguments.
+  NewArg->setName("work_dim");
+  replaceScalarGlobalVar(Module, "_work_dim", (&*NewArg++));
+
+  // TODO: What if get_work_dim() is called from a non-kernel function?
+}
+
+// If we don't need to handle offsets, just replaces uses of the offset
+// variables with constant zero. Otherwise, add additional kernel arguments for
+// the offsets and use those instead.
+void handleGlobalOffsets(llvm::Module *Module, const char *KernelName,
+                         bool HasOffsets) {
+  if (!HasOffsets) {
+    llvm::Type *I32 = llvm::Type::getInt32Ty(Module->getContext());
+    llvm::Value *Zero = llvm::ConstantInt::getSigned(I32, 0);
+    replaceScalarGlobalVar(Module, "_global_offset_x", Zero);
+    replaceScalarGlobalVar(Module, "_global_offset_y", Zero);
+    replaceScalarGlobalVar(Module, "_global_offset_z", Zero);
+    return;
+  }
+
+  llvm::Function *Function = Module->getFunction(KernelName);
+  if (!Function)
+    POCL_ABORT("[CUDA] ptx-gen: kernel function not found in module\n");
+
+  // Add additional arguments for the global offsets.
+  llvm::FunctionType *FunctionType = Function->getFunctionType();
+  std::vector<llvm::Type *> ArgumentTypes(FunctionType->param_begin(),
+                                          FunctionType->param_end());
+  llvm::Type *I32 = llvm::Type::getInt32Ty(Module->getContext());
+  ArgumentTypes.push_back(I32);
+  ArgumentTypes.push_back(I32);
+  ArgumentTypes.push_back(I32);
+
+  // Create new function.
+  llvm::FunctionType *NewFunctionType =
+      llvm::FunctionType::get(Function->getReturnType(), ArgumentTypes, false);
+  llvm::Function *NewFunction = llvm::Function::Create(
+      NewFunctionType, Function->getLinkage(), Function->getName(), Module);
+  NewFunction->takeName(Function);
+
+  // Map function arguments.
+  llvm::ValueToValueMapTy VV;
+  llvm::Function::arg_iterator OldArg;
+  llvm::Function::arg_iterator NewArg;
+  for (OldArg = Function->arg_begin(), NewArg = NewFunction->arg_begin();
+       OldArg != Function->arg_end(); NewArg++, OldArg++) {
+    NewArg->takeName(&*OldArg);
+    VV[&*OldArg] = &*NewArg;
+  }
+
+  // Clone function.
+  llvm::SmallVector<llvm::ReturnInst *, 1> RI;
+  llvm::CloneFunctionInto(NewFunction, Function, VV, true, RI);
+
+  // Replace uses of the global offset variables with the new arguments.
+  NewArg->setName("global_offset_x");
+  replaceScalarGlobalVar(Module, "_global_offset_x", (&*NewArg++));
+  NewArg->setName("global_offset_y");
+  replaceScalarGlobalVar(Module, "_global_offset_y", (&*NewArg++));
+  NewArg->setName("global_offset_z");
+  replaceScalarGlobalVar(Module, "_global_offset_z", (&*NewArg++));
+
+  // TODO: What if the offsets are in a function that isn't the kernel?
+
+  Function->eraseFromParent();
+}
+
+int findLibDevice(char LibDevicePath[PATH_MAX], const char *Arch) {
+  // Extract numeric portion of SM version.
+  char *End;
+  unsigned long SM = strtoul(Arch + 3, &End, 10);
+  if (!SM || strlen(End)) {
+    POCL_MSG_ERR("[CUDA] invalid GPU architecture %s\n", Arch);
+    return 1;
+  }
+
+  // This mapping from SM version to libdevice library version is given here:
+  // http://docs.nvidia.com/cuda/libdevice-users-guide/basic-usage.html#version-selection
+  // This is no longer needed as of CUDA 9.
+  int LibDeviceSM = 0;
+  if (SM < 30)
+    LibDeviceSM = 20;
+  else if (SM == 30)
+    LibDeviceSM = 30;
+  else if (SM < 35)
+    LibDeviceSM = 20;
+  else if (SM <= 37)
+    LibDeviceSM = 35;
+  else if (SM < 50)
+    LibDeviceSM = 30;
+  else if (SM <= 53)
+    LibDeviceSM = 50;
+  else
+    LibDeviceSM = 30;
+
+  const char *BasePath[] = {
+    pocl_get_string_option("POCL_CUDA_TOOLKIT_PATH", CUDA_TOOLKIT_ROOT_DIR),
+    "/usr/local/lib/cuda",
+    "/usr/local/lib",
+    "/usr/lib",
+  };
+
+  static const char *NVVMPath[] = {
+    "/nvvm",
+    "/nvidia-cuda-toolkit",
+    "",
+  };
+
+  static const char *PathFormat = "%s%s/libdevice/libdevice.10.bc";
+  static const char *OldPathFormat =
+      "%s%s/libdevice/libdevice.compute_%d.10.bc";
+
+  // Search combinations of paths for the libdevice library.
+  for (auto bp : BasePath) {
+    for (auto np : NVVMPath) {
+      // Check for CUDA 9+ libdevice library.
+      size_t ps = snprintf(LibDevicePath, PATH_MAX - 1, PathFormat, bp, np);
+      LibDevicePath[ps] = '\0';
+      POCL_MSG_PRINT2(CUDA, __FUNCTION__, __LINE__,
+                      "looking for libdevice at '%s'\n", LibDevicePath);
+      if (pocl_exists(LibDevicePath)) {
+        POCL_MSG_PRINT2(CUDA, __FUNCTION__, __LINE__,
+                        "found libdevice at '%s'\n", LibDevicePath);
+        return 0;
+      }
+
+      // Check for pre CUDA 9 libdevice library.
+      ps = snprintf(LibDevicePath, PATH_MAX - 1, OldPathFormat, bp, np,
+                    LibDeviceSM);
+      LibDevicePath[ps] = '\0';
+      POCL_MSG_PRINT2(CUDA, __FUNCTION__, __LINE__,
+                      "looking for libdevice at '%s'\n", LibDevicePath);
+      if (pocl_exists(LibDevicePath)) {
+        POCL_MSG_PRINT2(CUDA, __FUNCTION__, __LINE__,
+                        "found libdevice at '%s'\n", LibDevicePath);
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
+// Link CUDA's libdevice bitcode library to provide implementations for most of
+// the OpenCL math functions.
+// TODO: Can we link libdevice into the kernel library at pocl build time?
+// This would remove this runtime dependency on the CUDA toolkit.
+// Had some issues with the earlier pocl LLVM passes crashing on the libdevice
+// code - needs more investigation.
+void linkLibDevice(llvm::Module *Module, const char *KernelName,
+                   const char *LibDevicePath) {
+  auto Buffer = llvm::MemoryBuffer::getFile(LibDevicePath);
+  if (!Buffer)
+    POCL_ABORT("[CUDA] failed to open libdevice library file\n");
+
+  POCL_MSG_PRINT_INFO("loading libdevice from '%s'\n", LibDevicePath);
+
+  // Load libdevice bitcode library.
+  llvm::Expected<std::unique_ptr<llvm::Module>> LibDeviceModule =
+      parseBitcodeFile(Buffer->get()->getMemBufferRef(), Module->getContext());
+  if (!LibDeviceModule)
+    POCL_ABORT("[CUDA] failed to load libdevice bitcode\n");
+
+  // Fix triple and data-layout of libdevice module.
+  (*LibDeviceModule)->setTargetTriple(Module->getTargetTriple());
+  (*LibDeviceModule)->setDataLayout(Module->getDataLayout());
+
+  // Link libdevice into module.
+  llvm::Linker Linker(*Module);
+  if (Linker.linkInModule(std::move(LibDeviceModule.get()))) {
+    POCL_ABORT("[CUDA] failed to link to libdevice");
+  }
+
+  llvm::legacy::PassManager Passes;
+
+  // Run internalize to mark all non-kernel functions as internal.
+  auto PreserveKernel = [=](const llvm::GlobalValue &GV) {
+    return GV.getName() == KernelName;
+  };
+  Passes.add(llvm::createInternalizePass(PreserveKernel));
+
+  // Add NVVM reflect module flags to set math options.
+  // TODO: Determine correct FTZ value from frontend compiler options.
+  llvm::LLVMContext &Context = Module->getContext();
+  llvm::Type *I32 = llvm::Type::getInt32Ty(Context);
+  llvm::Metadata *FourMD =
+      llvm::ValueAsMetadata::get(llvm::ConstantInt::getSigned(I32, 4));
+  llvm::Metadata *NameMD = llvm::MDString::get(Context, "nvvm-reflect-ftz");
+  llvm::Metadata *OneMD =
+      llvm::ConstantAsMetadata::get(llvm::ConstantInt::getSigned(I32, 1));
+  llvm::MDNode *ReflectFlag =
+      llvm::MDNode::get(Context, {FourMD, NameMD, OneMD});
+  Module->addModuleFlag(ReflectFlag);
+
+  // Run optimization passes to clean up unused functions etc.
+  llvm::PassManagerBuilder Builder;
+  Builder.OptLevel = 3;
+  Builder.SizeLevel = 0;
+  Builder.populateModulePassManager(Passes);
+
+  Passes.run(*Module);
+}
+
+// This transformation replaces each pointer argument in the specific address
+// space with an integer offset, and then inserts the necessary GEP+BitCast
+// instructions to calculate the new pointers from the provided base global
+// variable.
+void convertPtrArgsToOffsets(llvm::Module *Module, const char *KernelName,
+                             unsigned AddrSpace, llvm::GlobalVariable *Base) {
+
+  llvm::LLVMContext &Context = Module->getContext();
+
+  llvm::Function *Function = Module->getFunction(KernelName);
+  if (!Function)
+    POCL_ABORT("[CUDA] ptx-gen: kernel function not found in module\n");
+
+  // Argument info for creating new function.
+  std::vector<llvm::Argument *> Arguments;
+  std::vector<llvm::Type *> ArgumentTypes;
+
+  llvm::ValueToValueMapTy VV;
+  std::vector<std::pair<llvm::Instruction *, llvm::Instruction *>> ToInsert;
+
+  // Loop over arguments.
+  bool NeedsArgOffsets = false;
+  for (auto &Arg : Function->args()) {
+    // Check for local memory pointer.
+    llvm::Type *ArgType = Arg.getType();
+    if (ArgType->isPointerTy() &&
+        ArgType->getPointerAddressSpace() == AddrSpace) {
+      NeedsArgOffsets = true;
+
+      // Create new argument for offset into shared memory allocation.
+      llvm::Type *I32ty = llvm::Type::getInt32Ty(Context);
+      llvm::Argument *Offset =
+          new llvm::Argument(I32ty, Arg.getName() + "_offset");
+      Arguments.push_back(Offset);
+      ArgumentTypes.push_back(I32ty);
+
+      // Insert GEP to add offset.
+      llvm::Value *Zero = llvm::ConstantInt::getSigned(I32ty, 0);
+      llvm::GetElementPtrInst *GEP =
+          llvm::GetElementPtrInst::Create(nullptr, Base, {Zero, Offset});
+
+      // Cast pointer to correct type.
+      llvm::BitCastInst *Cast = new llvm::BitCastInst(GEP, ArgType);
+
+      // Save these instructions to insert into new function later.
+      ToInsert.push_back({GEP, Cast});
+
+      // Map the old local memory argument to the result of this cast.
+      VV[&Arg] = Cast;
+    } else {
+      // No change to other arguments.
+      Arguments.push_back(&Arg);
+      ArgumentTypes.push_back(ArgType);
+    }
+  }
+
+  if (!NeedsArgOffsets)
+    return;
+
+  // Create new function with offsets instead of local memory pointers.
+  llvm::FunctionType *NewFunctionType =
+      llvm::FunctionType::get(Function->getReturnType(), ArgumentTypes, false);
+  llvm::Function *NewFunction = llvm::Function::Create(
+      NewFunctionType, Function->getLinkage(), Function->getName(), Module);
+  NewFunction->takeName(Function);
+
+  // Map function arguments.
+  std::vector<llvm::Argument *>::iterator OldArg;
+  llvm::Function::arg_iterator NewArg;
+  for (OldArg = Arguments.begin(), NewArg = NewFunction->arg_begin();
+       NewArg != NewFunction->arg_end(); NewArg++, OldArg++) {
+    NewArg->takeName(*OldArg);
+    if ((*OldArg)->getParent())
+      VV[*OldArg] = &*NewArg;
+    else {
+      // Manually replace new offset arguments.
+      (*OldArg)->replaceAllUsesWith(&*NewArg);
+      delete *OldArg;
+    }
+  }
+
+  // Clone function.
+  llvm::SmallVector<llvm::ReturnInst *, 1> RI;
+  llvm::CloneFunctionInto(NewFunction, Function, VV, true, RI);
+
+  // Insert offset instructions into new function.
+  for (auto Pair : ToInsert) {
+    Pair.first->insertBefore(&*NewFunction->begin()->begin());
+    Pair.second->insertAfter(Pair.first);
+  }
+
+  Function->eraseFromParent();
+}
+
+// CUDA doesn't allow constant pointer arguments, so we have to convert them to
+// offsets and manually add them to a global variable base pointer.
+void fixConstantMemArgs(llvm::Module *Module, const char *KernelName) {
+
+  // Calculate total size of automatic constant allocations.
+  size_t TotalAutoConstantSize = 0;
+  for (auto &GlobalVar : Module->globals()) {
+    if (GlobalVar.getType()->getPointerAddressSpace() == 4)
+      TotalAutoConstantSize += Module->getDataLayout().getTypeAllocSize(
+          GlobalVar.getInitializer()->getType());
+  }
+
+  // Create global variable for constant memory allocations.
+  // TODO: Does allocating the maximum amount have a penalty?
+  llvm::Type *ByteArrayType =
+      llvm::ArrayType::get(llvm::Type::getInt8Ty(Module->getContext()),
+                           65536 - TotalAutoConstantSize);
+  llvm::GlobalVariable *ConstantMemBase = new llvm::GlobalVariable(
+      *Module, ByteArrayType, false, llvm::GlobalValue::InternalLinkage,
+      llvm::Constant::getNullValue(ByteArrayType), "_constant_memory_region_",
+      NULL, llvm::GlobalValue::NotThreadLocal, 4, false);
+
+  convertPtrArgsToOffsets(Module, KernelName, 4, ConstantMemBase);
+}
+
+// CUDA doesn't allow multiple local memory arguments or automatic variables, so
+// we have to create a single global variable for local memory allocations, and
+// then manually add offsets to it to get each individual local memory
+// allocation.
+void fixLocalMemArgs(llvm::Module *Module, const char *KernelName) {
+
+  // Create global variable for local memory allocations.
+  llvm::Type *ByteArrayType =
+      llvm::ArrayType::get(llvm::Type::getInt8Ty(Module->getContext()), 0);
+  llvm::GlobalVariable *SharedMemBase = new llvm::GlobalVariable(
+      *Module, ByteArrayType, false, llvm::GlobalValue::ExternalLinkage, NULL,
+      "_shared_memory_region_", NULL, llvm::GlobalValue::NotThreadLocal, 3,
+      false);
+
+  convertPtrArgsToOffsets(Module, KernelName, 3, SharedMemBase);
+}
+
+// Map kernel math functions onto the corresponding CUDA libdevice functions.
+void mapLibDeviceCalls(llvm::Module *Module) {
+  struct FunctionMapEntry {
+    const char *OCLFunctionName;
+    const char *LibDeviceFunctionName;
+  };
+  struct FunctionMapEntry FunctionMap[] = {
+
+// clang-format off
+#define LDMAP(name) \
+  {name "f", "__nv_" name "f"}, \
+  {name,     "__nv_" name},
+
+    LDMAP("acos")
+    LDMAP("acosh")
+    LDMAP("asin")
+    LDMAP("asinh")
+    LDMAP("atan")
+    LDMAP("atanh")
+    LDMAP("atan2")
+    LDMAP("cbrt")
+    LDMAP("ceil")
+    LDMAP("copysign")
+    LDMAP("cos")
+    LDMAP("cosh")
+    LDMAP("exp")
+    LDMAP("exp2")
+    LDMAP("expm1")
+    LDMAP("fdim")
+    LDMAP("floor")
+    LDMAP("fmax")
+    LDMAP("fmin")
+    LDMAP("hypot")
+    LDMAP("ilogb")
+    LDMAP("lgamma")
+    LDMAP("log")
+    LDMAP("log2")
+    LDMAP("log10")
+    LDMAP("log1p")
+    LDMAP("logb")
+    LDMAP("nextafter")
+    LDMAP("remainder")
+    LDMAP("rint")
+    LDMAP("round")
+    LDMAP("sin")
+    LDMAP("sinh")
+    LDMAP("sqrt")
+    LDMAP("tan")
+    LDMAP("tanh")
+    LDMAP("trunc")
+#undef LDMAP
+
+    {"llvm.copysign.f32", "__nv_copysignf"},
+    {"llvm.copysign.f64", "__nv_copysign"},
+
+    {"llvm.pow.f32", "__nv_powf"},
+    {"llvm.pow.f64", "__nv_pow"},
+
+    // TODO: frexp
+    // TODO: ldexp
+    // TODO: lgamma_r
+    // TODO: modf
+    // TODO: pown
+    // TODO: remquo
+    // TODO: rootn
+  };
+  // clang-format on
+
+  for (auto &Entry : FunctionMap) {
+    llvm::Function *Function = Module->getFunction(Entry.OCLFunctionName);
+    if (!Function)
+      continue;
+
+    std::vector<llvm::Value *> Users(Function->user_begin(),
+                                     Function->user_end());
+    for (auto &U : Users) {
+      // Look for calls to function.
+      llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(U);
+      if (Call) {
+        // Create function declaration for libdevice version.
+        llvm::FunctionType *FunctionType = Function->getFunctionType();
+        llvm::Constant *LibDeviceFunction = Module->getOrInsertFunction(
+            Entry.LibDeviceFunctionName, FunctionType);
+
+        // Replace function with libdevice version.
+        std::vector<llvm::Value *> Args(Call->arg_begin(), Call->arg_end());
+        llvm::CallInst *NewCall =
+            llvm::CallInst::Create(LibDeviceFunction, Args, "", Call);
+        NewCall->takeName(Call);
+        Call->replaceAllUsesWith(NewCall);
+        Call->eraseFromParent();
+      }
+    }
+
+    Function->eraseFromParent();
+  }
+}
+
+int pocl_cuda_get_ptr_arg_alignment(const char *BitcodeFilename,
+                                    const char *KernelName,
+                                    size_t *Alignments) {
+  // Create buffer for bitcode file.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Buffer =
+      llvm::MemoryBuffer::getFile(BitcodeFilename);
+  if (!Buffer) {
+    POCL_MSG_ERR("[CUDA] ptx-gen: failed to open bitcode file\n");
+    return 1;
+  }
+
+  // Load the LLVM bitcode module.
+  llvm::LLVMContext Context;
+  llvm::Expected<std::unique_ptr<llvm::Module>> Module =
+      parseBitcodeFile(Buffer->get()->getMemBufferRef(), Context);
+  if (!Module) {
+    POCL_MSG_ERR("[CUDA] ptx-gen: failed to load bitcode\n");
+    return 1;
+  }
+
+  // Get kernel function.
+  llvm::Function *Kernel = (*Module)->getFunction(KernelName);
+  if (!Kernel)
+    POCL_ABORT("[CUDA] kernel function not found in module\n");
+
+  // Calculate alignment for each argument.
+  const llvm::DataLayout &DL = (*Module)->getDataLayout();
+  for (auto &Arg : Kernel->args()) {
+    unsigned i = Arg.getArgNo();
+    llvm::Type *Type = Arg.getType();
+    if (!Type->isPointerTy())
+      Alignments[i] = 0;
+    else {
+      llvm::Type *ElemType = Type->getPointerElementType();
+      Alignments[i] = DL.getTypeAllocSize(ElemType);
+    }
+  }
+
+  return 0;
+}
diff --git a/lib/CL/devices/cuda/pocl-ptx-gen.h b/lib/CL/devices/cuda/pocl-ptx-gen.h
new file mode 100644
index 0000000..30a4d3f
--- /dev/null
+++ b/lib/CL/devices/cuda/pocl-ptx-gen.h
@@ -0,0 +1,62 @@
+/* pocl-ptx-gen.h - declarations for PTX code generator
+
+   Copyright (c) 2016-2017 James Price / University of Bristol
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifndef POCL_PTX_GEN_H
+#define POCL_PTX_GEN_H
+
+#include "config.h"
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+/* Search for the libdevice bitcode library for the given GPU architecture. */
+/* Returns zero on success, non-zero on failure. */
+int findLibDevice(char LibDevicePath[PATH_MAX], const char *Arch);
+
+/* Generate a PTX file from an LLVM bitcode file. */
+/* Returns zero on success, non-zero on failure. */
+int pocl_ptx_gen(const char *BitcodeFilename,
+                 const char *PTXFilename,
+                 const char *KernelName,
+                 const char *Arch,
+                 const char *LibDevicePath,
+                 int HasOffsets);
+
+/* Populate the Alignments array with the required pointer alignments for */
+/* each kernel argument. */
+/* Returns zero on success, non-zero on failure. */
+int pocl_cuda_get_ptr_arg_alignment(const char *BitcodeFilename,
+                                    const char *KernelName,
+                                    size_t *Alignments);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* POCL_PTX_GEN_H */
diff --git a/lib/CL/devices/devices.c b/lib/CL/devices/devices.c
index f9449f9..6eca594 100644
--- a/lib/CL/devices/devices.c
+++ b/lib/CL/devices/devices.c
@@ -22,9 +22,20 @@
    THE SOFTWARE.
 */
 
+#define _GNU_SOURCE
+
 #include <string.h>
 #include <ctype.h>
 
+#ifdef __linux__
+#include <limits.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <ucontext.h>
+#endif
+
 #ifndef _MSC_VER
 #  include <unistd.h>
 #else
@@ -39,7 +50,10 @@
 #include "pocl_debug.h"
 #include "pocl_tracing.h"
 #include "pocl_cache.h"
-#include "pocl_queue_util.h"
+
+#ifdef OCS_AVAILABLE
+#include "pocl_llvm.h"
+#endif
 
 #if defined(TCE_AVAILABLE)
 #include "tce/ttasim/ttasim.h"
@@ -47,6 +61,10 @@
 
 #include "hsa/pocl-hsa.h"
 
+#if defined(BUILD_CUDA)
+#include "cuda/pocl-cuda.h"
+#endif
+
 #define MAX_DEV_NAME_LEN 64
 
 /* the enabled devices */
@@ -67,6 +85,9 @@ static init_device_ops pocl_devices_init_ops[] = {
 #if defined(BUILD_HSA)
   pocl_hsa_init_device_ops,
 #endif
+#if defined(BUILD_CUDA)
+  pocl_cuda_init_device_ops,
+#endif
 };
 
 #define POCL_NUM_DEVICE_TYPES (sizeof(pocl_devices_init_ops) / sizeof((pocl_devices_init_ops)[0]))
@@ -106,8 +127,17 @@ pocl_get_devices(cl_device_type device_type, struct _cl_device_id **devices, uns
 
   for (i = 0; i < pocl_num_devices; ++i)
     {
-      if ((pocl_devices[i].type & device_type) &&
-          (offline_compile || (pocl_devices[i].available == CL_TRUE)))
+      if (!offline_compile && (pocl_devices[i].available != CL_TRUE))
+        continue;
+
+      if (device_type == CL_DEVICE_TYPE_DEFAULT)
+        {
+          devices[dev_added] = &pocl_devices[i];
+          ++dev_added;
+          break;
+        }
+
+      if (pocl_devices[i].type & device_type)
         {
             if (dev_added < num_devices)
               {
@@ -133,12 +163,18 @@ pocl_get_device_type_count(cl_device_type device_type)
 
   for (i = 0; i < pocl_num_devices; ++i)
     {
-      if ((pocl_devices[i].type & device_type) &&
-          (offline_compile || (pocl_devices[i].available == CL_TRUE)))
+      if (!offline_compile && (pocl_devices[i].available != CL_TRUE))
+        continue;
+
+      if (device_type == CL_DEVICE_TYPE_DEFAULT)
+        return 1;
+
+      if (pocl_devices[i].type & device_type)
         {
            ++count;
         }
     }
+
   return count;
 }
 
@@ -180,7 +216,118 @@ pocl_string_to_dirname(char *str)
     }
 }
 
-void 
+/* This ugly hack is required because:
+ *
+ * OpenCL 1.2 specification, 6.3 Operators :
+ *
+ * A divide by zero with integer types does not cause an exception
+ * but will result in an unspecified value. Division by zero for
+ * floating-point types will result in ±infinity or NaN as
+ * prescribed by the IEEE-754 standard.
+ *
+ * FPU exceptions are masked by default on x86 linux, but integer divide
+ * is not and there doesn't seem any sane way to mask it.
+ *
+ * This *might* be possible to fix with a LLVM pass (either check divisor
+ * for 0, or perhaps some vector extension has a suitable instruction), but
+ * it's likely to ruin the performance.
+ */
+
+#ifdef __linux__
+#ifdef __x86_64__
+
+#define DIV_OPCODE_SIZE 1
+#define DIV_OPCODE_MASK 0xf6
+
+/* F6 /6, F6 /7, F7 /6, F7 /7 */
+#define DIV_OPCODE_1 0xf6
+#define DIV_OPCODE_2 0xf7
+#define DIV_MODRM_OPCODE_EXT_1 0x38 //  /7
+#define DIV_MODRM_OPCODE_EXT_2 0x30 //  /6
+
+#define MODRM_SIZE 1
+#define MODRM_MASK 0xC0
+#define REG2_MASK 0x38
+#define REG1_MASK 0x07
+#define ADDR_MODE_INDIRECT_ONE_BYTE_OFFSET 0x40
+#define ADDR_MODE_INDIRECT_FOUR_BYTE_OFFSET 0x80
+#define ADDR_MODE_INDIRECT 0x0
+#define ADDR_MODE_REGISTER_ONLY 0xC0
+#define REG_SP 0x4
+#define REG_BP 0x5
+#define SIB_BYTE 1
+#define IP_RELATIVE_INDEXING 4
+
+static struct sigaction sigfpe_action, old_sigfpe_action;
+
+static void
+sigfpe_signal_handler (int signo, siginfo_t *si, void *data)
+{
+  ucontext_t *uc;
+  uc = (ucontext_t *)data;
+  unsigned char *eip = (unsigned char *)(uc->uc_mcontext.gregs[REG_RIP]);
+
+  if ((signo == SIGFPE)
+      && ((si->si_code == FPE_INTDIV) || (si->si_code == FPE_INTOVF)))
+    {
+      /* Luckily for us, div-by-0 exceptions do NOT advance the IP register,
+       * so we have to disassemble the instruction (to know its length)
+       * and move IP past it. */
+      unsigned n = 0;
+
+      /* skip all prefixes */
+      while ((n < 4) && ((eip[n] & DIV_OPCODE_MASK) != DIV_OPCODE_MASK))
+        ++n;
+
+      /* too much prefixes = decoding failed */
+      if (n >= 4)
+        goto ORIGINAL_HANDLER;
+
+      /* check opcode */
+      unsigned opcode = eip[n];
+      if ((opcode != DIV_OPCODE_1) && (opcode != DIV_OPCODE_2))
+        goto ORIGINAL_HANDLER;
+      n += DIV_OPCODE_SIZE;
+
+      unsigned modrm = eip[n];
+      unsigned modmask = modrm & MODRM_MASK;
+      unsigned reg1mask = modrm & REG1_MASK;
+      unsigned reg2mask = modrm & REG2_MASK;
+      /* check opcode extension in ModR/M reg2 */
+      if ((reg2mask != DIV_MODRM_OPCODE_EXT_1)
+          && (reg2mask != DIV_MODRM_OPCODE_EXT_2))
+        goto ORIGINAL_HANDLER;
+      n += MODRM_SIZE;
+
+      /* handle immediates/registers */
+      if (modmask == ADDR_MODE_INDIRECT_ONE_BYTE_OFFSET)
+        n += 1;
+      if (modmask == ADDR_MODE_INDIRECT_FOUR_BYTE_OFFSET)
+        n += 4;
+      if (modmask == ADDR_MODE_INDIRECT)
+        n += 0;
+      if (modmask != ADDR_MODE_REGISTER_ONLY)
+        {
+          if (reg1mask == REG_SP)
+            n += SIB_BYTE;
+          if (reg1mask == REG_BP)
+            n += IP_RELATIVE_INDEXING;
+        }
+
+      uc->uc_mcontext.gregs[REG_RIP] += n;
+      return;
+    }
+  else
+    {
+    ORIGINAL_HANDLER:
+      (*old_sigfpe_action.sa_sigaction) (signo, si, data);
+    }
+}
+
+#endif
+#endif
+
+cl_int
 pocl_init_devices()
 {
   static unsigned int init_done = 0;
@@ -199,30 +346,64 @@ pocl_init_devices()
      infinite loop. */
 
   if (init_in_progress)
-      return;
+      return CL_SUCCESS; /* debatable, but what else can we do ? */
   init_in_progress = 1;
 
   if (init_done == 0)
     POCL_INIT_LOCK(pocl_init_lock);
   POCL_LOCK(pocl_init_lock);
-  if (init_done) 
+  if (init_done)
     {
       POCL_UNLOCK(pocl_init_lock);
-      return;
+      return pocl_num_devices ? CL_SUCCESS : CL_DEVICE_NOT_FOUND;
     }
 
   /* Set a global debug flag, so we don't have to call pocl_get_bool_option
    * everytime we use the debug macros */
 #ifdef POCL_DEBUG_MESSAGES
-  pocl_debug_messages = pocl_get_bool_option("POCL_DEBUG", 0);
+  const char* debug = pocl_get_string_option ("POCL_DEBUG", "0");
+  pocl_debug_messages_setup (debug);
   stderr_is_a_tty = isatty(fileno(stderr));
 #endif
 
-  pocl_aborting = 0;
+  if (pocl_cache_init_topdir ())
+    {
+      init_done = 1;
+      pocl_num_devices = 0;
+      POCL_UNLOCK (pocl_init_lock);
+      return CL_DEVICE_NOT_FOUND;
+    }
+  pocl_event_tracing_init ();
+
+#ifdef OCS_AVAILABLE
+  /* This is required to force LLVM to register its signal
+   * handlers, before pocl registers its own SIGFPE handler.
+   * LLVM otherwise calls this via
+   *    pocl_llvm_build_program ->
+   *    clang::PrintPreprocessedAction ->
+   *    CreateOutputFile -> RemoveFileOnSignal
+   * Registering our handlers before LLVM creates its sigaltstack
+   * leads to interesting crashes & bugs later.
+   */
+  char random_empty_file[POCL_FILENAME_LENGTH];
+  pocl_cache_tempname (random_empty_file, NULL, NULL);
+  pocl_llvm_remove_file_on_signal (random_empty_file);
+#endif
+
+#ifdef __linux__
+#ifdef __x86_64__
+
+  if (pocl_get_bool_option ("POCL_SIGFPE_HANDLER", 1))
+    {
+      POCL_MSG_PRINT_GENERAL ("Installing SIGFPE handler...\n");
+      sigfpe_action.sa_flags = SA_RESTART | SA_SIGINFO;
+      sigfpe_action.sa_sigaction = sigfpe_signal_handler;
+      int res = sigaction (SIGFPE, &sigfpe_action, &old_sigfpe_action);
+      assert (res == 0);
+    }
 
-  pocl_cache_init_topdir();
-  pocl_event_tracing_init();
-  pocl_init_queue_list();
+#endif
+#endif
 
   /* Init operations */
   for (i = 0; i < POCL_NUM_DEVICE_TYPES; ++i)
@@ -230,16 +411,26 @@ pocl_init_devices()
       pocl_devices_init_ops[i](&pocl_device_ops[i]);
       assert(pocl_device_ops[i].device_name != NULL);
 
-      /* Probe and add the result to the number of probbed devices */
+      /* Probe and add the result to the number of probed devices */
       assert(pocl_device_ops[i].probe);
       device_count[i] = pocl_device_ops[i].probe(&pocl_device_ops[i]);
       pocl_num_devices += device_count[i];
     }
 
-  assert(pocl_num_devices > 0);
+  if (pocl_num_devices == 0)
+    {
+      const char *dev_env = getenv (POCL_DEVICES_ENV);
+      if (dev_env)
+        POCL_MSG_WARN ("no devices found. %s=%s\n", POCL_DEVICES_ENV, dev_env);
+      return CL_DEVICE_NOT_FOUND;
+    }
+
   pocl_devices = (struct _cl_device_id*) calloc(pocl_num_devices, sizeof(struct _cl_device_id));
   if (pocl_devices == NULL)
-    POCL_ABORT("Can not allocate memory for devices\n");
+    {
+      POCL_MSG_ERR ("Can not allocate memory for devices\n");
+      return CL_OUT_OF_HOST_MEMORY;
+    }
 
   dev_index = 0;
   /* Init infos for each probed devices */
@@ -248,6 +439,7 @@ pocl_init_devices()
       assert(pocl_device_ops[i].init);
       for (j = 0; j < device_count[i]; ++j)
         {
+          cl_int ret = CL_SUCCESS;
           pocl_devices[dev_index].ops = &pocl_device_ops[i];
           pocl_devices[dev_index].dev_id = dev_index;
           /* The default value for the global memory space identifier is
@@ -255,8 +447,8 @@ pocl_init_devices()
              it to point to some other device's global memory id in case of
              a shared global memory. */
           pocl_devices[dev_index].global_mem_id = dev_index;
-          
-          pocl_device_ops[i].init_device_infos(&pocl_devices[dev_index]);
+
+          pocl_device_ops[i].init_device_infos(j, &pocl_devices[dev_index]);
 
           pocl_device_common_init(&pocl_devices[dev_index]);
 
@@ -264,22 +456,31 @@ pocl_init_devices()
           /* Check if there are device-specific parameters set in the
              POCL_DEVICEn_PARAMETERS env. */
           if (snprintf (env_name, 1024, "POCL_%s%d_PARAMETERS", dev_name, j) < 0)
-            POCL_ABORT("Unable to generate the env string.");
-          pocl_devices[dev_index].dev_id = dev_index;
-          pocl_devices[dev_index].ops->init(&pocl_devices[dev_index], getenv(env_name));
-
-          if (dev_index == 0)
-            pocl_devices[dev_index].type |= CL_DEVICE_TYPE_DEFAULT;
+            {
+              POCL_MSG_ERR("Unable to generate the env string.");
+              return CL_OUT_OF_HOST_MEMORY;
+            }
+          ret = pocl_devices[dev_index].ops->init (j, &pocl_devices[dev_index], getenv(env_name));
+          switch (ret)
+          {
+          case CL_OUT_OF_HOST_MEMORY:
+            return ret;
+          case CL_SUCCESS:
+            break;
+          default:
+            pocl_devices[dev_index].available = 0;
+          }
 
           pocl_devices[dev_index].cache_dir_name = strdup(pocl_devices[dev_index].long_name);
           pocl_string_to_dirname(pocl_devices[dev_index].cache_dir_name);
-          
+
           ++dev_index;
         }
     }
 
   init_done = 1;
   POCL_UNLOCK(pocl_init_lock);
+  return CL_SUCCESS;
 }
 
 int pocl_get_unique_global_mem_id ()
diff --git a/lib/CL/devices/devices.h b/lib/CL/devices/devices.h
index 5baa7f0..7574fd2 100644
--- a/lib/CL/devices/devices.h
+++ b/lib/CL/devices/devices.h
@@ -45,7 +45,7 @@ extern unsigned int pocl_num_devices;
  * The devices are shared across contexts, thus must implement resource
  * management internally also across multiple contexts.
  */
-void pocl_init_devices();
+cl_int pocl_init_devices();
 
 /**
  * \brief Get the count of devices for a specific type
diff --git a/lib/CL/devices/hsa/pocl-hsa.c b/lib/CL/devices/hsa/pocl-hsa.c
index 32a9000..6eeb8e5 100644
--- a/lib/CL/devices/hsa/pocl-hsa.c
+++ b/lib/CL/devices/hsa/pocl-hsa.c
@@ -184,8 +184,13 @@ typedef struct pocl_hsa_device_data_s {
 
   /* if agent supports async handlers*/
   int have_wait_any;
+
+  /* compilation lock */
+  pocl_lock_t pocl_hsa_compilation_lock;
 } pocl_hsa_device_data_t;
 
+
+
 void
 pocl_hsa_init_device_ops(struct pocl_device_ops *ops)
 {
@@ -239,7 +244,7 @@ pocl_hsa_abort_on_hsa_error(hsa_status_t status,
   if (status != HSA_STATUS_SUCCESS)
     {
       hsa_status_string(status, &str);
-      POCL_MSG_PRINT2(func, line, "Error from HSA Runtime call:\n");
+      POCL_MSG_PRINT2(HSA, func, line, "Error from HSA Runtime call:\n");
       POCL_ABORT("%s", str);
     }
 }
@@ -258,7 +263,7 @@ pocl_hsa_abort_on_pthread_error(int status,
 {
   if (status != 0)
     {
-      POCL_MSG_PRINT2(func, line, "Error from pthread call:\n");
+      POCL_MSG_PRINT2(HSA, func, line, "Error from pthread call:\n");
       POCL_ABORT("%s", strerror(status));
     }
 }
@@ -269,8 +274,7 @@ pocl_hsa_abort_on_pthread_error(int status,
                                                             #code);
 
 static hsa_agent_t hsa_agents[MAX_HSA_AGENTS];
-static intptr_t last_assigned_agent = 0;
-static int found_hsa_agents = 0;
+static unsigned found_hsa_agents = 0;
 
 static hsa_status_t
 pocl_hsa_get_agents_callback(hsa_agent_t agent, void *data)
@@ -440,9 +444,9 @@ get_hsa_device_features(char* dev_name, struct _cl_device_id* dev)
 }
 
 void
-pocl_hsa_init_device_infos(struct _cl_device_id* dev)
+pocl_hsa_init_device_infos(unsigned j, struct _cl_device_id* dev)
 {
-  pocl_basic_init_device_infos (dev);
+  pocl_basic_init_device_infos (j, dev);
 
   SETUP_DEVICE_CL_VERSION(HSA_DEVICE_CL_VERSION_MAJOR,
                           HSA_DEVICE_CL_VERSION_MINOR)
@@ -454,10 +458,10 @@ pocl_hsa_init_device_infos(struct _cl_device_id* dev)
   dev->local_as_id = 3;
   dev->constant_as_id = 2;
 
-  assert(found_hsa_agents > 0);
-  assert(last_assigned_agent < found_hsa_agents);
-  dev->data = (void*)last_assigned_agent;
-  hsa_agent_t agent = hsa_agents[last_assigned_agent++];
+  assert (found_hsa_agents > 0);
+  assert (j < found_hsa_agents);
+  dev->data = (void*)(uintptr_t)j;
+  hsa_agent_t agent = hsa_agents[j];
 
   uint32_t cache_sizes[4];
   HSA_CHECK(hsa_agent_get_info (agent, HSA_AGENT_INFO_CACHE_SIZE,
@@ -583,9 +587,8 @@ pocl_hsa_probe(struct pocl_device_ops *ops)
   HSA_CHECK(hsa_iterate_agents(pocl_hsa_get_agents_callback, NULL));
 
   POCL_MSG_PRINT_INFO("pocl-hsa: found %d agents.\n", found_hsa_agents);
-  last_assigned_agent = 0;
 
-  return found_hsa_agents;
+  return (int)found_hsa_agents;
 }
 
 static void
@@ -596,8 +599,8 @@ hsa_queue_callback(hsa_status_t status, hsa_queue_t *q, void* data) {
 /* driver pthread prototype */
 void * pocl_hsa_driver_pthread (void *cldev);
 
-void
-pocl_hsa_init (cl_device_id device, const char* parameters)
+cl_int
+pocl_hsa_init (unsigned j, cl_device_id device, const char* parameters)
 {
   pocl_hsa_device_data_t *d;
 
@@ -605,6 +608,8 @@ pocl_hsa_init (cl_device_id device, const char* parameters)
 
   d = (pocl_hsa_device_data_t *) calloc (1, sizeof(pocl_hsa_device_data_t));
 
+  POCL_INIT_LOCK (d->pocl_hsa_compilation_lock);
+
   intptr_t agent_index = (intptr_t)device->data;
   d->agent.handle = hsa_agents[agent_index].handle;
   device->data = d;
@@ -691,6 +696,7 @@ pocl_hsa_init (cl_device_id device, const char* parameters)
   d->exit_driver_thread = 0;
   PTHREAD_CHECK(pthread_create(&d->driver_pthread_id, NULL,
                  &pocl_hsa_driver_pthread, device));
+  return CL_SUCCESS;
 }
 
 static void*
@@ -934,41 +940,6 @@ setup_kernel_args (pocl_hsa_device_data_t *d,
 #endif
 }
 
-/*
- * This replaces a simple system(), because system() was causing issues
- * (gpu lockups) when compiling code (via compile_parallel_bc_to_brig)
- * with OpenCL 2.0 atomics (like CalcPie from AMD SDK).
- * The reason of lockups is unknown (yet).
- */
-static int
-run_command(char* args[])
-{
-  POCL_MSG_PRINT_INFO("Launching: %s\n", args[0]);
-#ifdef HAVE_VFORK
-  pid_t p = vfork();
-#elif defined(HAVE_FORK)
-  pid_t p = fork();
-#else
-#error Must have fork() or vfork() system calls for HSA
-#endif
-  if (p == 0)
-    {
-      return execv(args[0], args);
-    }
-  else
-    {
-      if (p < 0)
-        return -1;
-      int status;
-      if (waitpid(p, &status, 0) < 0)
-        POCL_ABORT("pocl-hsa: waitpid() itself failed.\n");
-      if (WIFEXITED(status))
-        return WEXITSTATUS(status);
-      else
-        return -2;
-    }
-}
-
 static int
 compile_parallel_bc_to_brig(char* brigfile, cl_kernel kernel,
                             cl_device_id device) {
@@ -998,14 +969,14 @@ compile_parallel_bc_to_brig(char* brigfile, cl_kernel kernel,
 
       char* args1[] = { LLVM_LLC, "-O2", "-march=hsail64", "-filetype=asm",
                         "-o", hsailfile, parallel_bc_path, NULL };
-      if ((error = run_command(args1)))
+      if ((error = pocl_run_command (args1)))
         {
           POCL_MSG_PRINT_INFO("pocl-hsa: llc exit status %i\n", error);
           return error;
         }
 
       char* args2[] = { HSAIL_ASM, "-o", brigfile, hsailfile, NULL };
-      if ((error = run_command(args2)))
+      if ((error = pocl_run_command (args2)))
         {
           POCL_MSG_PRINT_INFO("pocl-hsa: HSAILasm exit status %i\n", error);
           return error;
@@ -1027,12 +998,26 @@ pocl_hsa_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
 
   hsa_executable_t final_obj;
 
+  POCL_LOCK (d->pocl_hsa_compilation_lock);
+
+  int error = pocl_llvm_generate_workgroup_function (device, kernel,
+                                        cmd->command.run.local_x,
+                                        cmd->command.run.local_y,
+                                        cmd->command.run.local_z);
+  if (error)
+    {
+      POCL_MSG_PRINT_GENERAL ("HSA: pocl_llvm_generate_workgroup_function()"
+                              " failed for kernel %s\n", kernel->name);
+      assert (error == 0);
+    }
+
   unsigned i;
   for (i = 0; i<HSA_KERNEL_CACHE_SIZE; i++)
     if (d->kernel_cache[i].kernel == kernel)
       {
         POCL_MSG_PRINT_INFO("kernel.hsa_exe found in"
                             " kernel cache, returning\n");
+        POCL_UNLOCK (d->pocl_hsa_compilation_lock);
         return;
       }
 
@@ -1136,6 +1121,7 @@ pocl_hsa_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
        kernel_symbol, HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE,
        &d->kernel_cache[i].args_segment_size));
 
+  POCL_UNLOCK (d->pocl_hsa_compilation_lock);
 }
 
 void
@@ -1165,6 +1151,8 @@ pocl_hsa_uninit (cl_device_id device)
 
   PTHREAD_CHECK(pthread_mutex_destroy(&d->list_mutex));
 
+  POCL_DESTROY_LOCK (d->pocl_hsa_compilation_lock);
+
   POCL_MEM_FREE(d);
   device->data = NULL;
 }
@@ -1205,12 +1193,11 @@ pocl_hsa_submit (_cl_command_node *node, cl_command_queue cq)
 
   POCL_LOCK_OBJ (node->event);
   PTHREAD_CHECK(pthread_mutex_lock(&d->list_mutex));
-  POCL_UPDATE_EVENT_SUBMITTED(&node->event);
 
-  /* this "ready" consept to ensure that command is pushed only once */
-  if (!(node->ready) && pocl_command_is_ready(node->event))
+  node->ready = 1;
+  if (pocl_command_is_ready (node->event))
     {
-      node->ready = 1;
+      POCL_UPDATE_EVENT_SUBMITTED (node->event);
       PN_ADD(d->ready_list, node->event);
       added_to_readylist = 1;
     }
@@ -1240,29 +1227,32 @@ pocl_hsa_join (cl_device_id device, cl_command_queue cq)
     }
   cl_event event = cq->last_event.event;
   assert(event);
-  POCL_LOCK_OBJ(event);
+  POCL_LOCK_OBJ (event);
+  ++event->pocl_refcount;
   POCL_UNLOCK_OBJ (cq);
 
   POCL_MSG_PRINT_INFO("pocl-hsa: device->join on event %u\n", event->id);
 
-  if (event->status == CL_COMPLETE)
+  if (event->status <= CL_COMPLETE)
     {
       POCL_MSG_PRINT_INFO("pocl-hsa: device->join: last event (%u) in queue"
                           " exists, but is complete\n", event->id);
-      POCL_UNLOCK_OBJ(event);
+      --event->pocl_refcount;
+      POCL_UNLOCK_OBJ (event);
       return;
     }
 
-  while (event->status != CL_COMPLETE)
+  while (event->status > CL_COMPLETE)
     {
       pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
       PTHREAD_CHECK (pthread_cond_wait (&e_d->event_cond, &event->pocl_lock));
     }
-  POCL_UNLOCK_OBJ(event);
-
   POCL_MSG_PRINT_INFO("pocl-hsa: device->join on event %u finished"
                       " with status: %i\n", event->id, event->status);
-  assert(event->status == CL_COMPLETE);
+
+  assert (event->status <= CL_COMPLETE);
+  --event->pocl_refcount;
+  POCL_UNLOCK_OBJ (event);
 }
 
 void
@@ -1273,20 +1263,27 @@ pocl_hsa_flush (cl_device_id device, cl_command_queue cq)
 }
 
 void
-pocl_hsa_notify (cl_device_id device, cl_event event)
+pocl_hsa_notify (cl_device_id device, cl_event event, cl_event finished)
 {
   pocl_hsa_device_data_t *d = device->data;
-  POCL_LOCK_OBJ (event);
   _cl_command_node *node = event->command;
   int added_to_readylist = 0;
   POCL_MSG_PRINT_INFO("pocl-hsa: notify on event %u \n", event->id);
 
-  /* this "ready" consept to ensure that command is pushed only once */
-  if (!(node->ready) && pocl_command_is_ready(node->event))
+  if (finished->status < CL_COMPLETE)
+    {
+      POCL_UPDATE_EVENT_FAILED (event);
+      return;
+    }
+
+  if (!node->ready)
+    return;
+
+  if (pocl_command_is_ready (event))
     {
-      node->ready = 1;
-      if (event->status == CL_SUBMITTED)
+      if (event->status == CL_QUEUED)
         {
+          POCL_UPDATE_EVENT_SUBMITTED (event);
           PTHREAD_CHECK(pthread_mutex_lock(&d->list_mutex));
 
           size_t i = 0;
@@ -1307,10 +1304,10 @@ pocl_hsa_notify (cl_device_id device, cl_event event)
           PTHREAD_CHECK(pthread_mutex_unlock(&d->list_mutex));
         }
       else
-        POCL_MSG_WARN("node->ready was 0 but event %u is"
-                      " not submitted!\n", event->id);
+        POCL_MSG_WARN ("node->ready was 1 but event %u is"
+                       " not queued: status %i!\n",
+                       event->id, event->status);
     }
-  POCL_UNLOCK_OBJ (event);
 
   if (added_to_readylist)
     hsa_signal_subtract_relaxed(d->nudge_driver_thread, 1);
@@ -1328,7 +1325,7 @@ pocl_hsa_wait_event(cl_device_id device, cl_event event)
 {
   POCL_MSG_PRINT_INFO("pocl-hsa: device->wait_event on event %u\n", event->id);
   POCL_LOCK_OBJ (event);
-  if (event->status == CL_COMPLETE)
+  if (event->status <= CL_COMPLETE)
     {
       POCL_MSG_PRINT_INFO("pocl-hsa: device->wain_event: last event"
                           " (%u) in queue exists, but is complete\n", 
@@ -1336,7 +1333,7 @@ pocl_hsa_wait_event(cl_device_id device, cl_event event)
       POCL_UNLOCK_OBJ(event);
       return;
     }
-  while (event->status != CL_COMPLETE)
+  while (event->status > CL_COMPLETE)
     {
       pocl_hsa_event_data_t *e_d = (pocl_hsa_event_data_t *)event->data;
       PTHREAD_CHECK(pthread_cond_wait(&(e_d->event_cond), &event->pocl_lock));
@@ -1344,7 +1341,7 @@ pocl_hsa_wait_event(cl_device_id device, cl_event event)
   POCL_UNLOCK_OBJ(event);
 
   POCL_MSG_PRINT_INFO("event wait finished with status: %i\n", event->status);
-  assert(event->status == CL_COMPLETE);
+  assert (event->status <= CL_COMPLETE);
 }
 
 /* DRIVER PTHREAD part */
@@ -1457,8 +1454,8 @@ pocl_hsa_launch(pocl_hsa_device_data_t *d, cl_event event)
         = kernel_packet->completion_signal.handle;
     }
 
+  POCL_UPDATE_EVENT_RUNNING (event);
   POCL_UNLOCK_OBJ (event);
-  POCL_UPDATE_EVENT_RUNNING(&event);
 }
 
 static void
@@ -1492,11 +1489,7 @@ pocl_hsa_ndrange_event_finished (pocl_hsa_device_data_t *d, size_t i)
   hsa_memory_free(event_data->actual_kernargs);
 
   POCL_UNLOCK_OBJ (event);
-  POCL_UPDATE_EVENT_COMPLETE(&event);
-
-  uint64_t ns = event->time_end - event->time_start;
-  pocl_debug_print_duration(__func__,__LINE__,
-                            "HSA NDrange Kernel (host clock)", ns);
+  POCL_UPDATE_EVENT_COMPLETE (event);
 
 }
 
@@ -1567,8 +1560,9 @@ pocl_hsa_driver_pthread (void * cldev)
   pocl_hsa_device_pthread_data_t* dd = &d->driver_data;
 
   /* timeout counter, resets with each new queued kernel to 1/8, then
-   * exponentially increases by 40% up to about 3/4 of d->timeout */
-  uint64_t kernel_timeout_ns = d->timeout >> 3;
+   * exponentially increases by 40% up to about 3/4 of d->timeout.
+   * disabled for now */
+  /* uint64_t kernel_timeout_ns = d->timeout >> 3; */
 
   dd->running_list_size = 0;
   dd->last_queue = 0;
@@ -1600,11 +1594,13 @@ pocl_hsa_driver_pthread (void * cldev)
 
   while (1)
     {
-      // reset the signal
-
+      /* reset the signal. Disabled for now; see below */
+#if 0
       if (pocl_hsa_run_ready_commands(d))
         kernel_timeout_ns = d->timeout >> 3;
-
+#else
+      pocl_hsa_run_ready_commands(d);
+#endif
       if (d->exit_driver_thread)
         goto EXIT_PTHREAD;
 
@@ -1705,16 +1701,32 @@ pocl_hsa_update_event (cl_device_id device, cl_event event, cl_int status)
       if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
         event->time_end = device->ops->get_timer_value(device->data);
 
+      uint64_t ns = event->time_end - event->time_start;
+      pocl_debug_print_duration (__func__,__LINE__,
+                                 "HSA NDrange Kernel (host clock)", ns);
+
+
       POCL_LOCK_OBJ (event);
       event->status = CL_COMPLETE;
-
       pthread_cond_signal(&e_d->event_cond);
+      POCL_UNLOCK_OBJ (event);
 
       device->ops->broadcast (event);
-      POCL_UNLOCK_OBJ (event);
       break;
     default:
-      assert("Invalid event status\n");
+      POCL_MSG_PRINT_INFO ("HSA: EVENT FAILED, event %d\n", event->id);
+      pocl_mem_objs_cleanup (event);
+      pocl_update_command_queue (event);
+
+      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
+        event->time_end = device->ops->get_timer_value (device->data);
+
+      POCL_LOCK_OBJ (event);
+      event->status = CL_FAILED;
+      pthread_cond_signal (&e_d->event_cond);
+      POCL_UNLOCK_OBJ (event);
+
+      device->ops->broadcast (event);
       break;
     }
 }
diff --git a/lib/CL/devices/prototypes.inc b/lib/CL/devices/prototypes.inc
index fb19888..b04fd9f 100644
--- a/lib/CL/devices/prototypes.inc
+++ b/lib/CL/devices/prototypes.inc
@@ -34,16 +34,18 @@
   void pocl_##__DRV__##_submit (_cl_command_node *node, cl_command_queue cq); \
   void pocl_##__DRV__##_join (cl_device_id device, cl_command_queue cq); \
   void pocl_##__DRV__##_flush (cl_device_id device, cl_command_queue cq); \
-  void pocl_##__DRV__##_notify (cl_device_id device, cl_event event);  \
+  void pocl_##__DRV__##_notify (cl_device_id device, cl_event event, cl_event finished);  \
   void pocl_##__DRV__##_broadcast (cl_event event);                    \
   void pocl_##__DRV__##_wait_event (cl_device_id device, cl_event event); \
   void pocl_##__DRV__##_update_event (cl_device_id device, cl_event event, cl_int status); \
   void pocl_##__DRV__##_free_event_data (cl_event event); \
-  void pocl_##__DRV__##_init_device_infos(struct _cl_device_id* dev);  \
+  void pocl_##__DRV__##_init_device_infos(unsigned j, struct _cl_device_id* dev);  \
   void pocl_##__DRV__##_init_device_ops(struct pocl_device_ops* ops);  \
   void pocl_##__DRV__##_uninit (cl_device_id device);                   \
-  void pocl_##__DRV__##_init (cl_device_id device, const char* parameters); \
+  cl_int pocl_##__DRV__##_init (unsigned j, cl_device_id device, const char* parameters); \
   unsigned int pocl_##__DRV__##_probe (struct pocl_device_ops *ops); \
+  cl_int pocl_##__DRV__##_init_queue (cl_command_queue queue); \
+  void pocl_##__DRV__##_free_queue (cl_command_queue queue); \
   cl_int pocl_##__DRV__##_alloc_mem_obj (cl_device_id device, cl_mem mem_obj, \
                                          void* host_ptr); \
   void *pocl_##__DRV__##_create_sub_buffer (void *device_data, void* buffer, \
diff --git a/lib/CL/devices/pthread/pocl-pthread_scheduler.h b/lib/CL/devices/pthread/pocl-pthread_scheduler.h
index 9927ba9..2521c87 100644
--- a/lib/CL/devices/pthread/pocl-pthread_scheduler.h
+++ b/lib/CL/devices/pthread/pocl-pthread_scheduler.h
@@ -36,14 +36,13 @@
 typedef struct pool_thread_data thread_data;
 
 /* Initializes scheduler. Must be called before any kernel enqueue */
-void pthread_scheduler_init (size_t num_worker_threads);
+void pthread_scheduler_init (size_t num_worker_threads, cl_device_id device);
 
 void pthread_scheduler_uinit ();
 
 /* Gives ready-to-execute command for scheduler */
 void pthread_scheduler_push_command (_cl_command_node *cmd);
 
-
 void pthread_scheduler_push_kernel (kernel_run_command *run_cmd);
 
 /* blocks until given command queue is empty == finished */
@@ -51,7 +50,7 @@ void pthread_scheduler_wait_cq (cl_command_queue cq);
 
 void pthread_scheduler_release_host ();
 
-int pthread_scheduler_get_work (thread_data *td, _cl_command_node **cmd_ptr);
+void pthread_scheduler_get_work (thread_data *td, _cl_command_node **cmd_ptr);
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/CL/devices/pthread/pocl-pthread_utils.h b/lib/CL/devices/pthread/pocl-pthread_utils.h
index 72b8a11..310bab6 100644
--- a/lib/CL/devices/pthread/pocl-pthread_utils.h
+++ b/lib/CL/devices/pthread/pocl-pthread_utils.h
@@ -3,49 +3,85 @@
 
 #include "pocl_cl.h"
 
-/* locking macros */
-#define PTHREAD_LOCK(__lock , __counter)        \
-  do {                                          \
-    if ((__counter))                            \
-      ++(*((unsigned*)(__counter)));            \
-  }while (pthread_mutex_trylock((__lock)))
-
-//#define PTHREAD_LOCK(__lock, __counter) do {pthread_mutex_lock((__lock__));} while (0)
-
-#define PTHREAD_UNLOCK(__lock) do { pthread_mutex_unlock((__lock)); }while(0)
-
 #ifdef __GNUC__
 #pragma GCC visibility push(hidden)
 #endif
 
+/* locking macros */
+#define PTHREAD_LOCK(__lock)  pthread_mutex_lock(__lock)
+#define PTHREAD_UNLOCK(__lock) pthread_mutex_unlock(__lock)
+#define PTHREAD_INIT_LOCK(__lock) pthread_mutex_init(__lock, NULL)
+#define PTHREAD_DESTROY_LOCK(__lock) pthread_mutex_destroy(__lock)
+
+/* Apparently Mac OS X does not have spinlock, despite having pthreads.
+ * for now only enable spinlocks on linux.*/
+#ifdef __linux__
+  #define PTHREAD_FAST_LOCK_T pthread_spinlock_t
+  #define PTHREAD_FAST_LOCK(l) pthread_spin_lock(l)
+  #define PTHREAD_FAST_UNLOCK(l) pthread_spin_unlock(l)
+  #define PTHREAD_FAST_INIT(l) pthread_spin_init(l, PTHREAD_PROCESS_PRIVATE)
+  #define PTHREAD_FAST_DESTROY(l) pthread_spin_destroy(l)
+#else
+  #define PTHREAD_FAST_LOCK_T pthread_mutex_t
+  #define PTHREAD_FAST_LOCK(l) pthread_mutex_lock(l)
+  #define PTHREAD_FAST_UNLOCK(l) pthread_mutex_unlock(l)
+  #define PTHREAD_FAST_INIT(l) pthread_mutex_init(l, NULL)
+  #define PTHREAD_FAST_DESTROY(l) pthread_mutex_destroy(l)
+#endif
+
 typedef struct kernel_run_command kernel_run_command;
 struct kernel_run_command
 {
   void *data;
   cl_kernel kernel;
   cl_device_id device;
-  struct pocl_context pc;
   _cl_command_node *cmd;
-  pthread_mutex_t lock;
-  unsigned lock_counter;
-  volatile unsigned group_idx[3];
-  volatile unsigned remaining_wgs;
-  volatile unsigned wgs_dealt;
   pocl_workgroup workgroup;
   struct pocl_argument *kernel_args;
-  volatile int ref_count;
-  kernel_run_command *volatile next;
+  kernel_run_command *next;
+  unsigned long ref_count;
+
+  /* actual kernel arguments. these are setup once at the kernel setup
+   * phase, then each thread sets up the local arguments for itself. */
+  void **arguments;
+  /* this is required b/c there's an additional level of indirection */
+  void **arguments2;
+
 #ifdef POCL_PTHREAD_CACHE_MONITORING
   pocl_cache_data cache_data;
 #endif
-};
 
+  PTHREAD_FAST_LOCK_T lock __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+  unsigned remaining_wgs __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+  unsigned wgs_dealt;
+
+  struct pocl_context pc __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+} __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+#ifdef USE_POCL_MEMMANAGER
 void pocl_init_kernel_run_command_manager (void);
 void pocl_init_thread_argument_manager ();
 kernel_run_command* new_kernel_run_command ();
 void free_kernel_run_command (kernel_run_command *k);
-void setup_kernel_arg_array(void **arguments, kernel_run_command *k);
-void free_kernel_arg_array (void **arguments, kernel_run_command *k);
+#else
+#define pocl_init_kernel_run_command_manager() NULL
+#define pocl_init_thread_argument_manager() NULL
+#define new_kernel_run_command()                                              \
+  (kernel_run_command *)pocl_aligned_malloc (HOST_CPU_CACHELINE_SIZE,         \
+                                             sizeof (kernel_run_command))
+#define free_kernel_run_command(k) free (k)
+#endif
+
+void setup_kernel_arg_array (kernel_run_command *k);
+void setup_kernel_arg_array_with_locals (void **arguments, void **arguments2,
+                                         kernel_run_command *k,
+                                         char *local_mem,
+                                         size_t local_mem_size);
+void free_kernel_arg_array (kernel_run_command *k);
+void free_kernel_arg_array_with_locals (void **arguments, void **arguments2,
+                                        kernel_run_command *k);
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/CL/devices/pthread/pthread.c b/lib/CL/devices/pthread/pthread.c
index 8ed94b6..b3d91fd 100644
--- a/lib/CL/devices/pthread/pthread.c
+++ b/lib/CL/devices/pthread/pthread.c
@@ -86,9 +86,6 @@
 /* CUSTOM_BUFFER_ALLOCATOR */
 #endif
 
-#define COMMAND_LENGTH 2048
-#define WORKGROUP_STRING_LENGTH 1024
-
 /* The name of the environment variable used to force a certain max thread count
    for the thread execution. */
 #define THREAD_COUNT_ENV "POCL_MAX_PTHREAD_COUNT"
@@ -105,10 +102,6 @@ struct data {
   cl_kernel current_kernel;
   /* Loaded kernel dynamic library handle. */
   lt_dlhandle current_dlhandle;
-
-  /* List of commands waiting to be enqueued */
-  _cl_command_node * volatile command_list;
-  pthread_mutex_t cq_lock;      /* Lock for command list related operations */
   volatile uint64_t total_cmd_exec_time;
 
 #ifdef CUSTOM_BUFFER_ALLOCATOR
@@ -172,17 +165,20 @@ pocl_pthread_probe(struct pocl_device_ops *ops)
 }
 
 void
-pocl_pthread_init_device_infos(struct _cl_device_id* dev)
+pocl_pthread_init_device_infos(unsigned j, struct _cl_device_id* dev)
 {
-  pocl_basic_init_device_infos(dev);
+  pocl_basic_init_device_infos(j, dev);
 }
 
+static cl_device_partition_property pthread_partition_properties[2]
+    = { CL_DEVICE_PARTITION_EQUALLY, CL_DEVICE_PARTITION_BY_COUNTS };
 
-void
-pocl_pthread_init (cl_device_id device, const char* parameters)
+cl_int
+pocl_pthread_init (unsigned j, cl_device_id device, const char* parameters)
 {
-  static int device_number = 0;
   struct data *d;
+  cl_int ret = CL_SUCCESS;
+  int err;
   static char scheduler_initialized = 0;
 #ifdef CUSTOM_BUFFER_ALLOCATOR
   static mem_regions_management* mrm = NULL;
@@ -193,24 +189,31 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
   // Should we instead have a separate bool field in device, or do the
   // initialization at library startup time with __attribute__((constructor))?
   if (device->data!=NULL)
-    return;
+    return CL_SUCCESS;
 
   d = (struct data *) calloc (1, sizeof (struct data));
-
-  d->current_kernel = NULL;
-  d->current_dlhandle = 0;
-  device->data = d;
+  if (d == NULL)
+    return CL_OUT_OF_HOST_MEMORY;
 
 #ifdef CUSTOM_BUFFER_ALLOCATOR
   if (mrm == NULL)
     {
       mrm = malloc (sizeof (mem_regions_management));
+      if (mrm == NULL)
+        {
+          free (d);
+          return CL_OUT_OF_HOST_MEMORY;
+        }
       BA_INIT_LOCK (mrm->mem_regions_lock);
       mrm->mem_regions = NULL;
     }
   d->mem_regions = mrm;
 #endif
 
+  d->current_kernel = NULL;
+  d->current_dlhandle = 0;
+  device->data = d;
+
   device->address_bits = sizeof(void*) * 8;
 
   device->min_data_type_align_size = MAX_EXTENDED_ALIGNMENT; // this is in bytes
@@ -222,10 +225,12 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
      initialize global_mem_size which it is not yet. Just put 
      a nonzero there for now. */
   device->global_mem_size = 1;
-  pocl_topology_detect_device_info(device);
+  err = pocl_topology_detect_device_info (device);
+  if (err)
+    ret = CL_INVALID_DEVICE;
   num_worker_threads = max (get_max_thread_count (device), 
                             (unsigned)pocl_get_int_option("POCL_PTHREAD_MIN_THREADS", 1));
-  
+
   pocl_cpuinfo_detect_device_info(device);
   pocl_set_buffer_image_limits(device);
 
@@ -236,16 +241,12 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
     device->vendor_id =
       magic[0] | magic[1] << 8 | magic[2] << 16 | magic[3] << 24;
 
-  device->vendor_id += device_number;
-  device_number++;
+  device->vendor_id += j;
 
   // pthread has elementary partitioning support
   device->max_sub_devices = device->max_compute_units;
   device->num_partition_properties = 2;
-  device->partition_properties = calloc(device->num_partition_properties,
-    sizeof(cl_device_partition_property));
-  device->partition_properties[0] = CL_DEVICE_PARTITION_EQUALLY;
-  device->partition_properties[1] = CL_DEVICE_PARTITION_BY_COUNTS;
+  device->partition_properties = pthread_partition_properties;
   device->num_partition_types = 0;
   device->partition_type = NULL;
 
@@ -257,17 +258,16 @@ pocl_pthread_init (cl_device_id device, const char* parameters)
   device->has_64bit_long=0;
   #endif
 
-  pthread_mutex_init (&d->cq_lock, NULL);
   if (!scheduler_initialized)
     {
       scheduler_initialized = 1;
       pocl_init_dlhandle_cache();
       pocl_init_kernel_run_command_manager();
-
-      pthread_scheduler_init (num_worker_threads);
+      pthread_scheduler_init (num_worker_threads, device);
     }
   /* system mem as global memory */
   device->global_mem_id = 0;
+  return ret;
 }
 
 void
@@ -340,7 +340,8 @@ get_max_thread_count(cl_device_id device)
   if (device->max_compute_units == 0)
     return pocl_get_int_option (THREAD_COUNT_ENV, FALLBACK_MAX_THREAD_COUNT);
   else
-    return pocl_get_int_option(THREAD_COUNT_ENV, POCL_REAL_DEV(device)->max_compute_units);
+    return pocl_get_int_option (THREAD_COUNT_ENV,
+                                pocl_real_dev (device)->max_compute_units);
 }
 
 void
@@ -351,35 +352,20 @@ pocl_pthread_run
   /* not used: this device will not be told when or what to run */
 }
 
-void *
-pocl_pthread_map_mem (void *data, void *buf_ptr, 
-                      size_t offset, size_t size, void* host_ptr) 
-{
-  /* All global pointers of the pthread/CPU device are in 
-     the host address space already, and up to date. */     
-  return (char*)buf_ptr + offset;
-}
-
 void
 pocl_pthread_submit (_cl_command_node *node, cl_command_queue cq)
 {
   cl_device_id device = node->device;
   struct data *d = device->data;
-
   POCL_LOCK_OBJ (node->event);
-  POCL_UPDATE_EVENT_SUBMITTED(&node->event);
-  /* this "ready" consept to ensure that command is pushed only once */
-  if (!(node->ready) && pocl_command_is_ready(node->event))
+
+  node->ready = 1;
+  if (pocl_command_is_ready (node->event))
     {
-      node->ready = 1;
+      POCL_UPDATE_EVENT_SUBMITTED (node->event);
       pthread_scheduler_push_command (node);
     }
-  else
-    {
-      PTHREAD_LOCK (&d->cq_lock, NULL);
-      DL_PREPEND (d->command_list, node);
-      PTHREAD_UNLOCK (&d->cq_lock);
-    }
+
   POCL_UNLOCK_OBJ (node->event);
   return;
 }
@@ -398,28 +384,29 @@ pocl_pthread_join(cl_device_id device, cl_command_queue cq)
 }
 
 void
-pocl_pthread_notify (cl_device_id device, cl_event event)
+pocl_pthread_notify (cl_device_id device, cl_event event, cl_event finished)
 {
   struct data *d = (struct data*)device->data;
    int wake_thread = 0;
   _cl_command_node * volatile node = event->command;
 
-  POCL_LOCK_OBJ (event);
-  /* this "ready" consept to ensure that command is pushed only once */
-  if (!(node->ready) && pocl_command_is_ready(node->event))
+  if (finished->status < CL_COMPLETE)
+    {
+      POCL_UPDATE_EVENT_FAILED (event);
+      return;
+    }
+
+  if (!node->ready)
+    return;
+
+  if (pocl_command_is_ready (node->event))
     {
-      node->ready = 1;
-      if (event->status == CL_SUBMITTED)
+      if (event->status == CL_QUEUED)
         {
-          PTHREAD_LOCK (&d->cq_lock, NULL);
-          assert (d->command_list != NULL);
-          DL_DELETE (d->command_list, node);
-          PTHREAD_UNLOCK (&d->cq_lock);
+          POCL_UPDATE_EVENT_SUBMITTED (event);
           wake_thread = 1;
         }
     }
-  POCL_UNLOCK_OBJ (event);
-
   if (wake_thread)
     {
       pthread_scheduler_push_command (node);
@@ -457,12 +444,15 @@ void pocl_pthread_update_event (cl_device_id device, cl_event event, cl_int stat
         event->time_submit = device->ops->get_timer_value(device->data);
       break;
     case CL_RUNNING:
+      POCL_LOCK_OBJ (event);
       event->status = status;
       if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
         event->time_start = device->ops->get_timer_value(device->data);
+      POCL_UNLOCK_OBJ (event);
       break;
     case CL_COMPLETE:
-      POCL_MSG_PRINT_INFO("PTHREAD: Command complete, event %d\n", event->id);
+      POCL_MSG_PRINT_EVENTS ("PTHREAD: Command complete, event %d\n",
+                             event->id);
       pocl_mem_objs_cleanup (event);
       cq_ready = pocl_update_command_queue (event);
 
@@ -471,26 +461,44 @@ void pocl_pthread_update_event (cl_device_id device, cl_event event, cl_int stat
 
       POCL_LOCK_OBJ (event);
       event->status = CL_COMPLETE;
-
       pthread_cond_signal(&e_d->event_cond);
+      POCL_UNLOCK_OBJ (event);
+
       if (cq_ready)
         pthread_scheduler_release_host ();
 
       device->ops->broadcast (event);
-      POCL_UNLOCK_OBJ (event);
       break;
+
     default:
-      assert("Invalid event status\n");
+      POCL_MSG_PRINT_EVENTS ("setting FAIL status on event %u\n", event->id);
+
+      POCL_LOCK_OBJ (event);
+      event->status = CL_FAILED;
+      pthread_cond_signal (&e_d->event_cond);
+      POCL_UNLOCK_OBJ (event);
+
+      pocl_mem_objs_cleanup (event);
+      cq_ready = pocl_update_command_queue (event);
+
+      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
+        event->time_end = device->ops->get_timer_value (device->data);
+
+      if (cq_ready)
+        pthread_scheduler_release_host ();
+
+      device->ops->broadcast (event);
       break;
     }
 }
 
 void pocl_pthread_wait_event (cl_device_id device, cl_event event)
 {
+
   struct event_data *e_d = event->data;
 
   POCL_LOCK_OBJ (event);
-  while (event->status != CL_COMPLETE)
+  while (event->status > CL_COMPLETE)
     {
       pthread_cond_wait(&e_d->event_cond, &event->pocl_lock);
     }
@@ -503,6 +511,5 @@ void pocl_pthread_free_event_data (cl_event event)
   assert(event->data != NULL);
   free(event->data);
   event->data = NULL;
-
 }
 
diff --git a/lib/CL/devices/pthread/pthread_scheduler.c b/lib/CL/devices/pthread/pthread_scheduler.c
index 363d8a8..364e4f4 100644
--- a/lib/CL/devices/pthread/pthread_scheduler.c
+++ b/lib/CL/devices/pthread/pthread_scheduler.c
@@ -1,5 +1,13 @@
+#define _GNU_SOURCE
+
+#ifdef __linux__
+#include <sched.h>
+#endif
+
 #include <string.h>
 #include <pthread.h>
+#include <time.h>
+
 #include "pocl-pthread_scheduler.h"
 #include "pocl_cl.h"
 #include "pocl-pthread.h"
@@ -13,57 +21,76 @@ static void* pocl_pthread_driver_thread (void *p);
 
 struct pool_thread_data
 {
-  pthread_t thread;
-  size_t my_id;
-  struct shared_data * sd;
-  _cl_command_node *volatile work_queue;
-  kernel_run_command *volatile kernel_queue;
-  pthread_cond_t wakeup_cond;
-  pthread_mutex_t lock;
-  volatile int executed_commands;
-  volatile int stolen_commands;
-  volatile int stolen_wgs;
-  volatile unsigned lock_counter;
-  volatile uint64_t prev_wg_finish_time;
-  pthread_mutex_t kernel_q_lock;
-  volatile int kernel_counter;
-};
+  pthread_cond_t wakeup_cond __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+  pthread_mutex_t lock __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+  pthread_t thread __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+  unsigned long executed_commands;
+  /* per-CU (= per-thread) local memory */
+  void *local_mem;
+  unsigned current_ftz;
+  unsigned num_threads;
+  /* index of this particular thread
+   * [0, num_threads-1]
+   * used for deciding whether a particular thread should run
+   * commands scheduled on a subdevice. */
+  unsigned index;
+  void *last_cmd_ignored;
+
+} __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
 
 typedef struct scheduler_data_
 {
-  struct pool_thread_data *volatile thread_pool;
-  _cl_command_node *volatile work_queue;
-  kernel_run_command *volatile kernel_queue;
-  volatile int num_threads;
-  volatile int round_robin_index;
-  pthread_cond_t cq_finished_cond;
-  pthread_cond_t wake_pool;
-  pthread_mutex_t wq_lock;
-  pthread_mutex_t cq_finished_lock;
-  volatile int thread_pool_shutdown_requested;
-  cl_device_id *volatile pool_devices;
-} scheduler_data;
+  unsigned num_threads;
+
+  struct pool_thread_data *thread_pool;
+  size_t local_mem_size;
+
+  _cl_command_node *work_queue
+      __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+  kernel_run_command *kernel_queue;
+
+  pthread_cond_t wake_pool __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+  pthread_mutex_t wake_lock __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+  PTHREAD_FAST_LOCK_T wq_lock_fast __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+  pthread_cond_t cq_finished_cond __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+  pthread_mutex_t cq_finished_lock __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
+
+  int thread_pool_shutdown_requested;
+} scheduler_data __attribute__ ((aligned (HOST_CPU_CACHELINE_SIZE)));
 
 static scheduler_data scheduler;
 
-void pthread_scheduler_init (size_t num_worker_threads)
+void
+pthread_scheduler_init (size_t num_worker_threads, cl_device_id device)
 {
-  size_t i;
-  pthread_mutex_init (&(scheduler.wq_lock), NULL);
-  pthread_mutex_init (&(scheduler.cq_finished_lock), NULL);
+  unsigned i;
+  PTHREAD_INIT_LOCK (&(scheduler.wake_lock));
+  PTHREAD_FAST_INIT (&(scheduler.wq_lock_fast));
+
+  PTHREAD_INIT_LOCK (&(scheduler.cq_finished_lock));
   pthread_cond_init (&(scheduler.cq_finished_cond), NULL);
   pthread_cond_init (&(scheduler.wake_pool), NULL);
 
-  scheduler.thread_pool = calloc
-    (num_worker_threads, sizeof (struct pool_thread_data));
+  scheduler.thread_pool = pocl_aligned_malloc (
+      HOST_CPU_CACHELINE_SIZE,
+      num_worker_threads * sizeof (struct pool_thread_data));
+  memset (scheduler.thread_pool, 0,
+          num_worker_threads * sizeof (struct pool_thread_data));
+
   scheduler.num_threads = num_worker_threads;
+  /* safety margin - aligning pointers later (in kernel arg setup)
+   * may require more local memory than actual local mem size.
+   * TODO fix this */
+  scheduler.local_mem_size = device->local_mem_size << 4;
 
   for (i = 0; i < num_worker_threads; ++i)
     {
-      scheduler.thread_pool[i].my_id = i;
       pthread_cond_init (&scheduler.thread_pool[i].wakeup_cond, NULL);
-      pthread_mutex_init (&scheduler.thread_pool[i].lock, NULL);
-      pthread_mutex_init (&scheduler.thread_pool[i].kernel_q_lock, NULL);
+      PTHREAD_INIT_LOCK (&scheduler.thread_pool[i].lock);
+      scheduler.thread_pool[i].index = i;
       pthread_create (&scheduler.thread_pool[i].thread, NULL,
                       pocl_pthread_driver_thread,
                       (void*)&scheduler.thread_pool[i]);
@@ -73,57 +100,95 @@ void pthread_scheduler_init (size_t num_worker_threads)
 
 void pthread_scheduler_uinit ()
 {
-  int i;
+  unsigned i;
   scheduler.thread_pool_shutdown_requested = 1;
 
-  pthread_mutex_lock (&scheduler.wq_lock);
+  PTHREAD_LOCK (&scheduler.wake_lock);
   pthread_cond_broadcast (&scheduler.wake_pool);
-  pthread_mutex_unlock (&scheduler.wq_lock);
+  PTHREAD_UNLOCK (&scheduler.wake_lock);
 
   for (i = 0; i < scheduler.num_threads; ++i)
     {
       pthread_join (scheduler.thread_pool[i].thread, NULL);
     }
+
+  PTHREAD_FAST_DESTROY (&scheduler.wq_lock_fast);
+  pthread_cond_destroy (&scheduler.wake_pool);
+  PTHREAD_DESTROY_LOCK (&scheduler.wake_lock);
+
+  pthread_cond_destroy (&scheduler.cq_finished_cond);
+  PTHREAD_DESTROY_LOCK (&scheduler.cq_finished_lock);
 }
 
 void pthread_scheduler_push_command (_cl_command_node *cmd)
 {
-  PTHREAD_LOCK (&scheduler.wq_lock, NULL);
+  PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
   DL_APPEND (scheduler.work_queue, cmd);
+  PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
+
+  PTHREAD_LOCK (&scheduler.wake_lock);
   pthread_cond_broadcast (&scheduler.wake_pool);
-  PTHREAD_UNLOCK (&scheduler.wq_lock);
+  PTHREAD_UNLOCK (&scheduler.wake_lock);
 }
 
 void pthread_scheduler_push_kernel (kernel_run_command *run_cmd)
 {
-  PTHREAD_LOCK (&scheduler.wq_lock, NULL);
+  PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
   LL_APPEND (scheduler.kernel_queue, run_cmd);
+  PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
+
+  PTHREAD_LOCK (&scheduler.wake_lock);
   pthread_cond_broadcast (&scheduler.wake_pool);
-  PTHREAD_UNLOCK (&scheduler.wq_lock);
+  PTHREAD_UNLOCK (&scheduler.wake_lock);
 }
 
 void pthread_scheduler_wait_cq (cl_command_queue cq)
 {
+  PTHREAD_LOCK (&scheduler.cq_finished_lock);
+
+#ifdef HAVE_CLOCK_GETTIME
+  struct timespec timeout = {0, 0};
+#endif
+
   while (1)
     {
-      pthread_mutex_lock (&scheduler.cq_finished_lock);
       POCL_LOCK_OBJ (cq);
       if (cq->command_count == 0)
         {
           POCL_UNLOCK_OBJ (cq);
-          pthread_mutex_unlock (&scheduler.cq_finished_lock);
+          PTHREAD_UNLOCK (&scheduler.cq_finished_lock);
           return;
         }
       POCL_UNLOCK_OBJ (cq);
-      pthread_cond_wait (&scheduler.cq_finished_cond,
-                         &scheduler.cq_finished_lock);
-      pthread_mutex_unlock (&scheduler.cq_finished_lock);
+
+      /* pocl_cond_timedwait() is a workaround, the pthread driver sometimes
+       * gets stuck in the loop waiting for finished_cond while the CQ is
+       * actually empty. With timedwait() it eventually recovers.
+       */
+#ifdef HAVE_CLOCK_GETTIME
+      clock_gettime(CLOCK_REALTIME, &timeout);
+      timeout.tv_nsec += 100000000;
+      if (timeout.tv_nsec >= 1000000000)
+        {
+          timeout.tv_nsec -= 1000000000;
+          ++timeout.tv_sec;
+        }
+      pthread_cond_timedwait (&scheduler.cq_finished_cond,
+                              &scheduler.cq_finished_lock,
+                              &timeout);
+#else
+       pthread_cond_wait (&scheduler.cq_finished_cond,
+                          &scheduler.cq_finished_lock);
+#endif
+
     }
+
+  PTHREAD_UNLOCK (&scheduler.cq_finished_lock);
 }
 
 void pthread_scheduler_release_host ()
 {
-  PTHREAD_LOCK (&scheduler.cq_finished_lock, NULL);
+  PTHREAD_LOCK (&scheduler.cq_finished_lock);
   pthread_cond_signal (&scheduler.cq_finished_cond);
   PTHREAD_UNLOCK (&scheduler.cq_finished_lock);
 }
@@ -135,72 +200,126 @@ work_group_scheduler (kernel_run_command *k,
 static void finalize_kernel_command (thread_data *thread_data,
                               kernel_run_command *k);
 
-int pthread_scheduler_get_work (thread_data *td, _cl_command_node **cmd_ptr)
+/* if subd is not a subdevice, returns 1
+ * if subd is subdevice, takes a look at the subdevice CUs
+ * and if they match the current driver thread, returns 1
+ * otherwise set last ignored command to cmd and return 0 */
+static int
+shall_we_run_this (thread_data *td, cl_device_id subd, void *cmd)
+{
+
+  if (subd && subd->parent_device)
+    {
+      if (!((td->index >= subd->core_start)
+            && (td->index < (subd->core_start + subd->core_count))))
+        {
+          td->last_cmd_ignored = cmd;
+          return 0;
+        }
+    }
+  td->last_cmd_ignored = NULL;
+  return 1;
+}
+
+void
+pthread_scheduler_get_work (thread_data *td, _cl_command_node **cmd_ptr)
 {
   _cl_command_node *cmd;
   kernel_run_command *run_cmd;
-  // execute kernel if available
-  PTHREAD_LOCK (&scheduler.wq_lock, NULL);
-  if ((run_cmd = scheduler.kernel_queue))
+
+  /* execute kernel if available */
+  PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
+  run_cmd = scheduler.kernel_queue;
+
+  /* execute kernel if available */
+  if (run_cmd && shall_we_run_this (td, run_cmd->device, run_cmd))
     {
       ++run_cmd->ref_count;
-      PTHREAD_UNLOCK (&scheduler.wq_lock);
+      PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
 
       work_group_scheduler (run_cmd, td);
 
-      PTHREAD_LOCK (&scheduler.wq_lock, NULL);
-      if (!(--run_cmd->ref_count))
+      PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
+      if ((--run_cmd->ref_count) == 0)
         {
-          PTHREAD_UNLOCK (&scheduler.wq_lock);
+          PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
           finalize_kernel_command (td, run_cmd);
+          PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
         }
-      else
-        PTHREAD_UNLOCK (&scheduler.wq_lock);
     }
-  else
-    PTHREAD_UNLOCK (&scheduler.wq_lock);
 
-  // execute a command if available
-  PTHREAD_LOCK (&scheduler.wq_lock, NULL);
-  if ((cmd = scheduler.work_queue))
+  /* execute a command if available */
+  *cmd_ptr = NULL;
+  cmd = scheduler.work_queue;
+  if (cmd && shall_we_run_this (td, cmd->device, cmd))
     {
       DL_DELETE (scheduler.work_queue, cmd);
-      PTHREAD_UNLOCK (&scheduler.wq_lock);
       *cmd_ptr = cmd;
-      return 0;
     }
-  PTHREAD_UNLOCK (&scheduler.wq_lock);
-  *cmd_ptr = NULL;
-  return 1;
+  PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
+  return;
 }
 
 static void
-pthread_scheduler_sleep()
+pthread_scheduler_sleep (thread_data *td)
 {
-  static struct timespec time_to_wait = {0, 0};
+  struct timespec time_to_wait = {0, 0};
   time_to_wait.tv_sec = time(NULL) + 5;
 
-  PTHREAD_LOCK (&scheduler.wq_lock, NULL);
-  if (scheduler.work_queue == NULL && scheduler.kernel_queue == 0)
-    pthread_cond_timedwait (&scheduler.wake_pool, &scheduler.wq_lock, &time_to_wait);
-  PTHREAD_UNLOCK (&scheduler.wq_lock);
+  PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
+  /* if the queues are empty, go to sleep.
+   * if the queues are not empty, but this thread ignored the
+   * last command (because it's for different subdevice CUs),
+   * also go to sleep. */
+  if ((scheduler.work_queue == NULL && scheduler.kernel_queue == NULL)
+      || (td->last_cmd_ignored
+          && (((void *)scheduler.kernel_queue == td->last_cmd_ignored)
+              || ((void *)scheduler.work_queue == td->last_cmd_ignored))))
+    {
+      PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
+      PTHREAD_LOCK (&scheduler.wake_lock);
+      pthread_cond_timedwait (&scheduler.wake_pool, &scheduler.wake_lock, &time_to_wait);
+      PTHREAD_UNLOCK (&scheduler.wake_lock);
+    }
+  else
+    PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
 }
 
+/* Maximum and minimum chunk sizes for get_wg_index_range().
+ * Each pthread driver's thread fetches work from a kernel's WG pool in
+ * chunks, this determines the limits (scaled up by # of threads). */
 #define POCL_PTHREAD_MAX_WGS 256
-static int get_wg_index_range (kernel_run_command *k, unsigned *start_index,
-                               unsigned *end_index, char *last_wgs)
+#define POCL_PTHREAD_MIN_WGS 32
+
+static int
+get_wg_index_range (kernel_run_command *k, unsigned *start_index,
+                    unsigned *end_index, int *last_wgs, unsigned num_threads)
 {
+  const unsigned scaled_max_wgs = POCL_PTHREAD_MAX_WGS * num_threads;
+  const unsigned scaled_min_wgs = POCL_PTHREAD_MIN_WGS * num_threads;
+
   unsigned max_wgs;
-  *last_wgs = 0;
-  PTHREAD_LOCK (&k->lock, NULL);
+  PTHREAD_FAST_LOCK (&k->lock);
   if (k->remaining_wgs == 0)
     {
-      PTHREAD_UNLOCK (&k->lock);
+      PTHREAD_FAST_UNLOCK (&k->lock);
       return 0;
     }
-  max_wgs = min (POCL_PTHREAD_MAX_WGS,
-                 (1 + k->remaining_wgs / scheduler.num_threads));
+
+  /* If the work is comprised of huge number of WGs of small WIs,
+   * then get_wg_index_range() becomes a problem on manycore CPUs
+   * because lock contention on k->lock.
+   *
+   * If we have enough workgroups, scale up the requests linearly by
+   * num_threads, otherwise fallback to smaller workgroups.
+   */
+  if (k->remaining_wgs <= (scaled_max_wgs * num_threads))
+    max_wgs = min (scaled_min_wgs, (1 + k->remaining_wgs / num_threads));
+  else
+    max_wgs = min (scaled_max_wgs, (1 + k->remaining_wgs / num_threads));
+
   max_wgs = min (max_wgs, k->remaining_wgs);
+  assert (max_wgs > 0);
 
   *start_index = k->wgs_dealt;
   *end_index = k->wgs_dealt + max_wgs-1;
@@ -208,60 +327,86 @@ static int get_wg_index_range (kernel_run_command *k, unsigned *start_index,
   k->wgs_dealt += max_wgs;
   if (k->remaining_wgs == 0)
     *last_wgs = 1;
-  PTHREAD_UNLOCK (&k->lock);
+  PTHREAD_FAST_UNLOCK (&k->lock);
 
   return 1;
 }
 
 inline static void translate_wg_index_to_3d_index (kernel_run_command *k,
                                                    unsigned index,
-                                                   size_t *index_3d)
+                                                   size_t *index_3d,
+                                                   unsigned xy_slice,
+                                                   unsigned row_size)
 {
-  unsigned xy_slice = k->pc.num_groups[0] * k->pc.num_groups[1];
   index_3d[2] = index / xy_slice;
-  index_3d[1] = (index % xy_slice) / k->pc.num_groups[0];
-  index_3d[0] = (index % xy_slice) % k->pc.num_groups[0];
+  index_3d[1] = (index % xy_slice) / row_size;
+  index_3d[0] = (index % xy_slice) % row_size;
 }
 
 static int
 work_group_scheduler (kernel_run_command *k,
                       struct pool_thread_data *thread_data)
 {
-  void *arguments[k->kernel->num_args + k->kernel->num_locals];
+  void *arguments[k->kernel->num_args + k->kernel->num_locals + 1];
+  void *arguments2[k->kernel->num_args + k->kernel->num_locals + 1];
   struct pocl_context pc;
   unsigned i;
   unsigned start_index;
   unsigned end_index;
-  char last_wgs = 0;
+  int last_wgs = 0;
 
-  if (!get_wg_index_range (k, &start_index, &end_index,  &last_wgs))
+  if (!get_wg_index_range (k, &start_index, &end_index, &last_wgs,
+                           thread_data->num_threads))
     return 0;
 
-  setup_kernel_arg_array ((void**)&arguments, k);
+  assert (end_index >= start_index);
+
+  setup_kernel_arg_array_with_locals (
+      (void **)&arguments, (void **)&arguments2, k, thread_data->local_mem,
+      scheduler.local_mem_size);
   memcpy (&pc, &k->pc, sizeof (struct pocl_context));
+
+  /* Flush to zero is only set once at start of kernel (because FTZ is
+   * a compilation option), but we need to reset rounding mode after every
+   * iteration (since it can be changed during kernel execution). */
+  unsigned flush = k->kernel->program->flush_denorms;
+  if (thread_data->current_ftz != flush)
+    {
+      pocl_set_ftz (flush);
+      thread_data->current_ftz = flush;
+    }
+
+  unsigned slice_size = k->pc.num_groups[0] * k->pc.num_groups[1];
+  unsigned row_size = k->pc.num_groups[0];
+
   do
     {
       if (last_wgs)
         {
-          PTHREAD_LOCK (&scheduler.wq_lock, NULL);
+          PTHREAD_FAST_LOCK (&scheduler.wq_lock_fast);
           LL_DELETE (scheduler.kernel_queue, k);
-          PTHREAD_UNLOCK (&scheduler.wq_lock);
+          PTHREAD_FAST_UNLOCK (&scheduler.wq_lock_fast);
         }
+
       for (i = start_index; i <= end_index; ++i)
         {
-          translate_wg_index_to_3d_index (k, i, (size_t*)&pc.group_id);
+          translate_wg_index_to_3d_index (k, i, pc.group_id,
+                                          slice_size, row_size);
+
 #ifdef DEBUG_MT
-          printf("### exec_wg: gid_x %d, gid_y %d, gid_z %d\n",
+          printf("### exec_wg: gid_x %zu, gid_y %zu, gid_z %zu\n",
                  pc.group_id[0],
                  pc.group_id[1], pc.group_id[2]);
 #endif
+          pocl_set_default_rm ();
           k->workgroup (arguments, &pc);
         }
+    }
+  while (get_wg_index_range (k, &start_index, &end_index, &last_wgs,
+                             thread_data->num_threads));
 
-    }while (get_wg_index_range (k, &start_index, &end_index,  &last_wgs));
-
-
-  free_kernel_arg_array (arguments, k);
+  free_kernel_arg_array_with_locals ((void **)&arguments, (void **)&arguments2,
+                                     k);
 
   return 1;
 }
@@ -273,8 +418,10 @@ void finalize_kernel_command (struct pool_thread_data *thread_data,
   printf("### kernel %s finished\n", k->cmd->command.run.kernel->name);
 #endif
 
+  free_kernel_arg_array (k);
+
   pocl_ndrange_node_cleanup (k->cmd);
-  POCL_UPDATE_EVENT_COMPLETE (&k->cmd->event);
+  POCL_UPDATE_EVENT_COMPLETE (k->cmd->event);
 
   pocl_mem_manager_free_command (k->cmd);
 
@@ -290,51 +437,40 @@ pocl_pthread_prepare_kernel
   unsigned i;
   cl_kernel kernel = cmd->command.run.kernel;
   struct pocl_context *pc = &cmd->command.run.pc;
-  cl_device_id device = NULL;
 
   cmd->device->ops->compile_kernel (cmd, NULL, NULL);
 
-  /* Find which device number within the context correspond
-     to current device.  */
-  for (i = 0; i < kernel->context->num_devices; ++i)
-    {
-      if (kernel->context->devices[i]->data == data)
-        {
-          device = kernel->context->devices[i];
-          break;
-        }
-    }
-
   int num_groups = pc->num_groups[0] * pc->num_groups[1] * pc->num_groups[2];
 
   run_cmd = new_kernel_run_command ();
   run_cmd->data = data;
   run_cmd->kernel = kernel;
-  run_cmd->device = device;
+  run_cmd->device = cmd->device;
   run_cmd->pc = *pc;
   run_cmd->cmd = cmd;
-  run_cmd->group_idx[0] = 0;
-  run_cmd->group_idx[1] = 0;
-  run_cmd->group_idx[2] = 0;
   run_cmd->pc.local_size[0] = cmd->command.run.local_x;
   run_cmd->pc.local_size[1] = cmd->command.run.local_y;
   run_cmd->pc.local_size[2] = cmd->command.run.local_z;
   run_cmd->remaining_wgs = num_groups;
+  run_cmd->wgs_dealt = 0;
   run_cmd->workgroup = cmd->command.run.wg;
   run_cmd->kernel_args = cmd->command.run.arguments;
   run_cmd->next = NULL;
+  run_cmd->ref_count = 0;
+  PTHREAD_FAST_INIT (&run_cmd->lock);
 
-  pthread_scheduler_push_kernel (run_cmd);  
+  setup_kernel_arg_array (run_cmd);
 
+  pthread_scheduler_push_kernel (run_cmd);
 }
 
 static void
-pocl_pthread_exec_command (_cl_command_node * volatile cmd,
+pocl_pthread_exec_command (_cl_command_node *cmd,
                            struct pool_thread_data *td)
 {
   if(cmd->type == CL_COMMAND_NDRANGE_KERNEL)
     {
-      POCL_UPDATE_EVENT_RUNNING(&(cmd->event));
+      POCL_UPDATE_EVENT_RUNNING (cmd->event);
       pocl_pthread_prepare_kernel (cmd->command.run.data, cmd);
     }
   else
@@ -350,11 +486,32 @@ pocl_pthread_driver_thread (void *p)
 {
   struct pool_thread_data *td = (struct pool_thread_data*)p;
   _cl_command_node *cmd = NULL;
+  /* some random value, doesn't matter as long as it's not a valid bool - to
+   * force a first FTZ setup */
+  td->current_ftz = 213;
+  td->num_threads = scheduler.num_threads;
+  td->last_cmd_ignored = NULL;
+
+  assert (scheduler.local_mem_size > 0);
+  td->local_mem = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
+                                       scheduler.local_mem_size);
+
+#ifdef __linux__
+  if (pocl_get_bool_option ("POCL_AFFINITY", 0))
+    {
+      cpu_set_t set;
+      CPU_ZERO (&set);
+      CPU_SET (td->index, &set);
+      pthread_setaffinity_np (td->thread, sizeof (cpu_set_t), &set);
+    }
+#endif
 
   while (1)
     {
       if (scheduler.thread_pool_shutdown_requested)
         {
+          pthread_cond_destroy (&td->wakeup_cond);
+          PTHREAD_DESTROY_LOCK (&td->lock);
           pthread_exit (NULL);
         }
 
@@ -368,6 +525,6 @@ pocl_pthread_driver_thread (void *p)
           ++td->executed_commands;
         }
       // check if its time to sleep
-      pthread_scheduler_sleep();
+      pthread_scheduler_sleep (td);
     }
 }
diff --git a/lib/CL/devices/pthread/pthread_utils.c b/lib/CL/devices/pthread/pthread_utils.c
index 66474b6..11911cc 100644
--- a/lib/CL/devices/pthread/pthread_utils.c
+++ b/lib/CL/devices/pthread/pthread_utils.c
@@ -6,6 +6,8 @@
 #include "pocl-pthread.h"
 #include "pocl_mem_management.h"
 
+#ifdef USE_POCL_MEMMANAGER
+
 static kernel_run_command *volatile kernel_pool = 0;
 static int kernel_pool_initialized = 0;
 static pocl_lock_t kernel_pool_lock;
@@ -56,30 +58,55 @@ void free_kernel_run_command (kernel_run_command *k)
   POCL_UNLOCK (kernel_pool_lock);
 }
 
-void setup_kernel_arg_array(void **arguments, kernel_run_command *k)
+#endif
+
+#define ARGS_SIZE                                                             \
+  (sizeof (void *) * (kernel->num_args + kernel->num_locals + 1))
+
+static char *
+align_ptr (char *p)
+{
+  uintptr_t r = (uintptr_t)p;
+  if (r & (MAX_EXTENDED_ALIGNMENT - 1))
+    {
+      r = r & (~(MAX_EXTENDED_ALIGNMENT - 1));
+      r += MAX_EXTENDED_ALIGNMENT;
+    }
+  return (char *)r;
+}
+
+/* called from kernel setup code.
+ * Sets up the actual arguments, except the local ones. */
+void
+setup_kernel_arg_array (kernel_run_command *k)
 {
-  struct pocl_argument *al;  
+  struct pocl_argument *al;
   cl_kernel kernel = k->kernel;
   cl_uint i;
-
+  void **arguments;
+  void **arguments2;
+  k->arguments = arguments
+      = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, ARGS_SIZE);
+  k->arguments2 = arguments2
+      = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, ARGS_SIZE);
   for (i = 0; i < kernel->num_args; ++i)
     {
       al = &(k->kernel_args[i]);
       if (kernel->arg_info[i].is_local)
         {
-          arguments[i] = malloc (sizeof (void *));
-          *(void **)(arguments[i]) = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, al->size);
+          arguments[i] = NULL;
+          arguments2[i] = NULL;
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER)
       {
-        /* It's legal to pass a NULL pointer to clSetKernelArguments. In 
+        /* It's legal to pass a NULL pointer to clSetKernelArguments. In
            that case we must pass the same NULL forward to the kernel.
            Otherwise, the user must have created a buffer with per device
            pointers stored in the cl_mem. */
-        if (al->value == NULL) 
+        if (al->value == NULL)
           {
-            arguments[i] = malloc (sizeof (void *));
-            *(void **)arguments[i] = NULL;
+            arguments[i] = &arguments2[i];
+            arguments2[i] = NULL;
           }
         else
           {
@@ -94,27 +121,59 @@ void setup_kernel_arg_array(void **arguments, kernel_run_command *k)
         {
           dev_image_t di;
           fill_dev_image_t(&di, al, k->device);
-          void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, 
-                                             sizeof(dev_image_t));
-          arguments[i] = malloc (sizeof (void *));
-          *(void **)(arguments[i]) = devptr;
-          pocl_pthread_write (k->data, &di, devptr, 0, sizeof(dev_image_t));
+          void *devptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT,
+                                              sizeof (dev_image_t));
+          arguments[i] = &arguments2[i];
+          arguments2[i] = devptr;
+          memcpy (devptr, &di, sizeof (dev_image_t));
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
         {
           dev_sampler_t ds;
           fill_dev_sampler_t(&ds, al);
 
-          void* devptr = pocl_memalign_alloc(MAX_EXTENDED_ALIGNMENT, 
-                                             sizeof(dev_sampler_t));
-          arguments[i] = malloc (sizeof (void *));
-          *(void **)(arguments[i]) = devptr;
-          pocl_pthread_write (k->data, &ds, *(void**)arguments[i], 0,
-                              sizeof(dev_sampler_t));
+          void *devptr = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT,
+                                              sizeof (dev_sampler_t));
+          arguments[i] = &arguments2[i];
+          arguments2[i] = devptr;
+          memcpy (devptr, &ds, sizeof (dev_sampler_t));
         }
       else
         arguments[i] = al->value;
     }
+}
+
+/* called from each driver thread.
+ * "arguments" and "arguments2" are the output:
+ * driver-thread-local copies of kern args.
+ *
+ * they're set up by 1) memcpy from kernel_run_command, 2) all
+ * local args are set to thread-local "local memory" storage. */
+void
+setup_kernel_arg_array_with_locals (void **arguments, void **arguments2,
+                                    kernel_run_command *k, char *local_mem,
+                                    size_t local_mem_size)
+{
+  cl_kernel kernel = k->kernel;
+  cl_uint i;
+
+  memcpy (arguments2, k->arguments2, ARGS_SIZE);
+  memcpy (arguments, k->arguments, ARGS_SIZE);
+
+  char *start = local_mem;
+
+  for (i = 0; i < kernel->num_args; ++i)
+    {
+      if (kernel->arg_info[i].is_local)
+        {
+          size_t size = k->kernel_args[i].size;
+          arguments[i] = &arguments2[i];
+          arguments2[i] = start;
+          start += size;
+          start = align_ptr (start);
+          assert ((size_t) (start - local_mem) <= local_mem_size);
+        }
+    }
 
   /* Allocate the automatic local buffers which are implemented as implicit
      extra arguments at the end of the kernel argument list. */
@@ -122,41 +181,66 @@ void setup_kernel_arg_array(void **arguments, kernel_run_command *k)
        i < kernel->num_args + kernel->num_locals;
        ++i)
     {
-      al = &(k->kernel_args[i]);
-      arguments[i] = malloc (sizeof (void *));
-      *(void **)(arguments[i]) = pocl_memalign_alloc (MAX_EXTENDED_ALIGNMENT, al->size);
+      size_t size = k->kernel_args[i].size;
+      arguments[i] = &arguments2[i];
+      arguments2[i] = start;
+      start += size;
+      start = align_ptr (start);
+      assert ((size_t) (start - local_mem) <= local_mem_size);
     }
-
 }
 
-void free_kernel_arg_array (void **arguments, kernel_run_command *k)
+/* called from kernel teardown code.
+ * frees the actual arguments, except the local ones. */
+void
+free_kernel_arg_array (kernel_run_command *k)
 {
   cl_uint i;
   cl_kernel kernel = k->kernel;
+  void **arguments = k->arguments;
+  void **arguments2 = k->arguments2;
+
   for (i = 0; i < kernel->num_args; ++i)
     {
-      if (kernel->arg_info[i].is_local )
+      if (kernel->arg_info[i].is_local)
         {
-          POCL_MEM_FREE(*(void **)(arguments[i]));
-          POCL_MEM_FREE(arguments[i]);
+          assert (arguments[i] == NULL);
+          assert (arguments2[i] == NULL);
         }
       else if (kernel->arg_info[i].type == POCL_ARG_TYPE_IMAGE ||
                 kernel->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
         {
-          POCL_MEM_FREE(*(void **)(arguments[i]));
-          POCL_MEM_FREE(arguments[i]);
+          POCL_MEM_FREE (arguments2[i]);
         }
-      else if (kernel->arg_info[i].type == POCL_ARG_TYPE_POINTER && *(void**)arguments[i] == NULL)
+    }
+
+  POCL_MEM_FREE (k->arguments);
+  POCL_MEM_FREE (k->arguments2);
+}
+
+/* called from each driver thread.
+ * frees the local arguments. */
+void
+free_kernel_arg_array_with_locals (void **arguments, void **arguments2,
+                                   kernel_run_command *k)
+{
+  cl_kernel kernel = k->kernel;
+  cl_uint i;
+
+  for (i = 0; i < kernel->num_args; ++i)
+    {
+      if (kernel->arg_info[i].is_local)
         {
-          POCL_MEM_FREE(arguments[i]);
+          arguments[i] = NULL;
+          arguments2[i] = NULL;
         }
     }
+
   for (i = kernel->num_args;
        i < kernel->num_args + kernel->num_locals;
        ++i)
     {
-      POCL_MEM_FREE(*(void **)(arguments[i]));
-      POCL_MEM_FREE(arguments[i]);
+      arguments[i] = NULL;
+      arguments2[i] = NULL;
     }
 }
-
diff --git a/lib/CL/devices/tce/tce_common.cc b/lib/CL/devices/tce/tce_common.cc
index 677acbc..4a99e7f 100644
--- a/lib/CL/devices/tce/tce_common.cc
+++ b/lib/CL/devices/tce/tce_common.cc
@@ -22,11 +22,12 @@
 */
 #include "tce_common.h"
 #include "pocl_util.h"
+#include "pocl_cache.h"
+#include "pocl_llvm.h"
 #include "utlist.h"
 #include "common.h"
 
 #include "config.h"
-#include "install-paths.h"
 #include "pocl_runtime_config.h"
 #include "pocl_hash.h"
 
@@ -53,13 +54,8 @@
 #include <GlobalScope.hh>
 #include <Environment.hh>
 
-#include "pocl_cache.h"
-
 using namespace TTAMachine;
 
-#define COMMAND_LENGTH 256
-#define WORKGROUP_STRING_LENGTH 1024
-
 #include <algorithm>
 
 #define ALIGNMENT (std::max(ALIGNOF_FLOAT16, ALIGNOF_DOUBLE16))
@@ -72,6 +68,7 @@ TCEDevice::TCEDevice(cl_device_id dev, const char* adfName) :
   ready_list(NULL), command_list(NULL) {
   parent->data = this;
   pthread_mutex_init (&cq_lock, NULL);
+  POCL_INIT_LOCK(tce_compile_lock);
   dev->address_bits = 32;
   dev->autolocals_to_args = 1;
 #if defined(WORDS_BIGENDIAN) && WORDS_BIGENDIAN == 1
@@ -224,9 +221,9 @@ TCEDevice::tceccCommandLine
     }
   else 
     {
-      deviceMainSrc = TCEString(PKGDATADIR) + "/" + mainC;
+      deviceMainSrc = TCEString(POCL_INSTALL_PRIVATE_DATADIR) + "/" + mainC;
       assert(access(deviceMainSrc.c_str(), R_OK) == 0);
-      poclIncludePathSwitch = " -I " PKGDATADIR "/include";
+      poclIncludePathSwitch = " -I " POCL_INSTALL_PRIVATE_DATADIR "/include";
     }
 
   TCEString extraFlags = extraParams;
@@ -404,14 +401,29 @@ void
 pocl_tce_compile_kernel(_cl_command_node *cmd,
                         cl_kernel kernel, cl_device_id device)
 {
-
   if (cmd->type != CL_COMMAND_NDRANGE_KERNEL)
     return;
 
   void* data = cmd->device->data;
   TCEDevice *d = (TCEDevice*)data;
 
-  int error;
+  if (!kernel)
+    kernel = cmd->command.run.kernel;
+  if (!device)
+    device = cmd->device;
+
+  POCL_LOCK(d->tce_compile_lock);
+  int error = pocl_llvm_generate_workgroup_function(device, kernel,
+      cmd->command.run.local_x, cmd->command.run.local_y,
+      cmd->command.run.local_z);
+
+  if (error) {
+    POCL_UNLOCK(d->tce_compile_lock);
+    POCL_MSG_PRINT_GENERAL("TCE: pocl_llvm_generate_workgroup_function()"
+                           " failed for kernel %s\n", kernel->name);
+    assert(error == 0);
+  }
+
   char bytecode[POCL_FILENAME_LENGTH];
 
   assert(d != NULL);
@@ -437,6 +449,8 @@ pocl_tce_compile_kernel(_cl_command_node *cmd,
         POCL_ABORT("Error while running tcecc.");
       }
   }
+
+  POCL_UNLOCK(d->tce_compile_lock);
 }
 
 void
@@ -852,13 +866,13 @@ static void tce_command_scheduler (TCEDevice *d)
   while ((node = d->ready_list))
     {
       assert (pocl_command_is_ready(node->event));
-      CDL_DELETE (d->ready_list, node); 
-      pthread_mutex_unlock (&d->cq_lock);
+      CDL_DELETE (d->ready_list, node);
+      POCL_UNLOCK(d->cq_lock);
       assert (node->event->status == CL_SUBMITTED);
       if (node->type == CL_COMMAND_NDRANGE_KERNEL)
         pocl_tce_compile_kernel(node, NULL, NULL);
       pocl_exec_command(node);
-      pthread_mutex_lock (&d->cq_lock);
+      POCL_LOCK(d->cq_lock);
     }
     
   return;
@@ -868,15 +882,15 @@ void
 pocl_tce_submit (_cl_command_node *node, cl_command_queue /*cq*/)
 {
   TCEDevice *d = (TCEDevice*)node->device->data;
-  cl_event *event = &(node->event);
 
-  POCL_LOCK (d->cq_lock);
-  POCL_UPDATE_EVENT_SUBMITTED(event);
+  POCL_LOCK_OBJ(node->event);
+  node->ready = 1;
+  POCL_LOCK(d->cq_lock);
   pocl_command_push(node, &d->ready_list, &d->command_list);
+  POCL_UNLOCK_OBJ(node->event);
 
   tce_command_scheduler (d);
-
-  POCL_UNLOCK (d->cq_lock);
+  POCL_UNLOCK(d->cq_lock);
 
   return;
 }
@@ -890,14 +904,6 @@ void pocl_tce_flush (cl_device_id device, cl_command_queue /*cq*/)
   POCL_UNLOCK (d->cq_lock);
 }
 
-void
-pocl_tce_push_command (_cl_command_node *node)
-{
-  TCEDevice *d = (TCEDevice*)node->device->data;
-
-  pocl_command_push(node, &d->ready_list, &d->command_list);
-
-}
 
 void
 pocl_tce_join(cl_device_id device, cl_command_queue /*cq*/)
@@ -912,27 +918,30 @@ pocl_tce_join(cl_device_id device, cl_command_queue /*cq*/)
 }
 
 void
-pocl_tce_notify (cl_device_id device, cl_event event)
+pocl_tce_notify (cl_device_id device, cl_event event, cl_event finished)
 {
   TCEDevice *d = (TCEDevice*)device->data;
   _cl_command_node * volatile node = event->command;
-  
-  POCL_LOCK_OBJ (event);
-  if (!(node->ready) && pocl_command_is_ready(node->event))
-    {
-      node->ready = 1;
-      POCL_UNLOCK_OBJ (event);
-      if (node->event->status == CL_SUBMITTED)
-        {
-          POCL_LOCK (d->cq_lock);
-          CDL_DELETE (d->command_list, node);
-          CDL_PREPEND (d->ready_list, node);
-          tce_command_scheduler (d);
-          POCL_UNLOCK (d->cq_lock);
-        }
-      return;
+
+  if (finished->status < CL_COMPLETE) {
+    POCL_UPDATE_EVENT_FAILED(event);
+    return;
+  }
+
+  if (!node->ready)
+    return;
+
+  if (pocl_command_is_ready(event)) {
+    if (event->status == CL_QUEUED) {
+      POCL_UPDATE_EVENT_SUBMITTED(event);
+      POCL_LOCK(d->cq_lock);
+      CDL_DELETE(d->command_list, node);
+      CDL_PREPEND(d->ready_list, node);
+      tce_command_scheduler(d);
+      POCL_UNLOCK(d->cq_lock);
+    }
+    return;
     }
-  POCL_UNLOCK_OBJ (event);
 }
 
 void
diff --git a/lib/CL/devices/tce/tce_common.h b/lib/CL/devices/tce/tce_common.h
index 502508f..c22eb92 100644
--- a/lib/CL/devices/tce/tce_common.h
+++ b/lib/CL/devices/tce/tce_common.h
@@ -122,6 +122,7 @@ class TCEDevice {
   uint64_t globalCycleCount;
 
   pthread_mutex_t cq_lock;
+  pocl_lock_t tce_compile_lock;
   _cl_command_node *volatile ready_list;
   _cl_command_node *volatile command_list;
 };
diff --git a/lib/CL/devices/tce/ttasim/ttasim.cc b/lib/CL/devices/tce/ttasim/ttasim.cc
index 176916e..9a1aca0 100644
--- a/lib/CL/devices/tce/ttasim/ttasim.cc
+++ b/lib/CL/devices/tce/ttasim/ttasim.cc
@@ -103,7 +103,7 @@ pocl_ttasim_init_device_ops(struct pocl_device_ops *ops)
 
 
 void
-pocl_ttasim_init_device_infos(struct _cl_device_id* dev)
+pocl_ttasim_init_device_infos(unsigned j, struct _cl_device_id* dev)
 {
   dev->type = CL_DEVICE_TYPE_GPU;
   dev->max_compute_units = 1;
@@ -138,6 +138,7 @@ pocl_ttasim_init_device_infos(struct _cl_device_id* dev)
   dev->available = CL_TRUE;
   dev->compiler_available = CL_TRUE;
   dev->spmd = CL_FALSE;
+  dev->workgroup_pass = CL_TRUE;
   dev->execution_capabilities = CL_EXEC_KERNEL;
   dev->queue_properties = CL_QUEUE_PROFILING_ENABLE;
   dev->vendor = "TTA-Based Co-design Environment";
@@ -529,14 +530,15 @@ private:
 };
 
 
-void
-pocl_ttasim_init (cl_device_id device, const char* parameters)
+cl_int
+pocl_ttasim_init (unsigned j, cl_device_id device, const char* parameters)
 {
   if (parameters == NULL)
     POCL_ABORT("The tta device requires the adf file as a device parameter.\n"
                "Set it with POCL_TTASIMn_PARAMETERS=\"path/to/themachine.adf\".\n");
-  
-  new TTASimDevice(device, parameters); 
+
+  new TTASimDevice(device, parameters);
+  return CL_SUCCESS;
 }
 
 void
diff --git a/lib/CL/devices/topology/pocl_topology.c b/lib/CL/devices/topology/pocl_topology.c
index 716b1fb..2a7344e 100644
--- a/lib/CL/devices/topology/pocl_topology.c
+++ b/lib/CL/devices/topology/pocl_topology.c
@@ -27,10 +27,60 @@
 
 #include "pocl_topology.h"
 
-void
+#if defined(__x86_64__) || defined(__i386__)
+
+enum VendorSignatures
+{
+  SIG_INTEL = 0x756e6547 /* Genu */,
+  SIG_AMD = 0x68747541 /* Auth */
+};
+
+/// getX86CpuIDAndInfo - Execute the specified cpuid and return the 4 values in
+/// the specified arguments.  If we can't run cpuid on the host, return true.
+static int
+getX86CpuIDAndInfo (unsigned value, unsigned *rEAX, unsigned *rEBX,
+                    unsigned *rECX, unsigned *rEDX)
+{
+#if defined(__GNUC__) || defined(__clang__)
+#if defined(__x86_64__)
+  // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+  __asm__("movq\t%%rbx, %%rsi\n\t"
+          "cpuid\n\t"
+          "xchgq\t%%rbx, %%rsi\n\t"
+          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
+          : "a"(value));
+  return 0;
+#elif defined(__i386__)
+  __asm__("movl\t%%ebx, %%esi\n\t"
+          "cpuid\n\t"
+          "xchgl\t%%ebx, %%esi\n\t"
+          : "=a"(*rEAX), "=S"(*rEBX), "=c"(*rECX), "=d"(*rEDX)
+          : "a"(value));
+  return 0;
+#else
+  return 1;
+#endif
+#elif defined(_MSC_VER)
+  // The MSVC intrinsic is portable across x86 and x64.
+  int registers[4];
+  __cpuid (registers, value);
+  *rEAX = registers[0];
+  *rEBX = registers[1];
+  *rECX = registers[2];
+  *rEDX = registers[3];
+  return 0;
+#else
+  return 1;
+#endif
+}
+
+#endif
+
+int
 pocl_topology_detect_device_info(cl_device_id device)
 {
   hwloc_topology_t pocl_topology;
+  int ret = 0;
 
   /*
    * hwloc's OpenCL backend causes problems at the initialization stage
@@ -44,15 +94,27 @@ pocl_topology_detect_device_info(cl_device_id device)
    */
   setenv ("HWLOC_PLUGINS_PATH", "/dev/null", 1);
 
-  int ret = hwloc_topology_init(&pocl_topology);
+  ret = hwloc_topology_init (&pocl_topology);
   if (ret == -1)
-    POCL_ABORT("Cannot initialize the topology.\n");
-
-  hwloc_topology_set_flags(pocl_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
-
-  ret = hwloc_topology_load(pocl_topology);
+  {
+    POCL_MSG_ERR ("Cannot initialize the topology.\n");
+    return ret;
+  }
+
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO);
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_SYSTEM);
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_GROUP);
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_BRIDGE);
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_MISC);
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_PCI_DEVICE);
+  hwloc_topology_ignore_type (pocl_topology, HWLOC_OBJ_OS_DEVICE);
+
+  ret = hwloc_topology_load (pocl_topology);
   if (ret == -1)
-    POCL_ABORT("Cannot load the topology.\n");
+  {
+    POCL_MSG_ERR ("Cannot load the topology.\n");
+    goto exit_destroy;
+  }
 
   device->global_mem_size =
       hwloc_get_root_obj(pocl_topology)->memory.total_memory;
@@ -62,25 +124,27 @@ pocl_topology_detect_device_info(cl_device_id device)
   if(depth != HWLOC_TYPE_DEPTH_UNKNOWN)
     device->max_compute_units = hwloc_get_nbobjs_by_depth(pocl_topology, depth);
 
-  // A vendor ID for a CPU is not well-defined, so we just use the
-  // PCI vendor ID of a bridge, on the (debatable) assumption that it matches
-  // the CPU vendor (e.g. AMD bridges for AMD CPUs vs Intel bridges for Intel
-  // CPUs). TODO FIXME This is not always true, but we don't have a better
-  // logic for the time
-  do {
-    hwloc_obj_t bridge = NULL;
-    while ((bridge = hwloc_get_next_bridge(pocl_topology, bridge))) {
-      union hwloc_obj_attr_u *attr = bridge->attr;
-      unsigned int vid;
-      if (!attr)
-	continue;
-      vid = attr->bridge.upstream.pci.vendor_id;
-      if (vid) {
-	device->vendor_id = vid;
-	break;
-      }
+#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)
+#if defined(__x86_64__) || defined(__i386__)
+  unsigned Vendor, EAX, ECX, EDX;
+  if (getX86CpuIDAndInfo (0, &EAX, &Vendor, &ECX, &EDX))
+    device->vendor_id = 0x0086;
+  else
+    {
+      if (Vendor == SIG_INTEL)
+        device->vendor_id = 0x8086;
+      else if (Vendor == SIG_AMD)
+        device->vendor_id = 0x1022;
+      else
+        /* unknown x86 */
+        device->vendor_id = 0x0086;
     }
-  } while (0);
+#else
+  device->vendor_id = 0x0000;
+#endif
+#else
+  device->vendor_id = 0x0000;
+#endif
 
   /* Find information about global memory cache by looking at the first
    * cache covering the first PU */
@@ -100,7 +164,9 @@ pocl_topology_detect_device_info(cl_device_id device)
   } while (0);
 
   // Destroy topology object and return
-  hwloc_topology_destroy(pocl_topology);
+exit_destroy:
+  hwloc_topology_destroy (pocl_topology);
+  return ret;
 
 }
 
diff --git a/lib/CL/devices/topology/pocl_topology.h b/lib/CL/devices/topology/pocl_topology.h
index 2d04866..49f949f 100644
--- a/lib/CL/devices/topology/pocl_topology.h
+++ b/lib/CL/devices/topology/pocl_topology.h
@@ -37,7 +37,7 @@
 #pragma GCC visibility push(hidden)
 #endif
 
-void pocl_topology_detect_device_info(cl_device_id device);
+int pocl_topology_detect_device_info(cl_device_id device);
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/CL/pocl_binary.c b/lib/CL/pocl_binary.c
index 3a4a4ff..63d24dc 100644
--- a/lib/CL/pocl_binary.c
+++ b/lib/CL/pocl_binary.c
@@ -47,16 +47,18 @@
   #define le64toh(x) OSSwapLittleToHostInt64(x)
 #endif
 
-#if defined(WORDS_BIGENDIAN) && WORDS_BIGENDIAN == 1
-  static const char host_endian = 1;
-#else
-  static const char host_endian = 0;
-#endif
-
 /* pocl binary identifier */
 #define POCLCC_STRING_ID "poclbin"
 #define POCLCC_STRING_ID_LENGTH 8
-#define POCLCC_VERSION 1
+/* changes for version 2: added program.bc right after header */
+/* changes for version 3: added flush_denorms flag into header */
+/* changes for version 4: kernel library is now linked into
+                          program.bc, so older binaries may fail
+                          to run with "undefined symbol" errors. */
+/* changes for version 5: added program binary_type into header */
+/* changes for version 6: added reqd_wg_size informations into
+                          pocl_binary_kernel structure */
+#define POCLCC_VERSION 6
 
 /* pocl binary structures */
 
@@ -67,6 +69,8 @@
  * 4) files are written as two strings: | uint32_t | relative filename | uint32_t | content |
  */
 
+#define OPENCL_MAX_DIMENSION 3
+
 typedef struct pocl_binary_kernel_s
 {
   /* the first 3 fields are sizes in bytes of the data pieces that follow
@@ -87,11 +91,14 @@ typedef struct pocl_binary_kernel_s
   /* kernel_name string */
   char *kernel_name;
 
-  // number of kernel arguments
+  /* number of kernel arguments */
   uint32_t num_args;
-  // number of kernel local variables
+  /* number of kernel local variables */
   uint32_t num_locals;
 
+  /* required work-group size */
+  uint64_t reqd_wg_size[OPENCL_MAX_DIMENSION];
+
   /* arguments and argument metadata. Note that not everything is stored
    * in the serialized binary */
   struct pocl_argument *dyn_arguments;
@@ -108,10 +115,13 @@ typedef struct pocl_binary_s
   uint32_t version;
   /* number of kernels in the serialized pocl binary */
   uint32_t num_kernels;
+  /* various flags */
+  uint64_t flags;
   /* program->build_hash[device_i], required to restore files into pocl cache */
   SHA1_digest_t program_build_hash;
 } pocl_binary;
 
+#define POCL_BINARY_FLAG_FLUSH_DENORMS (1 << 0)
 
 #define TO_LE(x)                                \
   ((sizeof(x) == 8) ? htole64((uint64_t)x) :    \
@@ -196,6 +206,7 @@ read_header(pocl_binary *b, const unsigned char *buffer)
   BUFFER_READ(b->device_id, uint64_t);
   BUFFER_READ(b->version, uint32_t);
   BUFFER_READ(b->num_kernels, uint32_t);
+  BUFFER_READ (b->flags, uint64_t);
   memcpy(b->program_build_hash, buffer, sizeof(SHA1_digest_t));
   buffer += sizeof(SHA1_digest_t);
   return (unsigned char*)buffer;
@@ -228,11 +239,22 @@ check_binary(cl_device_id device, const unsigned char *binary)
   pocl_binary b;
   unsigned char *p = read_header(&b, binary);
   if (b.version != POCLCC_VERSION)
-    return NULL;
+    {
+      POCL_MSG_WARN ("PoclBinary version %i different from the one "
+                     "recognized by this pocl version (%i)\n",
+                     b.version, POCLCC_VERSION);
+      return NULL;
+    }
   if (strncmp(b.pocl_id, POCLCC_STRING_ID, POCLCC_STRING_ID_LENGTH))
-    return NULL;
+    {
+      POCL_MSG_WARN ("File is not a pocl binary\n");
+      return NULL;
+    }
   if (pocl_binary_get_device_id(device) != b.device_id)
-    return NULL;
+    {
+      POCL_MSG_WARN ("PoclBinary device id mismatch\n");
+      return NULL;
+    }
   return p;
 }
 
@@ -276,11 +298,22 @@ pocl_binary_get_kernel_names(unsigned char *binary,
 
   unsigned char *orig_buffer;
   unsigned i, len;
+
+  /* skip real path of program.bc */
+  BUFFER_READ(len, uint32_t);
+  assert (len > 0);
+  buffer += len;
+
+  /* skip content of program.bc */
+  BUFFER_READ(len, uint32_t);
+  assert (len > 0);
+  buffer += len;
+
   for (i=0; i < num_kernels; i++)
   {
     orig_buffer = buffer;
     BUFFER_READ(struct_size, uint64_t);
-    // skip binaries_size & arginfo_size
+    /* skip binaries_size & arginfo_size */
     buffer += sizeof(uint64_t) + sizeof(uint32_t);
     BUFFER_READ_STR2(kernel_names[i], len);
     kernel_names[i][len] = 0;
@@ -381,6 +414,21 @@ pocl_binary_serialize_kernel_to_buffer(cl_kernel kernel,
   BUFFER_STORE(kernel->num_args, uint32_t);
   BUFFER_STORE(kernel->num_locals, uint32_t);
 
+  if (kernel->reqd_wg_size != NULL)
+    {
+      for (i = 0; i < OPENCL_MAX_DIMENSION; i++)
+        {
+          BUFFER_STORE(kernel->reqd_wg_size[i], uint64_t);
+        }
+    }
+  else
+    {
+      for (i = 0; i < OPENCL_MAX_DIMENSION; i++)
+        {
+          BUFFER_STORE((uint64_t)0, uint64_t);
+        }
+    }
+
   for (i=0; i < (kernel->num_args + kernel->num_locals); i++)
     {
       BUFFER_STORE(kernel->dyn_arguments[i].size, uint64_t);
@@ -501,13 +549,21 @@ pocl_binary_deserialize_kernel_from_buffer (unsigned char **buf,
   BUFFER_READ(kernel->num_args, uint32_t);
   BUFFER_READ(kernel->num_locals, uint32_t);
 
+  for (i = 0; i < OPENCL_MAX_DIMENSION; i++)
+    {
+      BUFFER_READ(kernel->reqd_wg_size[i], uint64_t);
+    }
+
   if (name_len > 0 && name_match)
     {
       *buf = *buf + kernel->struct_size;
-      if (kernel->sizeof_kernel_name != name_len)
-          return CL_INVALID_KERNEL_NAME;
-      if (strncmp (kernel->kernel_name, name_match, kernel->sizeof_kernel_name))
+      if ((kernel->sizeof_kernel_name != name_len)
+          || (strncmp (kernel->kernel_name, name_match,
+                       kernel->sizeof_kernel_name)))
+        {
+          POCL_MEM_FREE (kernel->kernel_name);
           return CL_INVALID_KERNEL_NAME;
+        }
 
       kernel->dyn_arguments = calloc ((kernel->num_args + kernel->num_locals),
                                       sizeof(struct pocl_argument));
@@ -544,6 +600,7 @@ pocl_binary_deserialize_kernel_from_buffer (unsigned char **buf,
     }
 
   *buf = buffer;
+  POCL_MEM_FREE (kernel->kernel_name);
   return CL_SUCCESS;
 
 }
@@ -565,11 +622,24 @@ pocl_binary_serialize(cl_program program, unsigned device_i, size_t *size)
   BUFFER_STORE(pocl_binary_get_device_id(program->devices[device_i]), uint64_t);
   BUFFER_STORE(POCLCC_VERSION, uint32_t);
   BUFFER_STORE(num_kernels, uint32_t);
+  uint64_t flags = 0;
+  if (program->flush_denorms)
+    flags |= POCL_BINARY_FLAG_FLUSH_DENORMS;
+  flags |= (program->binary_type << 1);
+  BUFFER_STORE (flags, uint64_t);
   memcpy(buffer, program->build_hash[device_i], sizeof(SHA1_digest_t));
   buffer += sizeof(SHA1_digest_t);
 
   assert(buffer < end_of_buffer);
 
+  char basedir[POCL_FILENAME_LENGTH];
+  pocl_cache_program_path (basedir, program, device_i);
+  size_t basedir_len = strlen (basedir);
+  char program_bc_path[POCL_FILENAME_LENGTH];
+  pocl_cache_program_bc_path (program_bc_path, program, device_i);
+  POCL_MSG_PRINT_INFO ("serializing program.bc: %s\n", program_bc_path);
+  buffer = serialize_file (program_bc_path, basedir_len, buffer);
+
   unsigned i;
   for (i=0; i < num_kernels; i++)
     {
@@ -592,13 +662,18 @@ pocl_binary_deserialize(cl_program program, unsigned device_i)
 
   pocl_binary b;
   buffer = read_header(&b, buffer);
+  program->flush_denorms = (b.flags & POCL_BINARY_FLAG_FLUSH_DENORMS);
+  program->binary_type = (b.flags >> 1);
 
   //assert(pocl_binary_check_binary_header(&b));
   assert (buffer < end_of_buffer);
 
-  pocl_binary_kernel k;
   char basedir[POCL_FILENAME_LENGTH];
+  pocl_cache_program_path (basedir, program, device_i);
+  size_t basedir_len = strlen (basedir);
+  buffer += deserialize_file (buffer, basedir, basedir_len);
 
+  pocl_binary_kernel k;
   unsigned i;
   for (i = 0; i < b.num_kernels; i++)
     {
@@ -669,6 +744,17 @@ pocl_binary_get_kernel_metadata (unsigned char *binary, const char *kernel_name,
                         "Deserialized a binary, but it doesn't seem to be "
                         "for this device.\n");
 
+  size_t len;
+  /* skip real path of program.bc */
+  BUFFER_READ(len, uint32_t);
+  assert (len > 0);
+  buffer += len;
+
+  /* skip content of program.bc */
+  BUFFER_READ(len, uint32_t);
+  assert (len > 0);
+  buffer += len;
+
   unsigned j;
   assert (b.num_kernels > 0);
   for (j = 0; j < b.num_kernels; j++)
@@ -687,10 +773,16 @@ pocl_binary_get_kernel_metadata (unsigned char *binary, const char *kernel_name,
   kernel->num_locals = k.num_locals;
   kernel->dyn_arguments = k.dyn_arguments;
   kernel->arg_info = k.arg_info;
-  free (k.kernel_name);
+  POCL_MEM_FREE (k.kernel_name);
 
-  POCL_RETURN_ERROR_COND ((kernel->reqd_wg_size = calloc(3, sizeof(int))) == NULL,
+  POCL_RETURN_ERROR_COND ((kernel->reqd_wg_size = calloc (OPENCL_MAX_DIMENSION, sizeof (size_t)))
+                              == NULL,
                           CL_OUT_OF_HOST_MEMORY);
 
+  for (j = 0; j < OPENCL_MAX_DIMENSION; j++)
+    {
+      kernel->reqd_wg_size[j] = k.reqd_wg_size[j];
+    }
+
   return CL_SUCCESS;
 }
diff --git a/lib/CL/pocl_build.c b/lib/CL/pocl_build.c
new file mode 100644
index 0000000..9d8346a
--- /dev/null
+++ b/lib/CL/pocl_build.c
@@ -0,0 +1,819 @@
+/* OpenCL runtime library: compile_and_link_program()
+
+   Copyright (c) 2011-2013 Universidad Rey Juan Carlos,
+                 2011-2014 Pekka Jääskeläinen / Tampere Univ. of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to
+   deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+   FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_cl.h"
+#include <assert.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#ifndef _MSC_VER
+#  include <unistd.h>
+#else
+#  include "vccompat.hpp"
+#endif
+#ifdef OCS_AVAILABLE
+#include "pocl_llvm.h"
+#endif
+#include "pocl_util.h"
+#include "pocl_file_util.h"
+#include "pocl_cache.h"
+#include "config.h"
+#include "pocl_runtime_config.h"
+#include "pocl_binary.h"
+#include "pocl_shared.h"
+
+#define REQUIRES_CR_SQRT_DIV_ERR                                              \
+  "-cl-fp32-correctly-rounded-divide-sqrt build option "                      \
+  "was specified, but CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT "                   \
+  "is not set for device"
+
+#define REQUIRES_SPIR_SUPPORT "SPIR support is not available for device "
+
+/* supported compiler parameters which should pass to the frontend directly
+   by using -Xclang */
+static const char cl_parameters[] =
+  "-cl-single-precision-constant "
+  "-cl-fp32-correctly-rounded-divide-sqrt "
+  "-cl-opt-disable "
+  "-cl-mad-enable "
+  "-cl-unsafe-math-optimizations "
+  "-cl-finite-math-only "
+  "-cl-fast-relaxed-math "
+  "-cl-std=CL1.2 "
+  "-cl-std=CL1.1 "
+  "-cl-std=CL2.0 "
+  "-cl-kernel-arg-info "
+  "-w "
+  "-g "
+  "-Werror ";
+
+/*
+static const char cl_library_link_options[] =
+  "-create-library "
+  "-enable-link-options ";
+*/
+
+static const char cl_program_link_options[] =
+  "-cl-denorms-are-zero "
+  "-cl-no-signed-zeros "
+  "-cl-unsafe-math-optimizations "
+  "-cl-finite-math-only "
+  "-cl-fast-relaxed-math ";
+
+static const char cl_parameters_supported_after_clang_3_9[] =
+  "-cl-strict-aliasing " /* deprecated after OCL1.0 */
+  "-cl-denorms-are-zero "
+  "-cl-no-signed-zeros ";
+
+static const char cl_parameters_not_yet_supported_by_clang[] =
+  "-cl-uniform-work-group-size ";
+
+#define MEM_ASSERT(x, err_jmp) do{ if (x){errcode = CL_OUT_OF_HOST_MEMORY;goto err_jmp;}} while(0)
+
+// append token, growing modded_options, if necessary, by max(strlen(token)+1, 256)
+#define APPEND_TOKEN()                                                        \
+  do                                                                          \
+    {                                                                         \
+      needed = strlen (token) + 1;                                            \
+      assert (size > (i + needed));                                           \
+      i += needed;                                                            \
+      strcat (modded_options, token);                                         \
+      strcat (modded_options, " ");                                           \
+    }                                                                         \
+  while (0)
+
+#define APPEND_TO_MAIN_BUILD_LOG(...)  \
+  POCL_MSG_ERR(__VA_ARGS__);   \
+  {                            \
+    size_t l = strlen(program->main_build_log); \
+    snprintf(program->main_build_log + l, (640 - l), __VA_ARGS__); \
+  }
+
+#ifdef OCS_AVAILABLE
+cl_int
+program_compile_dynamic_wg_binaries(cl_program program)
+{
+  unsigned i, device_i;
+  cl_int errcode = CL_SUCCESS;
+  _cl_command_node cmd;
+
+  assert(program->num_kernels);
+  assert(program->build_status == CL_BUILD_SUCCESS);
+
+  memset(&cmd, 0, sizeof(_cl_command_node));
+  cmd.type = CL_COMMAND_NDRANGE_KERNEL;
+  char cachedir[POCL_FILENAME_LENGTH];
+  cmd.command.run.tmp_dir = cachedir;
+  POCL_LOCK_OBJ(program);
+
+  /* Build the dynamic WG sized parallel.bc and device specific code,
+     for each kernel & device combo.  */
+  for (device_i = 0; device_i < program->num_devices; ++device_i)
+    {
+      cl_device_id device = program->devices[device_i];
+
+      /* program may not be built for some of its devices */
+      if (program->pocl_binaries[device_i] || (!program->binaries[device_i]))
+        continue;
+
+      cmd.device = device;
+
+      for (i=0; i < program->num_kernels; i++)
+        {
+          cl_kernel kernel = program->default_kernels[i];
+          size_t local_x = 0, local_y = 0, local_z = 0;
+          if (kernel->reqd_wg_size != NULL &&
+              kernel->reqd_wg_size[0] > 0 &&
+              kernel->reqd_wg_size[1] > 0 &&
+              kernel->reqd_wg_size[2] > 0)
+            {
+              local_x = kernel->reqd_wg_size[0];
+              local_y = kernel->reqd_wg_size[1];
+              local_z = kernel->reqd_wg_size[2];
+            }
+          cmd.command.run.local_x = local_x;
+          cmd.command.run.local_y = local_y;
+          cmd.command.run.local_z = local_z;
+          cmd.command.run.kernel = kernel;
+          pocl_cache_kernel_cachedir_path (cachedir, program, device_i, kernel,
+                                           "", local_x, local_y, local_z);
+          device->ops->compile_kernel (&cmd, kernel, device);
+        }
+    }
+
+  POCL_UNLOCK_OBJ(program);
+  return errcode;
+}
+
+#endif
+
+/* options must be non-NULL.
+ * modded_options[size] + link_options are preallocated outputs
+ */
+static cl_int
+process_options (const char *options, char *modded_options, char *link_options,
+                 cl_program program, int compiling, int linking,
+                 int *create_library, unsigned *flush_denorms,
+                 int *requires_correctly_rounded_sqrt_div,
+                 int *spir_build, size_t size)
+{
+  cl_int error;
+  char *token = NULL;
+  char *saveptr = NULL;
+
+  *create_library = 0;
+  *flush_denorms = 0;
+  *requires_correctly_rounded_sqrt_div = 0;
+  *spir_build = 0;
+  int enable_link_options = 0;
+  link_options[0] = 0;
+  modded_options[0] = 0;
+  int ret_error = (linking ? (compiling ? CL_INVALID_BUILD_OPTIONS
+                                        : CL_INVALID_LINKER_OPTIONS)
+                           : CL_INVALID_COMPILER_OPTIONS);
+
+  assert (options);
+  assert (modded_options);
+  assert (compiling || linking);
+
+  size_t i = 1; /* terminating char */
+  size_t needed = 0;
+  char *temp_options = strdup (options);
+
+  token = strtok_r (temp_options, " ", &saveptr);
+  while (token != NULL)
+    {
+      /* check if parameter is supported compiler parameter */
+      if (memcmp (token, "-cl", 3) == 0 || memcmp (token, "-w", 2) == 0
+          || memcmp (token, "-Werror", 7) == 0)
+        {
+          if (strstr (cl_program_link_options, token))
+            {
+              /* when linking, only a subset of -cl* options are valid,
+               * and only with -enable-link-options */
+              if (linking && (!compiling))
+                {
+                  if (!enable_link_options)
+                    {
+                      APPEND_TO_MAIN_BUILD_LOG (
+                          "Not compiling but link options were not enabled, "
+                          "therefore %s is an invalid option\n",
+                          token);
+                      error = ret_error;
+                      goto ERROR;
+                    }
+                  strcat (link_options, token);
+                }
+              if (strstr (token, "-cl-denorms-are-zero"))
+                {
+                  *flush_denorms = 1;
+                }
+              if (strstr (token, "-cl-fp32-correctly-rounded-divide-sqrt"))
+                {
+                  *requires_correctly_rounded_sqrt_div = 1;
+                }
+            }
+          if (strstr (cl_parameters, token))
+            {
+              /* the LLVM API call pushes the parameters directly to the
+                 frontend without using -Xclang */
+            }
+          else if (strstr (cl_parameters_supported_after_clang_3_9, token))
+            {
+#ifndef LLVM_OLDER_THAN_3_9
+/* the LLVM API call pushes the parameters directly to the
+ * frontend without using -Xclang*/
+#else
+              APPEND_TO_MAIN_BUILD_LOG (
+                  "This build option is supported after clang3.9: %s\n",
+                  token);
+              token = strtok_r (NULL, " ", &saveptr);
+              continue;
+#endif
+            }
+          else if (strstr (cl_parameters_not_yet_supported_by_clang, token))
+            {
+              APPEND_TO_MAIN_BUILD_LOG (
+                  "This build option is not yet supported by clang: %s\n",
+                  token);
+              token = strtok_r (NULL, " ", &saveptr);
+              continue;
+            }
+          else
+            {
+              APPEND_TO_MAIN_BUILD_LOG("Invalid build option: %s\n", token);
+              error = ret_error;
+              goto ERROR;
+            }
+        }
+      else if (memcmp (token, "-g", 2) == 0)
+        {
+#ifndef LLVM_OLDER_THAN_3_8
+          token = "-debug-info-kind=line-tables-only";
+#endif
+        }
+      else if (memcmp (token, "-D", 2) == 0 || memcmp (token, "-I", 2) == 0)
+        {
+          APPEND_TOKEN();
+          /* if there is a space in between, then next token is part
+             of the option */
+          if (strlen (token) == 2)
+            token = strtok_r (NULL, " ", &saveptr);
+          else
+            {
+              token = strtok_r (NULL, " ", &saveptr);
+              continue;
+            }
+        }
+      else if (memcmp (token, "-x", 2) == 0 && strlen (token) == 2)
+        {
+          /* only "-x spir" is valid for the "-x" option */
+          token = strtok_r (NULL, " ", &saveptr);
+          if (!token || memcmp (token, "spir", 4) != 0)
+            {
+              APPEND_TO_MAIN_BUILD_LOG (
+                  "Invalid parameter to -x build option\n");
+              error = ret_error;
+              goto ERROR;
+            }
+          /* "-x spir" is not valid if we are building from source */
+          else if (program->source)
+            {
+              APPEND_TO_MAIN_BUILD_LOG (
+                  "\"-x spir\" is not valid when building from source\n");
+              error = ret_error;
+              goto ERROR;
+            }
+          else
+            *spir_build = 1;
+          token = strtok_r (NULL, " ", &saveptr);
+          continue;
+        }
+      else if (memcmp (token, "-spir-std=1.2", 13) == 0)
+        {
+          /* "-spir-std=" flags are not valid when building from source */
+          if (program->source)
+            {
+              APPEND_TO_MAIN_BUILD_LOG ("\"-spir-std=\" flag is not valid "
+                                        "when building from source\n");
+              error = ret_error;
+              goto ERROR;
+            }
+          else
+            *spir_build = 1;
+          token = strtok_r (NULL, " ", &saveptr);
+          continue;
+        }
+      else if (memcmp (token, "-create-library", 15) == 0)
+        {
+          if (!linking)
+            {
+              APPEND_TO_MAIN_BUILD_LOG (
+                  "\"-create-library\" flag is only valid when linking\n");
+              error = ret_error;
+              goto ERROR;
+            }
+          *create_library = 1;
+          token = strtok_r (NULL, " ", &saveptr);
+          continue;
+        }
+      else if (memcmp (token, "-enable-link-options", 20) == 0)
+        {
+          if (!linking)
+            {
+              APPEND_TO_MAIN_BUILD_LOG ("\"-enable-link-options\" flag is "
+                                        "only valid when linking\n");
+              error = ret_error;
+              goto ERROR;
+            }
+          if (!(*create_library))
+            {
+              APPEND_TO_MAIN_BUILD_LOG ("\"-enable-link-options\" flag is "
+                                        "only valid when -create-library "
+                                        "option was given\n");
+              error = ret_error;
+              goto ERROR;
+            }
+          enable_link_options = 1;
+          token = strtok_r (NULL, " ", &saveptr);
+          continue;
+        }
+      else
+        {
+          APPEND_TO_MAIN_BUILD_LOG ("Invalid build option: %s\n", token);
+          error = ret_error;
+          goto ERROR;
+        }
+      APPEND_TOKEN ();
+      token = strtok_r (NULL, " ", &saveptr);
+    }
+
+  error = CL_SUCCESS;
+
+  /* remove trailing whitespace */
+  i = strlen (modded_options);
+  if ((i > 0) && (modded_options[i - 1] == ' '))
+    modded_options[i - 1] = 0;
+ERROR:
+  POCL_MEM_FREE (temp_options);
+  return error;
+}
+
+static void
+clean_program_on_rebuild (cl_program program)
+{
+  /* if we're rebuilding the program, release the kernels and reset log/status
+   */
+  size_t i;
+  if ((program->build_status != CL_BUILD_NONE) || program->num_kernels > 0)
+    {
+      cl_kernel k;
+      for (k = program->kernels; k != NULL; k = k->next)
+        {
+          k->program = NULL;
+          --program->pocl_refcount;
+        }
+      program->kernels = NULL;
+      if (program->num_kernels)
+        {
+          program->operating_on_default_kernels = 1;
+          for (i = 0; i < program->num_kernels; i++)
+            {
+              if (program->kernel_names)
+                POCL_MEM_FREE (program->kernel_names[i]);
+              if (program->default_kernels && program->default_kernels[i])
+                POname (clReleaseKernel) (program->default_kernels[i]);
+            }
+          POCL_MEM_FREE (program->kernel_names);
+          POCL_MEM_FREE (program->default_kernels);
+          program->operating_on_default_kernels = 0;
+        }
+      program->num_kernels = 0;
+      program->build_status = CL_BUILD_NONE;
+      if (program->build_log)
+        for (i = 0; i < program->num_devices; ++i)
+          {
+            POCL_MEM_FREE (program->build_log[i]);
+            memset (program->build_hash[i], 0, sizeof (SHA1_digest_t));
+          }
+    }
+}
+
+cl_int
+compile_and_link_program(int compile_program,
+                         int link_program,
+                         cl_program program,
+
+                         cl_uint num_devices,
+                         const cl_device_id *device_list,
+                         const char *options,
+
+                         cl_uint num_input_headers,
+                         const cl_program *input_headers,
+                         const char **header_include_names,
+
+                         cl_uint num_input_programs,
+                         const cl_program *input_programs,
+
+                         void (CL_CALLBACK *pfn_notify) (cl_program program,
+                                                         void *user_data),
+                         void *user_data)
+{
+  char program_bc_path[POCL_FILENAME_LENGTH];
+  char link_options[512];
+  int errcode, error;
+  int create_library = 0;
+  int requires_cr_sqrt_div = 0;
+  int spir_build = 0;
+  unsigned flush_denorms = 0;
+  uint64_t fsize;
+  cl_device_id *unique_devlist = NULL;
+  char *binary = NULL;
+  unsigned device_i = 0, actually_built = 0;
+  size_t i, j;
+  void *write_cache_lock = NULL;
+  int build_error_code
+      = (link_program ? CL_BUILD_PROGRAM_FAILURE : CL_COMPILE_PROGRAM_FAILURE);
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (program == NULL), CL_INVALID_PROGRAM);
+
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices > 0 && device_list == NULL),
+                        CL_INVALID_VALUE);
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (num_devices == 0 && device_list != NULL),
+                        CL_INVALID_VALUE);
+
+  POCL_GOTO_LABEL_COND (PFN_NOTIFY, (pfn_notify == NULL && user_data != NULL),
+                        CL_INVALID_VALUE);
+
+  POCL_GOTO_LABEL_ON (PFN_NOTIFY, program->kernels, CL_INVALID_OPERATION,
+                      "Program already has kernels\n");
+
+  POCL_GOTO_LABEL_ON (PFN_NOTIFY,
+                      (program->source == NULL && program->binaries == NULL),
+                      CL_INVALID_PROGRAM,
+                      "Program doesn't have sources or binaries! You need "
+                      "to call clCreateProgramWith{Binary|Source} first\n");
+
+  POCL_GOTO_LABEL_ON (PFN_NOTIFY,
+                      ((program->source == NULL) && (link_program == 0)),
+                      CL_INVALID_OPERATION,
+                      "Cannot clCompileProgram when program has no source\n");
+
+  POCL_LOCK_OBJ (program);
+
+  program->main_build_log[0] = 0;
+
+  /* TODO this should be somehow utilized at linking */
+  POCL_MEM_FREE (program->compiler_options);
+
+  if (options)
+    {
+      i = strlen (options);
+      size_t size = i + 512; /* add some space for pocl-added options */
+      program->compiler_options = (char *)malloc (size);
+      errcode = process_options (options, program->compiler_options,
+                                 link_options, program, compile_program,
+                                 link_program, &create_library, &flush_denorms,
+                                 &requires_cr_sqrt_div, &spir_build, size);
+      if (errcode != CL_SUCCESS)
+        goto ERROR_CLEAN_OPTIONS;
+    }
+
+  POCL_MSG_PRINT_INFO ("building program with options %s\n",
+                       program->compiler_options);
+
+
+  program->flush_denorms = flush_denorms;
+#if !(defined(__x86_64__) && defined(__GNUC__))
+  if (flush_denorms)
+    {
+      POCL_MSG_WARN ("flush to zero is currently only implemented for "
+                     "x86-64 & gcc/clang, ignoring flag\n");
+    }
+#endif
+
+  /* DEVICE LIST */
+  if (num_devices == 0)
+    {
+      num_devices = program->num_devices;
+      device_list = program->devices;
+    }
+  else
+    {
+      // convert subdevices to devices and remove duplicates
+      cl_uint real_num_devices = 0;
+      unique_devlist = pocl_unique_device_list (device_list, num_devices,
+                                                &real_num_devices);
+      num_devices = real_num_devices;
+      device_list = unique_devlist;
+    }
+
+  clean_program_on_rebuild (program);
+
+  /* Build the fully linked non-parallel bitcode for all
+         devices. */
+  for (device_i = 0; device_i < program->num_devices; ++device_i)
+    {
+      cl_device_id device = program->devices[device_i];
+
+      /* find the device in the supplied devices-to-build-for list */
+      int found = 0;
+      for (i = 0; i < num_devices; ++i)
+          if (device_list[i] == device) found = 1;
+      if (!found) continue;
+
+      if (requires_cr_sqrt_div
+          && !(device->single_fp_config & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT))
+        {
+          APPEND_TO_MAIN_BUILD_LOG (REQUIRES_CR_SQRT_DIV_ERR);
+          POCL_GOTO_ERROR_ON (1, build_error_code,
+                              REQUIRES_CR_SQRT_DIV_ERR " %s\n",
+                              device->short_name);
+        }
+      actually_built++;
+
+      /* clCreateProgramWithSource */
+      if (program->source)
+        {
+          POCL_MSG_PRINT_INFO("building from sources for device %d\n", device_i);
+#ifdef OCS_AVAILABLE
+          error = pocl_llvm_build_program(
+              program, device_i, program->compiler_options, program_bc_path,
+              num_input_headers, input_headers, header_include_names,
+              (create_library ? 0 : link_program));
+          POCL_GOTO_ERROR_ON ((error != 0), build_error_code,
+                              "pocl_llvm_build_program() failed\n");
+#else
+          strcpy(program->main_build_log,
+                 "Cannot build a program from sources with pocl "
+                 "that does not have online compiler support\n");
+          POCL_GOTO_ERROR_ON(1, CL_COMPILER_NOT_AVAILABLE,
+                             "%s", program->main_build_log);
+#endif
+        }
+      /* clCreateProgramWithBinaries */
+      else if (program->binaries[device_i]
+               && (program->pocl_binaries[device_i] == NULL))
+        {
+#ifdef OCS_AVAILABLE
+          int spir_binary = bitcode_is_spir ((char*)program->binaries[device_i],
+                                             program->binary_sizes[device_i]);
+          if ((spir_binary || spir_build)
+              && (!strstr (device->extensions, "cl_khr_spir")))
+            {
+              APPEND_TO_MAIN_BUILD_LOG (REQUIRES_SPIR_SUPPORT);
+              POCL_GOTO_ERROR_ON (1, build_error_code,
+                                  REQUIRES_SPIR_SUPPORT " %s\n",
+                                  device->short_name);
+            }
+
+          POCL_MSG_PRINT_INFO ("building from a BC binary for device %d\n",
+                               device_i);
+          error = pocl_cache_create_program_cachedir(program, device_i,
+                                                     NULL, 0, program_bc_path);
+          POCL_GOTO_ERROR_ON((error != 0), CL_BUILD_PROGRAM_FAILURE,
+                             "Could not create program cachedir");
+          write_cache_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+          assert(write_cache_lock);
+          errcode = pocl_write_file(program_bc_path, (char*)program->binaries[device_i],
+                          (uint64_t)program->binary_sizes[device_i], 0, 0);
+          POCL_GOTO_ERROR_ON(errcode, CL_BUILD_PROGRAM_FAILURE,
+                             "Failed to write binaries to program.bc\n");
+#else
+          strcpy (program->main_build_log,
+                  "Cannot build program from LLVM IR binaries with "
+                  "pocl that does not have online compiler support\n");
+          POCL_GOTO_ERROR_ON (1, CL_COMPILER_NOT_AVAILABLE, "%s",
+                              program->main_build_log);
+#endif
+        }
+      else if (program->pocl_binaries[device_i])
+        {
+          POCL_MSG_PRINT_INFO("having a poclbinary for device %d\n", device_i);
+#ifdef OCS_AVAILABLE
+          if (program->binaries[device_i] == NULL)
+            {
+              POCL_MSG_WARN (
+                  "pocl-binary for this device doesn't contain "
+                  "program.bc - you won't be able to rebuild/link it\n");
+              /* do not try to read program.bc or LLVM IRs
+               * TODO maybe read LLVM IRs ?*/
+              continue;
+            }
+#else
+          continue;
+#endif
+        }
+      else if (link_program && (num_input_programs > 0))
+        {
+#ifdef OCS_AVAILABLE
+          /* just link binaries. */
+          unsigned char *cur_device_binaries[num_input_programs];
+          size_t cur_device_binary_sizes[num_input_programs];
+          void *cur_llvm_irs[num_input_programs];
+          for (j = 0; j < num_input_programs; j++)
+            {
+              assert (device == input_programs[j]->devices[device_i]);
+              cur_device_binaries[j] = input_programs[j]->binaries[device_i];
+
+              assert (cur_device_binaries[j]);
+              cur_device_binary_sizes[j]
+                  = input_programs[j]->binary_sizes[device_i];
+
+              if (input_programs[j]->llvm_irs[device_i] == NULL)
+                pocl_update_program_llvm_irs (input_programs[j], device_i,
+                                              device);
+
+              cur_llvm_irs[j] = input_programs[j]->llvm_irs[device_i];
+              assert (cur_llvm_irs[j]);
+            }
+          error = pocl_llvm_link_program (program, device_i,
+              program_bc_path, num_input_programs,
+              cur_device_binaries, cur_device_binary_sizes,
+              cur_llvm_irs, create_library);
+          POCL_GOTO_ERROR_ON ((error != CL_SUCCESS), CL_LINK_PROGRAM_FAILURE,
+                              "pocl_llvm_link_program() failed\n");
+#else
+          POCL_GOTO_ERROR_ON ((1), CL_LINK_PROGRAM_FAILURE,
+                              "clCompileProgram/clLinkProgram/clBuildProgram"
+                              " require a pocl built with LLVM\n");
+
+#endif
+        }
+      else
+        {
+          POCL_GOTO_ERROR_ON (1, CL_INVALID_BINARY,
+                              "No sources nor binaries for device %s - can't "
+                              "build the program\n", device->short_name);
+        }
+
+#ifdef OCS_AVAILABLE
+      /* Read binaries from program.bc to memory */
+      if (program->binaries[device_i] == NULL)
+        {
+          errcode = pocl_read_file(program_bc_path, &binary, &fsize);
+          POCL_GOTO_ERROR_ON(errcode, CL_BUILD_ERROR,
+                             "Failed to read binaries from program.bc to "
+                             "memory: %s\n", program_bc_path);
+
+          program->binary_sizes[device_i] = (size_t)fsize;
+          program->binaries[device_i] = (unsigned char *)binary;
+        }
+
+      if (program->llvm_irs[device_i] == NULL)
+        {
+          pocl_update_program_llvm_irs(program, device_i, device);
+        }
+      /* Maintain a 'last_accessed' file in every program's
+       * cache directory. Will be useful for cache pruning script
+       * that flushes old directories based on LRU */
+      pocl_cache_update_program_last_access(program, device_i);
+
+      if (write_cache_lock)
+        {
+          pocl_cache_release_lock(write_cache_lock);
+          write_cache_lock = NULL;
+        }
+
+#endif
+
+    }
+
+  POCL_GOTO_ERROR_ON ((actually_built < num_devices), build_error_code,
+                      "Some of the devices on the argument-supplied list are"
+                      "not available for the program, or do not exist\n");
+
+  program->build_status = CL_BUILD_SUCCESS;
+  program->binary_type = CL_PROGRAM_BINARY_TYPE_EXECUTABLE;
+  /* if program will be compiled using clCompileProgram its binary_type
+   * will be set to CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT.
+   *
+   * if program was created by clLinkProgram which is called
+   * with the –createlibrary link option its binary_type will be set to
+   * CL_PROGRAM_BINARY_TYPE_LIBRARY.
+   */
+  if (create_library)
+    program->binary_type = CL_PROGRAM_BINARY_TYPE_LIBRARY;
+  if (compile_program && !link_program)
+    program->binary_type = CL_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+
+  assert(program->num_kernels == 0);
+  for (i=0; i < program->num_devices; i++)
+    {
+#ifdef OCS_AVAILABLE
+      if (program->binaries[i])
+        {
+          program->num_kernels = pocl_llvm_get_kernel_count(program);
+          if (program->num_kernels)
+            {
+              program->kernel_names = calloc(program->num_kernels, sizeof(char*));
+              pocl_llvm_get_kernel_names(program,
+                                         program->kernel_names,
+                                         program->num_kernels);
+            }
+          break;
+        }
+#endif
+      if (program->pocl_binaries[i])
+        {
+          program->num_kernels =
+              pocl_binary_get_kernel_count(program->pocl_binaries[i]);
+          if (program->num_kernels)
+            {
+              program->kernel_names = calloc(program->num_kernels, sizeof(char*));
+              pocl_binary_get_kernel_names(program->pocl_binaries[i],
+                                           program->kernel_names,
+                                           program->num_kernels);
+            }
+          break;
+        }
+    }
+  POCL_GOTO_ERROR_ON((i >= program->num_devices),
+                     CL_INVALID_BINARY,
+                     "Could not set kernel number / names from the binary\n");
+
+  /* Set up all program kernels.  */
+  assert (program->default_kernels == NULL);
+  program->operating_on_default_kernels = 1;
+  if (program->num_kernels > 0)
+    program->default_kernels
+        = calloc (program->num_kernels, sizeof (cl_kernel));
+
+  for (i=0; i < program->num_kernels; i++)
+    {
+      program->default_kernels[i] =
+          POname (clCreateKernel) (program, program->kernel_names[i], &errcode);
+      POCL_GOTO_ERROR_ON ((errcode != CL_SUCCESS), build_error_code,
+                          "Failed to create default kernels\n");
+    }
+  program->operating_on_default_kernels = 0;
+
+  errcode = CL_SUCCESS;
+  goto FINISH;
+
+ERROR:
+  program->kernels = 0;
+  for(i = 0; i < program->num_devices; i++)
+  {
+    if (program->source)
+      {
+        POCL_MEM_FREE (program->binaries[i]);
+        program->binary_sizes[i] = 0;
+      }
+    pocl_cache_release_lock(program->read_locks[i]);
+    program->read_locks[i] = NULL;
+  }
+  if (program->num_kernels && program->kernel_names)
+    {
+      for (i=0; i < program->num_kernels; i++)
+        POCL_MEM_FREE(program->kernel_names[i]);
+      POCL_MEM_FREE(program->kernel_names);
+    }
+  if (program->default_kernels)
+    {
+      program->operating_on_default_kernels = 1;
+      for (i=0; i < program->num_kernels; i++)
+        if (program->default_kernels[i])
+          POname(clReleaseKernel)(program->default_kernels[i]);
+      program->operating_on_default_kernels = 0;
+      POCL_MEM_FREE(program->default_kernels);
+    }
+
+  pocl_cache_release_lock(write_cache_lock);
+
+ERROR_CLEAN_OPTIONS:
+  program->build_status = CL_BUILD_ERROR;
+
+FINISH:
+  POCL_UNLOCK_OBJ(program);
+  POCL_MEM_FREE (unique_devlist);
+
+PFN_NOTIFY:
+  if (pfn_notify)
+    pfn_notify (program, user_data);
+
+  return errcode;
+}
diff --git a/lib/CL/pocl_cache.c b/lib/CL/pocl_cache.c
index 2bf5eeb..61af313 100644
--- a/lib/CL/pocl_cache.c
+++ b/lib/CL/pocl_cache.c
@@ -21,16 +21,14 @@
    THE SOFTWARE.
 */
 
+#include <errno.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <string.h>
 #include <unistd.h>
 
 #include "config.h"
-
-#ifdef POCL_BUILT_WITH_CMAKE
 #include "pocl_build_timestamp.h"
-#endif
 
 #ifdef OCS_AVAILABLE
 #include "kernellib_hash.h"
@@ -233,32 +231,86 @@ void* pocl_cache_acquire_reader_lock(cl_program program,
 
 /******************************************************************************/
 
-void pocl_cache_mk_temp_name(char* path) {
-    assert(cache_topdir_initialized);
+static void
+pocl_cache_mk_temp_name (char *path_template, unsigned suffix_len, int *ret_fd)
+{
+  assert (cache_topdir_initialized);
 #if defined(_MSC_VER) || defined(__MINGW32__)
     char* tmp = _tempnam(cache_topdir, "pocl_");
     assert(tmp);
-    int bytes_written = snprintf(path, POCL_FILENAME_LENGTH, "%s", tmp);
+    int bytes_written
+        = snprintf (path_template, POCL_FILENAME_LENGTH, "%s", tmp);
     free(tmp);
     assert(bytes_written > 0 && bytes_written < POCL_FILENAME_LENGTH);
 #else
-    int bytes_written = snprintf(path, POCL_FILENAME_LENGTH,
-             "%s/temp_XXXXXX.cl", cache_topdir);
-    assert(bytes_written > 0 && bytes_written < POCL_FILENAME_LENGTH);
     /* using mkstemp() instead of tmpnam() has no real benefit
      * here, as we have to pass the filename to llvm,
      * but tmpnam() generates an annoying warning... */
-    int fd = mkstemps(path, 3);
-    assert(fd >= 0);
-    close(fd);
+    int fd;
+
+    if (suffix_len)
+      fd = mkstemps (path_template, suffix_len);
+    else
+      fd = mkstemp (path_template);
+
+    if (fd < 0)
+      {
+        char buf[512];
+        strerror_r (errno, buf, 512);
+        POCL_ABORT ("mkstemp failed: %s\n", buf);
+      }
+
+    if (ret_fd)
+      *ret_fd = fd;
+    else
+      close (fd);
+
+    return;
+#endif
+}
+
+int
+pocl_cache_create_tempdir (char *path)
+{
+  assert (cache_topdir_initialized);
+#if defined(_MSC_VER) || defined(__MINGW32__)
+  char *tmp = _tempnam (cache_topdir, "pocl_");
+  assert (tmp);
+  int bytes_written = snprintf (path, POCL_FILENAME_LENGTH, "%s", tmp);
+  free (tmp);
+  assert (bytes_written > 0 && bytes_written < POCL_FILENAME_LENGTH);
+  return 0;
+#else
+  int bytes_written = snprintf (path, POCL_FILENAME_LENGTH,
+                                "%s/tempdir_XXXXXX", cache_topdir);
+  assert (bytes_written > 0 && bytes_written < POCL_FILENAME_LENGTH);
+  /* TODO mkdtemp() might not be portable */
+  return (mkdtemp (path) == NULL);
 #endif
 }
 
-int pocl_cache_write_program_source(char *program_cl_path,
-                                    cl_program program) {
-    pocl_cache_mk_temp_name(program_cl_path);
-    return pocl_write_file(program_cl_path, program->source,
-                           strlen(program->source), 0, 0);
+void
+pocl_cache_tempname (char *path_template, const char *suffix, int *fd)
+{
+  assert (cache_topdir_initialized);
+  assert (path_template);
+  strcpy (path_template, cache_topdir);
+  size_t suffixlen = (suffix ? strlen (suffix) : 0);
+  size_t max = POCL_FILENAME_LENGTH - 16 - suffixlen;
+  assert (strlen (path_template) < max);
+  strcat (path_template, "/tempfile_XXXXXX");
+  if (suffix)
+    strcat (path_template, suffix);
+
+  pocl_cache_mk_temp_name (path_template, suffixlen, fd);
+}
+
+int
+pocl_cache_write_program_source (char *program_cl_path, cl_program program)
+{
+  pocl_cache_tempname (program_cl_path, ".cl", NULL);
+  return pocl_write_file (program_cl_path, program->source,
+                          strlen (program->source), 0, 0);
 }
 
 /******************************************************************************/
@@ -333,6 +385,9 @@ int pocl_cache_append_to_buildlog(cl_program  program,
                                   unsigned    device_i,
                                   const char *content,
                                   size_t      size) {
+    if (!buildhash_is_valid (program, device_i))
+      return -1;
+
     char buildlog_path[POCL_FILENAME_LENGTH];
     program_device_dir(buildlog_path, program,
                        device_i, POCL_BUILDLOG_FILENAME);
@@ -392,14 +447,24 @@ build_program_compute_hash(cl_program program,
       pocl_SHA1_Update(&hash_ctx,
 		       (uint8_t*) program->pocl_binaries[device_i],
 		       program->pocl_binary_sizes[device_i]);
-
-    } else {
-      /* Program was created with clCreateProgramWithBinary() with an LLVM IR binary */
-      assert(program->binaries[device_i]);
-      pocl_SHA1_Update(&hash_ctx,
-		       (uint8_t*) program->binaries[device_i],
-		       program->binary_sizes[device_i]);
-    }
+      }
+    else if (program->binary_sizes[device_i] > 0)
+      {
+        /* Program was created with clCreateProgramWithBinary() with an LLVM IR
+         * binary */
+        assert (program->binaries[device_i]);
+        pocl_SHA1_Update (&hash_ctx, (uint8_t *)program->binaries[device_i],
+                          program->binary_sizes[device_i]);
+      }
+    else
+      {
+        /* Program is linked from binaries, has no source or binary */
+        // assert(program->binary_type == CL_PROGRAM_BIN)
+        assert (preprocessed_source);
+        assert (source_len > 0);
+        pocl_SHA1_Update (&hash_ctx, (uint8_t *)preprocessed_source,
+                          source_len);
+      }
 
     if (program->compiler_options)
         pocl_SHA1_Update(&hash_ctx, (uint8_t*) program->compiler_options,
@@ -481,16 +546,19 @@ char* pocl_get_process_name()
 
 /******************************************************************************/
 
-void pocl_cache_init_topdir() {
+int
+pocl_cache_init_topdir ()
+{
 
-    if (cache_topdir_initialized)
-        return;
+  if (cache_topdir_initialized)
+    return 0;
 
-    const char *tmp_path = pocl_get_string_option("POCL_CACHE_DIR", NULL);
-    int needed;
+  const char *tmp_path = pocl_get_string_option ("POCL_CACHE_DIR", NULL);
+  int needed;
 
-    if (tmp_path && (pocl_exists(tmp_path))) {
-        needed = snprintf(cache_topdir, POCL_FILENAME_LENGTH, "%s", tmp_path);
+  if (tmp_path)
+    {
+      needed = snprintf (cache_topdir, POCL_FILENAME_LENGTH, "%s", tmp_path);
     } else     {
 #ifdef POCL_ANDROID
         char* process_name = pocl_get_process_name();
@@ -530,15 +598,33 @@ void pocl_cache_init_topdir() {
 #endif
     }
 
-    if (needed >= POCL_FILENAME_LENGTH) {
-        POCL_ABORT("pocl: cache path longer than maximum filename length\n");
+  if (needed >= POCL_FILENAME_LENGTH)
+    {
+      POCL_MSG_ERR ("pocl: cache path longer than maximum filename length\n");
+      return 1;
     }
 
     assert(strlen(cache_topdir) > 0);
+
     if (pocl_mkdir_p(cache_topdir))
-        POCL_ABORT("Could not create topdir %s for cache\n", cache_topdir);
-    cache_topdir_initialized = 1;
+      {
+        POCL_MSG_ERR (
+            "Could not create top directory (%s) for cache. \n\nNote: "
+            "if you have proper rights to create that directory, and still "
+            "get the error, then most likely pocl and the program you're "
+            "trying to run are linked to different versions of libstdc++ "
+            "library. \nThis is not a bug in pocl and there's nothing we "
+            "can do to fix it - you need both pocl and your program to be"
+            " compiled for your system. This is known to happen with "
+            "Luxmark benchmark binaries dowloaded from website; Luxmark "
+            "installed from your linux distribution's packages should "
+            "work.\n",
+            cache_topdir);
+        return 1;
+      }
 
+    cache_topdir_initialized = 1;
+    return 0;
 }
 
 /* Create the new program cachedir, invalidating the old program
@@ -624,8 +710,8 @@ void pocl_cache_cleanup_cachedir(cl_program program) {
             void* lock = acquire_program_lock(program, i, "_read", 0);
             if (!lock)
               {
-                POCL_MSG_PRINT(" *** WARNING *** ", "",
-                "Could not get an exclusive lock to remove program cachedir");
+                POCL_MSG_WARN ("Could not get an exclusive lock "
+                               "to remove program cachedir\n");
                 continue;
               }
             char cachedir[POCL_FILENAME_LENGTH];
diff --git a/lib/CL/pocl_cl.h b/lib/CL/pocl_cl.h
index 6bc0c2f..03a0e91 100644
--- a/lib/CL/pocl_cl.h
+++ b/lib/CL/pocl_cl.h
@@ -81,10 +81,13 @@ typedef pthread_mutex_t pocl_lock_t;
     POCL_UNLOCK_OBJ (__OBJ__);                                          \
   } while (0)
 
+#define POCL_RETAIN_OBJECT_UNLOCKED(__OBJ__)    \
+    ++((__OBJ__)->pocl_refcount);
+
 #define POCL_RETAIN_OBJECT(__OBJ__)             \
   do {                                          \
     POCL_LOCK_OBJ (__OBJ__);                    \
-    ++((__OBJ__)->pocl_refcount);               \
+    POCL_RETAIN_OBJECT_UNLOCKED (__OBJ__);      \
     POCL_UNLOCK_OBJ (__OBJ__);                  \
   } while (0)
 
@@ -114,6 +117,13 @@ typedef pthread_mutex_t pocl_lock_t;
       POCL_INIT_OBJECT_NO_ICD(__OBJ__)
 #endif
 
+#define POCL_DESTROY_OBJECT(__OBJ__)                                          \
+  do                                                                          \
+    {                                                                         \
+      POCL_DESTROY_LOCK ((__OBJ__)->pocl_lock);                               \
+    }                                                                         \
+  while (0);
+
 /* Declares the generic pocl object attributes inside a struct. */
 #define POCL_OBJECT \
   pocl_lock_t pocl_lock; \
@@ -145,12 +155,12 @@ typedef pthread_mutex_t pocl_lock_t;
 /* Symbol aliases are supported */
 
 #  define POname(name) PO##name
-#  define POdeclsym(name)			\
+#  define POdeclsym(name)                      \
   __typeof__(name) PO##name __attribute__((visibility("hidden")));
 #  define POCL_ALIAS_OPENCL_SYMBOL(name)                                \
   __typeof__(name) name __attribute__((alias ("PO" #name), visibility("default")));
 #  define POsymAlways(name) POCL_ALIAS_OPENCL_SYMBOL(name)
-#  if defined(DIRECT_LINKAGE) || !defined(BUILD_ICD)
+#  if !defined(BUILD_ICD)
 #    define POsym(name) POCL_ALIAS_OPENCL_SYMBOL(name)
 #  else
 #    define POsym(name)
@@ -162,10 +172,12 @@ typedef pthread_mutex_t pocl_lock_t;
  * it is used (as the ICD loader assumes that)*/
 #ifdef BUILD_ICD
 #  define POCL_ICD_OBJECT struct _cl_icd_dispatch *dispatch;
+#  define POCL_ICD_OBJECT_PLATFORM_ID POCL_ICD_OBJECT
 #  define POsymICD(name) POsym(name)
 #  define POdeclsymICD(name) POdeclsym(name)
 #else
 #  define POCL_ICD_OBJECT
+#  define POCL_ICD_OBJECT_PLATFORM_ID unsigned long id;
 #  define POsymICD(name)
 #  define POdeclsymICD(name)
 #endif
@@ -206,12 +218,13 @@ typedef struct pocl_argument_info {
   pocl_argument_type type;
   char is_local;
   char is_set;
+  unsigned type_size;
 } pocl_argument_info;
 
 struct pocl_device_ops {
   const char *device_name;
   void *shared_data; /* data to be shared by a devices of same type */
-  void (*init_device_infos) (struct _cl_device_id*);
+  void (*init_device_infos) (unsigned j, struct _cl_device_id*);
   /* implementation */
 
   /* New driver api extension for out-of-order execution and
@@ -234,7 +247,7 @@ struct pocl_device_ops {
 
   /* notify is used to communicate to a device driver that an event, it has
      been waiting, has been completed. */
-  void (*notify) (cl_device_id device, cl_event event);
+  void (*notify) (cl_device_id device, cl_event event, cl_event finished);
 
   /* broadcast is(has to be) called by the device driver when a command is
      completed.
@@ -263,7 +276,14 @@ struct pocl_device_ops {
 
   void (*uninit) (cl_device_id device);
   unsigned int (*probe) (struct pocl_device_ops *ops);
-  void (*init) (cl_device_id device, const char *parameters);
+  /* Device initialization. Parameters:
+   *  j : progressive index for the devices of the same type
+   *  device : struct to initialize
+   *  parameters : optional environment with device-specific parameters
+   */
+  cl_int (*init) (unsigned j, cl_device_id device, const char *parameters);
+  cl_int (*init_queue) (cl_command_queue queue);
+  void (*free_queue) (cl_command_queue queue);
   cl_int (*alloc_mem_obj) (cl_device_id device, cl_mem mem_obj, void* host_ptr);
   void *(*create_sub_buffer) (void *data, void* buffer, size_t origin, size_t size);
   void (*free) (cl_device_id device, cl_mem mem_obj);
@@ -342,6 +362,7 @@ struct pocl_device_ops {
 };
 
 typedef struct pocl_global_mem_t {
+  pocl_lock_t pocl_lock;
   size_t max_ever_allocated;
   size_t currently_allocated;
   size_t total_alloc_limit;
@@ -354,6 +375,10 @@ struct _cl_device_id {
   cl_device_type type;
   cl_uint vendor_id;
   cl_uint max_compute_units;
+  // for subdevices
+  cl_device_id parent_device;
+  unsigned core_start;
+  unsigned core_count;
   cl_uint max_work_item_dimensions;
   size_t max_work_item_sizes[3];
   size_t max_work_group_size;
@@ -413,6 +438,11 @@ struct _cl_device_id {
      we need to generate work-item loops to execute all the work-items
      in the WG, otherwise the hardware spawns the WIs. */
   cl_bool spmd;
+  /* The Workgroup pass creates launcher functions and replaces work-item
+     placeholder global variables (e.g. _local_size_, _global_offset_ etc) with
+     loads from the context struct passed as a kernel argument. This flag
+     enables or disables this pass. */
+  cl_bool workgroup_pass;
   cl_device_exec_capabilities execution_capabilities;
   cl_command_queue_properties queue_properties;
   cl_platform_id platform;
@@ -425,7 +455,6 @@ struct _cl_device_id {
   char *short_name;
   char *long_name;
   char *cache_dir_name;
-  cl_device_id parent_device;
 
   const char *vendor;
   const char *driver_version;
@@ -488,7 +517,7 @@ struct _cl_device_id {
 
 
 struct _cl_platform_id {
-  POCL_ICD_OBJECT
+  POCL_ICD_OBJECT_PLATFORM_ID
 }; 
 
 struct _cl_context {
@@ -531,10 +560,12 @@ struct _cl_command_queue {
   cl_command_queue_properties properties;
   /* implementation */
   cl_event events; /* events of the enqueued commands in enqueue order */
-  _cl_command_node * volatile root;
   struct _cl_event * volatile barrier;
   volatile int command_count; /* counter for unfinished command enqueued */
   volatile pocl_data_sync_item last_event;
+
+  /* backend specific data */
+  void *data;
 };
 
 
@@ -554,6 +585,7 @@ struct _cl_mem {
   cl_mem_object_type type;
   cl_mem_flags flags;
   size_t size;
+  size_t origin; /* for sub-buffers */
   void *mem_host_ptr;
   cl_uint map_count;
   cl_context context;
@@ -607,15 +639,18 @@ struct _cl_mem {
 
 typedef uint8_t SHA1_digest_t[SHA1_DIGEST_SIZE * 2 + 1];
 
-/* Any value except zero, just have to be an invalid pointer. */
-#define ADDING_DEFAULT_KERNELS_TO_CL_PROGRAM (void*)11
-
 struct _cl_program {
   POCL_ICD_OBJECT
   POCL_OBJECT;
   /* queries */
   cl_context context;
   cl_uint num_devices;
+  /* bool flag, set to 1 when removing/adding default kernels to a program
+   * This code needs to be eventually fixed by introducing kernel_metadata
+   * struct, see Issue #390 */
+  int operating_on_default_kernels;
+  /* -cl-denorms-are-zero build option */
+  unsigned flush_denorms;
   cl_device_id *devices;
   /* all the program sources appended together, terminated with a zero */
   char *source;
@@ -651,8 +686,6 @@ struct _cl_program {
   cl_build_status build_status;
   /* Use to store binary type */
   cl_program_binary_type binary_type;
-  /* Use to store build porgram callback (pfn_notify) */
-  build_program_callback_t *buildprogram_callback;
 };
 
 struct _cl_kernel {
@@ -665,12 +698,16 @@ struct _cl_kernel {
   cl_program program;
   struct pocl_argument_info *arg_info;
   cl_bitfield has_arg_metadata;
+  char *attributes;
   cl_uint num_locals;
-  int *reqd_wg_size;
+  size_t *reqd_wg_size;
   /* The kernel arguments that are set with clSetKernelArg().
      These are copied to the command queue command at enqueue. */
   struct pocl_argument *dyn_arguments;
   struct _cl_kernel *next;
+
+  /* backend specific data */
+  void *data;
 };
 
 typedef struct event_callback_item event_callback_item;
@@ -727,93 +764,159 @@ struct _cl_event {
   _cl_event * volatile prev;
 };
 
+typedef struct _pocl_user_event_data
+{
+  pthread_cond_t wakeup_cond;
+  pthread_mutex_t lock;
+} pocl_user_event_data;
 
 typedef struct _cl_sampler cl_sampler_t;
 struct _cl_sampler {
   POCL_ICD_OBJECT
+  POCL_OBJECT;
+  cl_context context;
   cl_bool             normalized_coords;
   cl_addressing_mode  addressing_mode;
   cl_filter_mode      filter_mode;
 };
 
-#define POCL_UPDATE_EVENT_QUEUED(__event)                               \
-  do {                                                                  \
-    if ((__event) != NULL && (*(__event)) != NULL)                      \
-      {                                                                 \
-        cl_command_queue __cq = (*(__event))->queue;                    \
-        if ((__cq)->device->ops->update_event)                   \
-          (__cq)->device->ops->update_event((__cq)->device, (*__event), CL_QUEUED);    \
-        else {                                                          \
-          (*(__event))->status = CL_QUEUED;                             \
-          if (__cq && __cq->properties & CL_QUEUE_PROFILING_ENABLE)     \
-            (*(__event))->time_queue =                                  \
-              __cq->device->ops->get_timer_value(__cq->device->data);   \
-        }                                                               \
-        pocl_event_updated(*(__event), CL_QUEUED);                      \
-      }                                                                 \
-  } while (0)                                                           \
-
-#define POCL_UPDATE_EVENT_SUBMITTED(__event)                            \
-  do {                                                                  \
-    if ((__event) != NULL && (*(__event)) != NULL)                      \
-      {                                                                 \
-        assert((*(__event))->status == CL_QUEUED);                      \
-         cl_command_queue __cq = (*(__event))->queue;                    \
-        if ((__cq)->device->ops->update_event)                   \
-          (__cq)->device->ops->update_event((__cq)->device, (*(__event)), CL_SUBMITTED);    \
-        else {                                                          \
-          (*(__event))->status = CL_SUBMITTED;                          \
-          if (__cq && __cq->properties & CL_QUEUE_PROFILING_ENABLE)     \
-            (*(__event))->time_submit =                                 \
-              __cq->device->ops->get_timer_value(__cq->device->data);   \
-        }                                                               \
-        pocl_event_updated(*(__event), CL_SUBMITTED);                   \
-      }                                                                 \
-  } while (0)                                                           \
-
-#define POCL_UPDATE_EVENT_RUNNING(__event)                              \
-  do {                                                                  \
-    if (__event != NULL && (*(__event)) != NULL)                        \
-      {                                                                 \
-        assert((*(__event))->status == CL_SUBMITTED);                   \
-        cl_command_queue __cq = (*(__event))->queue;                    \
-        if ((__cq)->device->ops->update_event)                   \
-          (__cq)->device->ops->update_event((__cq)->device, (*(__event)), CL_RUNNING);    \
-        else {                                                          \
-          (*(__event))->status = CL_RUNNING;                            \
-          if (__cq && __cq->properties & CL_QUEUE_PROFILING_ENABLE)     \
-            (*(__event))->time_start =                                  \
-              __cq->device->ops->get_timer_value(__cq->device->data);   \
-        }                                                               \
-        pocl_event_updated(*(__event), CL_RUNNING);                     \
-      }                                                                 \
-  } while (0)                                                           \
-
-#define POCL_UPDATE_EVENT_COMPLETE(__event)                             \
-  do {                                                                  \
-    if ((__event) != NULL && (*(__event)) != NULL)                      \
-      {                                                                 \
-        assert((*(__event))->status == CL_RUNNING);                     \
-        cl_command_queue __cq = (*(__event))->queue;                    \
-        assert((*(__event))->status == CL_RUNNING);                     \
-        if ((__cq)->device->ops->update_event)                          \
-          (__cq)->device->ops->update_event((__cq)->device, (*(__event)), CL_COMPLETE); \
-        else{                                                           \
-          pocl_mem_objs_cleanup ((*__event));                           \
-          POCL_LOCK_OBJ (*(__event));                                   \
-          (*(__event))->status = CL_COMPLETE;                           \
-          if ((__cq)->properties & CL_QUEUE_PROFILING_ENABLE){          \
-            (*(__event))->time_end =                                    \
-            (__cq)->device->ops->get_timer_value((__cq)->device->data); \
-          }                                                             \
-          (__cq)->device->ops->broadcast(*(__event));                   \
-          POCL_UNLOCK_OBJ (*(__event));                                 \
-          pocl_update_command_queue(*(__event));                        \
-        }                                                               \
-        pocl_event_updated(*(__event), CL_COMPLETE);                    \
-        POname(clReleaseEvent) (*(__event));                            \
-      }                                                                 \
-  } while (0)                                                           \
+#define POCL_UPDATE_EVENT_QUEUED(__event)                                     \
+  do                                                                          \
+    {                                                                         \
+      if ((__event) != NULL)                                                  \
+        {                                                                     \
+          cl_command_queue __cq = (__event)->queue;                           \
+          if ((__cq)->device->ops->update_event)                              \
+            (__cq)->device->ops->update_event ((__cq)->device, (__event),     \
+                                               CL_QUEUED);                    \
+          else                                                                \
+            {                                                                 \
+              (__event)->status = CL_QUEUED;                                  \
+              if (__cq && __cq->properties & CL_QUEUE_PROFILING_ENABLE)       \
+                (__event)->time_queue = __cq->device->ops->get_timer_value (  \
+                    __cq->device->data);                                      \
+            }                                                                 \
+          pocl_event_updated (__event, CL_QUEUED);                            \
+        }                                                                     \
+    }                                                                         \
+  while (0)
+
+#define POCL_UPDATE_EVENT_SUBMITTED(__event)                                  \
+  do                                                                          \
+    {                                                                         \
+      if ((__event) != NULL)                                                  \
+        {                                                                     \
+          assert ((__event)->status == CL_QUEUED);                            \
+          cl_command_queue __cq = (__event)->queue;                           \
+          if ((__cq)->device->ops->update_event)                              \
+            (__cq)->device->ops->update_event ((__cq)->device, (__event),     \
+                                               CL_SUBMITTED);                 \
+          else                                                                \
+            {                                                                 \
+              (__event)->status = CL_SUBMITTED;                               \
+              if (__cq && __cq->properties & CL_QUEUE_PROFILING_ENABLE)       \
+                (__event)->time_submit = __cq->device->ops->get_timer_value ( \
+                    __cq->device->data);                                      \
+            }                                                                 \
+          pocl_event_updated (__event, CL_SUBMITTED);                         \
+        }                                                                     \
+    }                                                                         \
+  while (0)
+
+#define POCL_UPDATE_EVENT_RUNNING(__event)                                    \
+  do                                                                          \
+    {                                                                         \
+      if (__event != NULL)                                                    \
+        {                                                                     \
+          assert ((__event)->status == CL_SUBMITTED);                         \
+          cl_command_queue __cq = (__event)->queue;                           \
+          if ((__cq)->device->ops->update_event)                              \
+            (__cq)->device->ops->update_event ((__cq)->device, (__event),     \
+                                               CL_RUNNING);                   \
+          else                                                                \
+            {                                                                 \
+              (__event)->status = CL_RUNNING;                                 \
+              if (__cq && __cq->properties & CL_QUEUE_PROFILING_ENABLE)       \
+                (__event)->time_start = __cq->device->ops->get_timer_value (  \
+                    __cq->device->data);                                      \
+            }                                                                 \
+          pocl_event_updated (__event, CL_RUNNING);                           \
+        }                                                                     \
+    }                                                                         \
+  while (0)
+
+#define POCL_UPDATE_EVENT_COMPLETE_INNER(__event, POST_EVENT)                 \
+  do                                                                          \
+    {                                                                         \
+      if ((__event) != NULL)                                                  \
+        {                                                                     \
+          assert ((__event)->status == CL_RUNNING);                           \
+          cl_command_queue __cq = (__event)->queue;                           \
+          assert ((__event)->status == CL_RUNNING);                           \
+          if ((__cq)->device->ops->update_event)                              \
+            (__cq)->device->ops->update_event ((__cq)->device, (__event),     \
+                                               CL_COMPLETE);                  \
+          else                                                                \
+            {                                                                 \
+              pocl_mem_objs_cleanup (__event);                                \
+              POCL_LOCK_OBJ (__event);                                        \
+              (__event)->status = CL_COMPLETE;                                \
+              if ((__cq)->properties & CL_QUEUE_PROFILING_ENABLE)             \
+                {                                                             \
+                  (__event)->time_end                                         \
+                      = (__cq)->device->ops->get_timer_value (                \
+                          (__cq)->device->data);                              \
+                }                                                             \
+              POCL_UNLOCK_OBJ (__event);                                      \
+              (__cq)->device->ops->broadcast (__event);                       \
+              pocl_update_command_queue (__event);                            \
+            }                                                                 \
+          pocl_event_updated (__event, CL_COMPLETE);                          \
+          POST_EVENT;                                                         \
+          POname (clReleaseEvent) (__event);                                  \
+        }                                                                     \
+    }                                                                         \
+  while (0)
+
+#define POCL_UPDATE_EVENT_COMPLETE(__event)                                   \
+  POCL_UPDATE_EVENT_COMPLETE_INNER (__event, NULL)
+
+#define POCL_UPDATE_EVENT_COMPLETE_MSG(__event, msg)                          \
+  POCL_UPDATE_EVENT_COMPLETE_INNER (__event,                                  \
+                                    POCL_DEBUG_EVENT_TIME ((__event), msg))
+
+#define CL_FAILED (-1)
+
+#define POCL_UPDATE_EVENT_FAILED(__event)                                     \
+  do                                                                          \
+    {                                                                         \
+      if ((__event) != NULL)                                                  \
+        {                                                                     \
+          cl_command_queue __cq = (__event)->queue;                           \
+          if ((__cq)->device->ops->update_event)                              \
+            (__cq)->device->ops->update_event ((__cq)->device, (__event),     \
+                                               CL_FAILED);                    \
+          else                                                                \
+            {                                                                 \
+              pocl_mem_objs_cleanup (__event);                                \
+              POCL_LOCK_OBJ (__event);                                        \
+              if ((__event)->status > CL_COMPLETE)                            \
+                (__event)->status = CL_FAILED;                                \
+              if ((__cq)->properties & CL_QUEUE_PROFILING_ENABLE)             \
+                {                                                             \
+                  (__event)->time_end                                         \
+                      = (__cq)->device->ops->get_timer_value (                \
+                          (__cq)->device->data);                              \
+                }                                                             \
+              POCL_UNLOCK_OBJ (__event);                                      \
+              (__cq)->device->ops->broadcast (__event);                       \
+              pocl_update_command_queue (__event);                            \
+            }                                                                 \
+          pocl_event_updated (__event, CL_FAILED);                            \
+          POname (clReleaseEvent) (__event);                                  \
+        }                                                                     \
+    }                                                                         \
+  while (0)
 
 #ifndef __cplusplus
 
diff --git a/lib/CL/pocl_debug.c b/lib/CL/pocl_debug.c
index c301f9b..c1de88d 100644
--- a/lib/CL/pocl_debug.c
+++ b/lib/CL/pocl_debug.c
@@ -1,46 +1,148 @@
 #include "pocl_debug.h"
 #include "pocl_timing.h"
 
+#include <string.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include <unistd.h>
+#include <stdarg.h>
+
 #ifdef POCL_DEBUG_MESSAGES
-int pocl_debug_messages;
+
+uint64_t pocl_debug_messages_filter; /* Bitfield */
 int stderr_is_a_tty;
 
+static pthread_mutex_t console_mutex = PTHREAD_MUTEX_INITIALIZER;
 
   #if !defined(_MSC_VER) && !defined(__MINGW32__)
 
-    #include <time.h>
-    #include <stdio.h>
+    /* The same as fprintf(stderr,..), except it's protected by mutex
+     * helps to keep the debug output correct when >1 thread
+     * is doing fprintf() */
+    int
+    pocl_fprintf_err (const char* format, ...)
+    {
+      pthread_mutex_lock (&console_mutex);
 
-    void pocl_debug_print_header(const char* func, unsigned line) {
+      va_list args;
+      va_start (args, format);
+      int res = vfprintf (stderr, format, args);
+      va_end (args);
+
+      pthread_mutex_unlock (&console_mutex);
+      return res;
+    }
+
+    void
+    pocl_debug_messages_setup (const char* debug)
+    {
+      pocl_debug_messages_filter = 0;
+      if (strlen (debug) == 1)
+        {
+          if (debug[0] == '1')
+            pocl_debug_messages_filter = POCL_DEBUG_FLAG_GENERAL
+                                         | POCL_DEBUG_FLAG_WARNING
+                                         | POCL_DEBUG_FLAG_ERROR;
+          return;
+        }
+      /* else parse */
+      char* tokenize = strdup (debug);
+      char* ptr = NULL;
+      ptr = strtok (tokenize, ",");
+
+      while (ptr != NULL)
+      {
+        if (strncmp (ptr, "general", 7) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_GENERAL;
+        else if (strncmp (ptr, "event", 5) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_EVENTS;
+        else if (strncmp (ptr, "cache", 5) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_CACHE;
+        else if (strncmp (ptr, "llvm", 4) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_LLVM;
+        else if (strncmp (ptr, "refc", 4) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_REFCOUNTS;
+        else if (strncmp (ptr, "lock", 4) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_LOCKING;
+        else if (strncmp (ptr, "cuda", 4) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_CUDA;
+        else if (strncmp (ptr, "warn", 4) == 0)
+          pocl_debug_messages_filter |= (POCL_DEBUG_FLAG_WARNING | POCL_DEBUG_FLAG_ERROR);
+        else if (strncmp (ptr, "hsa", 3) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_HSA;
+        else if (strncmp (ptr, "tce", 3) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_TCE;
+        else if (strncmp (ptr, "mem", 3) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_MEMORY;
+        else if (strncmp (ptr, "tim", 3) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_TIMING;
+        else if (strncmp (ptr, "all", 3) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_ALL;
+        else if (strncmp (ptr, "err", 3) == 0)
+          pocl_debug_messages_filter |= POCL_DEBUG_FLAG_ERROR;
+        else
+          POCL_MSG_WARN ("Unknown token in POCL_DEBUG env var: %s", ptr);
+
+        ptr = strtok (NULL,",");
+      }
+
+      free (tokenize);
+      pocl_fprintf_err ("** Final POCL_DEBUG flags: %lX \n",
+                        pocl_debug_messages_filter);
+    }
+
+    void
+    pocl_debug_print_header (const char* func, unsigned line,
+                             const char *filter, int filter_type)
+    {
 
         int year, mon, day, hour, min, sec, nanosec;
         pocl_gettimereal(&year, &mon, &day, &hour, &min, &sec, &nanosec);
 
-        const char* formatstring;
+        const char *filter_type_str;
+        const char *formatstring;
+
+        if (filter_type == POCL_FILTER_TYPE_ERR)
+          filter_type_str = (stderr_is_a_tty ? POCL_COLOR_RED : " *** ERROR *** ");
+        else if (filter_type == POCL_FILTER_TYPE_WARN)
+          filter_type_str = (stderr_is_a_tty ? POCL_COLOR_YELLOW : " *** WARNING *** ");
+        else if (filter_type == POCL_FILTER_TYPE_INFO)
+          filter_type_str = (stderr_is_a_tty ? POCL_COLOR_GREEN : " *** INFO *** ");
+        else
+          filter_type_str = (stderr_is_a_tty ? POCL_COLOR_GREEN : " *** UNKNOWN *** ");
+
         if (stderr_is_a_tty)
           formatstring = POCL_COLOR_BLUE
-              "[%04i-%02i-%02i %02i:%02i:%02i.%09li] "
-              POCL_COLOR_RESET "POCL: in fn"
-              POCL_COLOR_CYAN " %s "
-              POCL_COLOR_RESET "at line %u:\n";
+              "[%04i-%02i-%02i %02i:%02i:%02i.%09li]"
+              POCL_COLOR_RESET "POCL: in fn %s "
+              POCL_COLOR_RESET "at line %u:\n %s | %9s | ";
         else
           formatstring = "[%04i-%02i-%02i %02i:%02i:%02i.%09i] "
-              "POCL: in fn %s at line %u:\n";
-        fprintf(stderr,
-            formatstring, year, mon, day, hour, min,
-            sec, nanosec, func, line);
+              "POCL: in fn %s at line %u:\n %s | %9s | ";
+
+        pocl_fprintf_err (formatstring, year, mon, day, hour, min, sec,
+                          nanosec, func, line, filter_type_str, filter);
     }
 
     void pocl_debug_measure_start(uint64_t *start) {
-      if (!pocl_debug_messages)
-        return;
       *start = pocl_gettimemono_ns();
     }
 
+    #define PRINT_DURATION(func, line, ...)                                   \
+      do                                                                      \
+        {                                                                     \
+          pocl_debug_print_header (func, line,                                \
+                                   "TIMING", POCL_FILTER_TYPE_INFO);          \
+          pocl_fprintf_err (__VA_ARGS__);                                     \
+        }                                                                     \
+      while (0)
+
+
     void pocl_debug_print_duration(const char* func, unsigned line,
                                    const char* msg, uint64_t nanosecs)
     {
-      if (!pocl_debug_messages)
+      if (!(pocl_debug_messages_filter & POCL_DEBUG_FLAG_TIMING))
         return;
       const char* formatstring;
       if (stderr_is_a_tty)
@@ -62,31 +164,31 @@ int stderr_is_a_tty;
                     "     %3" PRIu64 " " POCL_COLOR_RESET " ns    %s\n";
           else
             formatstring = "      >>>           %3" PRIu64 "  ns    %s\n";
-          POCL_MSG_PRINT2(func, line, formatstring, b, msg);
+          PRINT_DURATION (func, line, formatstring, b, msg);
         }
       else if ((sec == 0) && (nsec < 1000000))
         {
           a = nsec / 1000;
           b = nsec % 1000;
-          POCL_MSG_PRINT2(func, line, formatstring, a, b, "us", msg);
+          PRINT_DURATION (func, line, formatstring, a, b, "us", msg);
         }
       else if (sec == 0)
         {
           a = nsec / 1000000;
           b = (nsec % 1000000) / 1000;
-          POCL_MSG_PRINT2(func, line, formatstring, a, b, "ms", msg);
+          PRINT_DURATION (func, line, formatstring, a, b, "ms", msg);
         }
       else
-          POCL_MSG_PRINT2(func, line, formatstring, sec, nsec, "s", msg);
+          PRINT_DURATION (func, line, formatstring, sec, nsec, "s", msg);
 
     }
 
+
+
     void pocl_debug_measure_finish(uint64_t *start, uint64_t *finish,
                                    const char* msg,
                                    const char* func,
                                    unsigned line) {
-      if (!pocl_debug_messages)
-        return;
       *finish = pocl_gettimemono_ns();
       pocl_debug_print_duration(func, line, msg, (*finish - *start) );
     }
diff --git a/lib/CL/pocl_debug.h b/lib/CL/pocl_debug.h
index a4d86cd..ba3a766 100644
--- a/lib/CL/pocl_debug.h
+++ b/lib/CL/pocl_debug.h
@@ -43,6 +43,27 @@ extern "C" {
 #define POCL_COLOR_BOLDCYAN    "\033[1m\033[36m"      /* Bold Cyan */
 #define POCL_COLOR_BOLDWHITE   "\033[1m\033[37m"      /* Bold White */
 
+/* bitfield values for pocl_debug_messages_filter */
+#define POCL_DEBUG_FLAG_GENERAL 0x1
+#define POCL_DEBUG_FLAG_MEMORY 0x2
+#define POCL_DEBUG_FLAG_LLVM 0x4
+#define POCL_DEBUG_FLAG_EVENTS 0x8
+#define POCL_DEBUG_FLAG_CACHE 0x10
+#define POCL_DEBUG_FLAG_LOCKING 0x20
+#define POCL_DEBUG_FLAG_REFCOUNTS 0x40
+#define POCL_DEBUG_FLAG_TIMING 0x80
+#define POCL_DEBUG_FLAG_HSA 0x100
+#define POCL_DEBUG_FLAG_TCE 0x200
+#define POCL_DEBUG_FLAG_CUDA 0x400
+#define POCL_DEBUG_FLAG_WARNING 0x800
+#define POCL_DEBUG_FLAG_ERROR 0x1000
+#define POCL_DEBUG_FLAG_ALL (uint64_t)(-1)
+
+#define POCL_FILTER_TYPE_INFO 1
+#define POCL_FILTER_TYPE_WARN 2
+#define POCL_FILTER_TYPE_ERR 3
+
+
 #ifdef __GNUC__
 #pragma GCC visibility push(hidden)
 #endif
@@ -50,8 +71,6 @@ extern "C" {
 /* Debugging macros. Also macros for marking unimplemented parts of specs or
    untested parts of the implementation. */
 
-extern int pocl_aborting;
-
 #define POCL_ABORT_UNIMPLEMENTED(MSG)                                   \
     do {                                                                \
         fprintf(stderr,"%s is unimplemented (%s:%d)\n",                 \
@@ -75,7 +94,6 @@ extern int pocl_aborting;
 
 #define POCL_ABORT(...)                                                 \
     do {                                                                \
-        pocl_aborting = 1;                                              \
         fprintf(stderr, __VA_ARGS__);                                   \
         abort();                                                        \
     } while (0)
@@ -94,24 +112,30 @@ extern int pocl_aborting;
 
 #ifdef POCL_DEBUG_MESSAGES
 
-    extern int pocl_debug_messages;
+    extern uint64_t pocl_debug_messages_filter;
     extern int stderr_is_a_tty;
 
+    #define POCL_DEBUGGING_ON (pocl_debug_messages_filter)
+
     #if __GNUC__ >= 2
     #define __func__ __PRETTY_FUNCTION__
     #else
     #define __func__ __FUNCTION__
     #endif
 
-        #define POCL_DEBUG_HEADER pocl_debug_print_header(__func__, __LINE__);
-        extern void pocl_debug_print_header(const char * func, unsigned line);
-        extern void pocl_debug_measure_start(uint64_t* start);
-        extern void pocl_debug_measure_finish(uint64_t* start, uint64_t* finish,
-                                              const char* msg,
-                                              const char *func,
-                                              unsigned line);
-        extern void pocl_debug_print_duration(const char* func, unsigned line,
-                                              const char* msg, uint64_t nanosecs);
+        int pocl_fprintf_err (const char* format, ...);
+        #define POCL_DEBUG_HEADER(FILTER, FILTER_TYPE) \
+            pocl_debug_print_header (__func__, __LINE__, #FILTER, FILTER_TYPE);
+        extern void pocl_debug_messages_setup (const char *debug);
+        extern void pocl_debug_print_header (const char * func, unsigned line,
+                                             const char* filter, int filter_type);
+        extern void pocl_debug_measure_start (uint64_t* start);
+        extern void pocl_debug_measure_finish (uint64_t* start, uint64_t* finish,
+                                               const char* msg,
+                                               const char *func,
+                                               unsigned line);
+        extern void pocl_debug_print_duration (const char* func, unsigned line,
+                                               const char* msg, uint64_t nanosecs);
         #define POCL_MEASURE_START(SUFFIX) \
           uint64_t pocl_time_start_ ## SUFFIX, pocl_time_finish_ ## SUFFIX; \
           pocl_debug_measure_start(&pocl_time_start_ ## SUFFIX);
@@ -121,56 +145,107 @@ extern int pocl_aborting;
                          &pocl_time_finish_ ## SUFFIX, "API: " #SUFFIX, \
                          __func__, __LINE__);
 
-    #define POCL_MSG_PRINT(TYPE, ERRCODE, ...)                              \
+    #define POCL_MSG_PRINT_F(FILTER, TYPE, ERRCODE, ...)                    \
         do {                                                                \
-            if (pocl_debug_messages) {                                      \
-                POCL_DEBUG_HEADER                                           \
+            if (pocl_debug_messages_filter & POCL_DEBUG_FLAG_ ## FILTER) {  \
+                POCL_DEBUG_HEADER(FILTER, POCL_FILTER_TYPE_ ## TYPE)        \
                 if (stderr_is_a_tty)                                        \
-                  fprintf(stderr, TYPE POCL_COLOR_CYAN ERRCODE " "  POCL_COLOR_RESET);            \
+                  pocl_fprintf_err ("%s", POCL_COLOR_BOLDRED                \
+                                    ERRCODE " "  POCL_COLOR_RESET);         \
                 else                                                        \
-                  fprintf(stderr, TYPE ERRCODE " ");                        \
-                fprintf(stderr, __VA_ARGS__);                               \
+                  pocl_fprintf_err ("%s", ERRCODE " ");                     \
+                pocl_fprintf_err (__VA_ARGS__);                             \
             }                                                               \
         } while (0)
 
-    #define POCL_MSG_PRINT2(func, line, ...)                                \
+    #define POCL_MSG_PRINT2(FILTER, func, line, ...)                        \
         do {                                                                \
-            if (pocl_debug_messages) {                                      \
-                pocl_debug_print_header(func, line);                        \
-                fprintf(stderr, __VA_ARGS__);                               \
+            if (pocl_debug_messages_filter & POCL_DEBUG_FLAG_ ## FILTER) {  \
+                pocl_debug_print_header (func, line,                        \
+                                 #FILTER, POCL_FILTER_TYPE_INFO);           \
+                pocl_fprintf_err (__VA_ARGS__);                             \
             }                                                               \
         } while (0)
 
-    #define POCL_MSG_WARN2(errcode, ...)   do { if (stderr_is_a_tty) \
-          POCL_MSG_PRINT(POCL_COLOR_YELLOW " *** WARNING *** ", errcode, __VA_ARGS__); \
-          else POCL_MSG_PRINT(" *** WARNING *** ", errcode, __VA_ARGS__); } while(0)
+    #define POCL_MSG_WARN2(errcode, ...) \
+              POCL_MSG_PRINT_F(WARNING, WARN, errcode, __VA_ARGS__)
     #define POCL_MSG_WARN(...)  POCL_MSG_WARN2("", __VA_ARGS__)
 
-    #define POCL_MSG_ERR2(errcode, ...)    do { if (stderr_is_a_tty) \
-          POCL_MSG_PRINT(POCL_COLOR_RED " *** ERROR *** ", errcode, __VA_ARGS__); \
-          else POCL_MSG_PRINT(" *** ERROR *** ", errcode, __VA_ARGS__); } while (0)
+    #define POCL_MSG_ERR2(errcode, ...) \
+          POCL_MSG_PRINT_F(ERROR, ERR, errcode, __VA_ARGS__)
     #define POCL_MSG_ERR(...)  POCL_MSG_ERR2("", __VA_ARGS__)
 
-    #define POCL_MSG_PRINT_INFO2(errcode, ...) do { if (stderr_is_a_tty) \
-          POCL_MSG_PRINT(POCL_COLOR_GREEN " *** INFO *** ", errcode, __VA_ARGS__); \
-          else POCL_MSG_PRINT(" *** INFO *** ", errcode, __VA_ARGS__); } while (0)
+    #define POCL_MSG_PRINT_INFO2(errcode, ...) \
+          POCL_MSG_PRINT_F(GENERAL, INFO, errcode, __VA_ARGS__)
     #define POCL_MSG_PRINT_INFO(...) POCL_MSG_PRINT_INFO2("", __VA_ARGS__)
 
-    #define POCL_DEBUG_EVENT_TIME(eventp, msg) \
-        pocl_debug_print_duration(__func__, __LINE__, "Event " msg, (uint64_t)((*eventp)->time_end - (*eventp)->time_start))
+    #define POCL_MSG_PRINT_INFO_F(filter, errcode, ...) \
+          POCL_MSG_PRINT_F(filter, INFO, errcode, __VA_ARGS__)
+
+    #define POCL_MSG_PRINT_CUDA2(errcode, ...) POCL_MSG_PRINT_INFO_F(CUDA, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_CUDA(...) POCL_MSG_PRINT_INFO_F(CUDA, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_HSA2(errcode, ...) POCL_MSG_PRINT_INFO_F(HSA, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_HSA(...) POCL_MSG_PRINT_INFO_F(HSA, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_TCE2(errcode, ...) POCL_MSG_PRINT_INFO_F(TCE, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_TCE(...) POCL_MSG_PRINT_INFO_F(TCE, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_LOCKING2(errcode, ...) POCL_MSG_PRINT_INFO_F(LOCKING, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_LOCKING(...) POCL_MSG_PRINT_INFO_F(LOCKING, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_REFCOUNTS2(errcode, ...) POCL_MSG_PRINT_INFO_F(REFCOUNTS, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_REFCOUNTS(...) POCL_MSG_PRINT_INFO_F(REFCOUNTS, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_CACHE2(errcode, ...) POCL_MSG_PRINT_INFO_F(CACHE, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_CACHE(...) POCL_MSG_PRINT_INFO_F(CACHE, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_EVENTS2(errcode, ...) POCL_MSG_PRINT_INFO_F(EVENTS, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_EVENTS(...) POCL_MSG_PRINT_INFO_F(EVENTS, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_LLVM2(errcode, ...) POCL_MSG_PRINT_INFO_F(LLVM, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_LLVM(...) POCL_MSG_PRINT_INFO_F(LLVM, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_MEMORY2(errcode, ...) POCL_MSG_PRINT_INFO_F(MEMORY, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_MEMORY(...) POCL_MSG_PRINT_INFO_F(MEMORY, "", __VA_ARGS__)
+    #define POCL_MSG_PRINT_GENERAL2(errcode, ...) POCL_MSG_PRINT_INFO_F(GENERAL, errcode, __VA_ARGS__)
+    #define POCL_MSG_PRINT_GENERAL(...) POCL_MSG_PRINT_INFO_F(GENERAL, "", __VA_ARGS__)
+
+#define POCL_DEBUG_EVENT_TIME(eventp, msg)                                    \
+  pocl_debug_print_duration (                                                 \
+      __func__, __LINE__, "Event " msg,                                       \
+      (uint64_t) ((eventp)->time_end - (eventp)->time_start))
 
 #else
 
-    #define POCL_MSG_WARN(...)
-    #define POCL_MSG_ERR(...)
-    #define POCL_MSG_PRINT(...)
-    #define POCL_MSG_PRINT2(...)
-    #define POCL_MSG_PRINT_INFO(...)
-    #define POCL_MSG_PRINT_INFO2(...)
+    #define POCL_DEBUGGING_ON 0
+
+    #define POCL_MSG_PRINT_F(...)  do {} while (0)
+    #define POCL_MSG_PRINT(...)  do {} while (0)
+    #define POCL_MSG_PRINT2(...)  do {} while (0)
+    #define POCL_MSG_WARN(...)  do {} while (0)
+    #define POCL_MSG_WARN2(...)  do {} while (0)
+    #define POCL_MSG_ERR(...)  do {} while (0)
+    #define POCL_MSG_ERR2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_INFO(...)  do {} while (0)
+    #define POCL_MSG_PRINT_INFO2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_INFO_F(...)  do {} while (0)
+
     #define POCL_DEBUG_HEADER
-    #define POCL_MEASURE_START(...)
-    #define POCL_MEASURE_FINISH(...)
-    #define POCL_DEBUG_EVENT_TIME(...)
+    #define POCL_MEASURE_START(...)  do {} while (0)
+    #define POCL_MEASURE_FINISH(...)  do {} while (0)
+    #define POCL_DEBUG_EVENT_TIME(...)  do {} while (0)
+
+    #define POCL_MSG_PRINT_HSA2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_HSA(...)  do {} while (0)
+    #define POCL_MSG_PRINT_TCE2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_TCE(...)  do {} while (0)
+    #define POCL_MSG_PRINT_LOCKING2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_LOCKING(...)  do {} while (0)
+    #define POCL_MSG_PRINT_REFCOUNTS2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_REFCOUNTS(...)  do {} while (0)
+    #define POCL_MSG_PRINT_CACHE2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_CACHE(...)  do {} while (0)
+    #define POCL_MSG_PRINT_EVENTS2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_EVENTS(...)  do {} while (0)
+    #define POCL_MSG_PRINT_LLVM2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_LLVM(...)  do {} while (0)
+    #define POCL_MSG_PRINT_MEMORY2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_MEMORY(...)  do {} while (0)
+    #define POCL_MSG_PRINT_GENERAL2(...)  do {} while (0)
+    #define POCL_MSG_PRINT_GENERAL(...)  do {} while (0)
 
 #endif
 
@@ -221,7 +296,29 @@ extern int pocl_aborting;
     }                                                                       \
   while (0)
 
+#define POCL_GOTO_LABEL_COND(label, cond, err_code)                           \
+  do                                                                          \
+    {                                                                         \
+      if (cond)                                                               \
+        {                                                                     \
+          POCL_MSG_ERR2 (#err_code, "%s\n", #cond);                           \
+          errcode = err_code;                                                 \
+          goto label;                                                         \
+        }                                                                     \
+    }                                                                         \
+  while (0)
 
+#define POCL_GOTO_LABEL_ON(label, cond, err_code, ...)                        \
+  do                                                                          \
+    {                                                                         \
+      if (cond)                                                               \
+        {                                                                     \
+          POCL_MSG_ERR2 (#err_code, __VA_ARGS__);                             \
+          errcode = err_code;                                                 \
+          goto label;                                                         \
+        }                                                                     \
+    }                                                                         \
+  while (0)
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/CL/pocl_image_util.c b/lib/CL/pocl_image_util.c
index 083ecc7..12222e4 100644
--- a/lib/CL/pocl_image_util.c
+++ b/lib/CL/pocl_image_util.c
@@ -21,9 +21,26 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
 #include "pocl_image_util.h"
 #include "assert.h"
+#include "pocl_cl.h"
+#include "pocl_util.h"
+
+static unsigned
+pocl_get_image_dim (const cl_mem image)
+{
+  if ((image->type == CL_MEM_OBJECT_IMAGE1D)
+      || (image->type == CL_MEM_OBJECT_IMAGE1D_BUFFER))
+    return 1;
+  if ((image->type == CL_MEM_OBJECT_IMAGE2D)
+      || (image->type == CL_MEM_OBJECT_IMAGE1D_ARRAY))
+    return 2;
+  if ((image->type == CL_MEM_OBJECT_IMAGE3D)
+      || (image->type == CL_MEM_OBJECT_IMAGE2D_ARRAY))
+    return 3;
+
+  return (unsigned)-1;
+}
 
 extern cl_int 
 pocl_check_image_origin_region (const cl_mem image, 
@@ -34,95 +51,133 @@ pocl_check_image_origin_region (const cl_mem image,
 
   POCL_RETURN_ERROR_COND((origin == NULL), CL_INVALID_VALUE);
   POCL_RETURN_ERROR_COND((region == NULL), CL_INVALID_VALUE);
-  
-  /* check if origin + region in each dimension is with in image bounds */
-  if (((origin[0] + region[0]) > image->image_row_pitch) || 
-      (image->image_height > 0 && 
-       ((origin[1] + region[1]) > image->image_height)) ||
-      (image->image_depth > 0 && (origin[2] + region[2]) > image->image_depth))
-    return CL_INVALID_VALUE;
 
+  unsigned dim = pocl_get_image_dim (image);
+
+  if (dim < 3)
+    {
+      /* If image is a 2D image object, origin[2] must be 0.
+       * If image is a 1D image or 1D image buffer object,
+       * origin[1] and origin[2] must be 0.
+       * If image is a 2D image object, region[2] must be 1.
+       * If image is a 1D image or 1D image buffer object,
+       * region[1] and region[2] must be 1.
+       * If image is a 1D image array object, region[2] must be 1.
+       */
+      unsigned i;
+      for (i = dim; i < 3; i++)
+        {
+          POCL_RETURN_ERROR_ON (
+              (origin[i] != 0), CL_INVALID_VALUE,
+              "Image origin[x](=%zu) must be 0 for x(=%u) >= image_dim\n",
+              origin[i], i);
+          POCL_RETURN_ERROR_ON (
+              (region[i] != 1), CL_INVALID_VALUE,
+              "Image region[x](=%zu) must be 1 for x(=%u) >= image_dim\n",
+              region[i], i);
+        }
+    }
+
+  /* check if origin + region in each dimension is with in image bounds */
+  POCL_RETURN_ERROR_ON (
+      ((origin[0] + region[0]) > image->image_width), CL_INVALID_VALUE,
+      "(origin[0](=%zu) + region[0](=%zu)) > image->image_width(=%zu)",
+      origin[0], region[0], image->image_width);
+  POCL_RETURN_ERROR_ON (
+      (image->image_height > 0
+       && ((origin[1] + region[1]) > image->image_height)),
+      CL_INVALID_VALUE,
+      "(origin[1](=%zu) + region[1](=%zu)) > image->image_height(=%zu)",
+      origin[1], region[2], image->image_height);
+  POCL_RETURN_ERROR_ON (
+      (image->image_depth > 0 && (origin[2] + region[2]) > image->image_depth),
+      CL_INVALID_VALUE,
+      "(origin[2](=%zu) + region[2](=%zu)) > image->image_depth(=%zu)",
+      origin[1], region[2], image->image_depth);
   return CL_SUCCESS;
 }
 
 extern cl_int
-pocl_check_device_supports_image(const cl_mem image,
-                                 const cl_command_queue command_queue)
+pocl_check_device_supports_image (cl_device_id device,
+                                  const cl_image_format *image_format,
+                                  const cl_image_desc *image_desc,
+                                  cl_image_format *supported_image_formats,
+                                  cl_uint num_entries)
 {
-  cl_uint num_entries;
   cl_int errcode;
-  const cl_device_id device = command_queue->device;
-  cl_image_format* supported_image_formats = NULL;
-  unsigned i;
+  cl_uint i;
+  size_t m;
 
   POCL_RETURN_ERROR_ON((!device->image_support), CL_INVALID_OPERATION,
           "Device does not support images");
 
-  if (image->type == CL_MEM_OBJECT_IMAGE1D ||
-      image->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+  if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D
+      || image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
     {
-      POCL_RETURN_ERROR_ON((image->image_width > device->image2d_max_width),
-        CL_INVALID_IMAGE_SIZE, "Image width > device.image2d_max_width");
+      POCL_RETURN_ERROR_ON (
+          (image_desc->image_width > device->image2d_max_width),
+          CL_INVALID_IMAGE_SIZE, "Image width > device.image2d_max_width\n");
     }
 
-  if (image->type == CL_MEM_OBJECT_IMAGE2D ||
-      image->type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
+  if (image_desc->image_type == CL_MEM_OBJECT_IMAGE2D
+      || image_desc->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY)
     {
-      POCL_RETURN_ERROR_ON((image->image_width > device->image2d_max_width),
-        CL_INVALID_IMAGE_SIZE, "Image width > device.image2d_max_width");
-      POCL_RETURN_ERROR_ON((image->image_height > device->image2d_max_height),
-        CL_INVALID_IMAGE_SIZE, "Image height > device.image2d_max_height");
+      POCL_RETURN_ERROR_ON (
+          (image_desc->image_width > device->image2d_max_width),
+          CL_INVALID_IMAGE_SIZE, "Image width > device.image2d_max_width\n");
+      POCL_RETURN_ERROR_ON (
+          (image_desc->image_height > device->image2d_max_height),
+          CL_INVALID_IMAGE_SIZE, "Image height > device.image2d_max_height\n");
     }
 
-  if (image->type == CL_MEM_OBJECT_IMAGE3D)
+  if (image_desc->image_type == CL_MEM_OBJECT_IMAGE3D)
     {
-      POCL_RETURN_ERROR_ON((image->image_width > device->image3d_max_width),
-        CL_INVALID_IMAGE_SIZE, "Image width > device.image3d_max_width");
-      POCL_RETURN_ERROR_ON((image->image_height > device->image3d_max_height),
-        CL_INVALID_IMAGE_SIZE, "Image height > device.image3d_max_height");
-      POCL_RETURN_ERROR_ON((image->image_depth > device->image3d_max_depth),
-        CL_INVALID_IMAGE_SIZE, "Image depth > device.image3d_max_depth");
+      POCL_RETURN_ERROR_ON (
+          (image_desc->image_width > device->image3d_max_width),
+          CL_INVALID_IMAGE_SIZE, "Image width > device.image3d_max_width\n");
+      POCL_RETURN_ERROR_ON (
+          (image_desc->image_height > device->image3d_max_height),
+          CL_INVALID_IMAGE_SIZE, "Image height > device.image3d_max_height\n");
+      POCL_RETURN_ERROR_ON (
+          (image_desc->image_depth > device->image3d_max_depth),
+          CL_INVALID_IMAGE_SIZE, "Image depth > device.image3d_max_depth\n");
     }
 
-  /* check if image format is supported */
-  errcode = POname(clGetSupportedImageFormats)
-    (command_queue->context, 0, image->type, 0, NULL, &num_entries);
-
-  POCL_RETURN_ERROR_ON((errcode != CL_SUCCESS), errcode,
-        "clGetSupportedImageFormats call failed");
-
-  POCL_RETURN_ERROR_ON((num_entries == 0), errcode,
-        "This device does not support these images "
-        "(clGetSupportedImageFormats returned 0 entries)");
-
-  supported_image_formats = (cl_image_format*) malloc (num_entries * sizeof(cl_image_format));
-  if (supported_image_formats == NULL)
-      return CL_OUT_OF_HOST_MEMORY;
-
-  errcode = POname(clGetSupportedImageFormats)
-    (command_queue->context, 0, image->type, num_entries,
-     supported_image_formats, NULL);
+  if ((image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+      || (image_desc->image_type == CL_MEM_OBJECT_IMAGE2D_ARRAY))
+    {
+      POname (clGetDeviceInfo (device, CL_DEVICE_IMAGE_MAX_ARRAY_SIZE,
+                               sizeof (m), &m, NULL));
+      POCL_RETURN_ERROR_ON ((m < image_desc->image_array_size),
+                            CL_INVALID_IMAGE_SIZE,
+                            "Image array size > device.max_array_size\n");
+    }
 
-  POCL_GOTO_ERROR_ON((errcode != CL_SUCCESS), errcode,
-        "2nd call of clGetSupportedImageFormats failed");
+  if (image_desc->image_type == CL_MEM_OBJECT_IMAGE1D_BUFFER)
+    {
+      POname (clGetDeviceInfo (device, CL_DEVICE_IMAGE_MAX_BUFFER_SIZE,
+                               sizeof (m), &m, NULL));
+      POCL_RETURN_ERROR_ON (
+          (m < image_desc->image_width), CL_INVALID_IMAGE_SIZE,
+          "Image buffer size (width) > device.max_buffer_size\n");
+    }
 
   for (i = 0; i < num_entries; i++)
     {
-      if (supported_image_formats[i].image_channel_order ==
-          image->image_channel_order &&
-          supported_image_formats[i].image_channel_data_type ==
-          image->image_channel_data_type)
+      if (supported_image_formats[i].image_channel_order
+              == image_format->image_channel_order
+          && supported_image_formats[i].image_channel_data_type
+                 == image_format->image_channel_data_type)
         {
           errcode = CL_SUCCESS;
           goto ERROR;
         }
     }
 
-  POCL_GOTO_ERROR_ON(1, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
-    "The image format is not supported by the device");
+  POCL_GOTO_ERROR_ON (1, CL_INVALID_IMAGE_FORMAT_DESCRIPTOR,
+                      "The image format is not supported by the device\n");
 
 ERROR:
-  free(supported_image_formats);
   return errcode;
 }
 
@@ -166,96 +221,410 @@ pocl_get_image_information (cl_channel_order ch_order,
     }
 }
 
-cl_int
-pocl_write_image(cl_mem               image,
-                 cl_device_id         device_id,
-                 const size_t *       origin, /*[3]*/
-                 const size_t *       region, /*[3]*/
-                 size_t               host_row_pitch,
-                 size_t               host_slice_pitch, 
-                 const void *         ptr)
+/****************************************************/
+
+#define FOR4 unsigned i; for (i = 0; i < 4; i++)
+
+cl_char4
+convert_char4_sat (cl_float4 x)
 {
-  
-  if (image == NULL)
-    return CL_INVALID_MEM_OBJECT;
-
-  if ((ptr == NULL) || (region == NULL) || origin == NULL)
-    return CL_INVALID_VALUE;
-    
-  size_t dev_elem_size = sizeof(cl_float);
-  int dev_channels = 4;
-
-  size_t tuned_origin[3] = {origin[0]*dev_elem_size*dev_channels, origin[1], 
-                            origin[2]};
-  size_t tuned_region[3] = {region[0]*dev_elem_size*dev_channels, region[1], 
-                            region[2]};
-    
-  size_t image_row_pitch = image->image_row_pitch;
-  size_t image_slice_pitch = 0;
-    
-  if ((tuned_region[0]*tuned_region[1]*tuned_region[2] > 0) &&
-      (tuned_region[0]-1 +
-       image_row_pitch * (tuned_region[1]-1) +
-       image_slice_pitch * (tuned_region[2]-1) >= image->size))
-    return CL_INVALID_VALUE;
-  
-  device_id->ops->write_rect (device_id->data, ptr, 
-                         image->device_ptrs[device_id->dev_id].mem_ptr,
-                         tuned_origin, tuned_origin, tuned_region,
-                         image_row_pitch, image_slice_pitch,
-                         image_row_pitch, image_slice_pitch);
-  
-  
-  return CL_SUCCESS;
+  cl_char4 r;
+  FOR4
+    r.s[i] = (cl_char)max (CL_CHAR_MIN, min ((cl_int) (x.s[i]), CL_CHAR_MAX));
+  return r;
+}
+
+cl_short4
+convert_short4_sat (cl_float4 x)
+{
+  cl_short4 r;
+  FOR4
+    r.s[i] = (cl_short)max (CL_SHRT_MIN, min ((cl_int) (x.s[i]), CL_SHRT_MAX));
+  return r;
 }
-           
-extern cl_int         
-pocl_read_image(cl_mem               image,
-                cl_device_id         device_id,
-                const size_t *       origin, /*[3]*/
-                const size_t *       region, /*[3]*/
-                size_t               host_row_pitch,
-                size_t               host_slice_pitch, 
-                void *               ptr) 
+
+cl_uchar4
+convert_uchar4_sat (cl_float4 x)
 {
-    
-  if (image == NULL)
-    return CL_INVALID_MEM_OBJECT;
-
-  if ((ptr == NULL) || (region == NULL) || origin == NULL)
-    return CL_INVALID_VALUE;
-    
-  size_t width = image->image_width;
-  size_t height = image->image_height;
-
-  /* dev imagetype = host imagetype, in current implementation */
-  size_t dev_elem_size = image->image_elem_size;
-  size_t dev_channels = image->image_channels;
-
-  size_t tuned_origin[3] = {origin[0]*dev_elem_size*dev_channels, origin[1], 
-                            origin[2]};
-  size_t tuned_region[3] = {region[0]*dev_elem_size*dev_channels, region[1], 
-                            region[2]};
-  
-  size_t image_row_pitch = width*dev_elem_size*dev_channels; 
-  size_t image_slice_pitch = height*image_row_pitch;
-    
-  if ((tuned_origin[0] + tuned_region[0] > image_row_pitch) || 
-      (tuned_origin[1] + tuned_region[1] > height))
-     return CL_INVALID_VALUE;
-  
-  if ((image->type == CL_MEM_OBJECT_IMAGE3D && 
-       (tuned_origin[2] + tuned_region[2] > image->image_depth)))
-    return CL_INVALID_VALUE;
-  
-  if (image->type != CL_MEM_OBJECT_IMAGE3D && region[2] != 1)
-    return CL_INVALID_VALUE;
-  
-  device_id->ops->read_rect(device_id->data, ptr, 
-                       image->device_ptrs[device_id->dev_id].mem_ptr,
-                       tuned_origin, tuned_origin, tuned_region,
-                       image_row_pitch, image_slice_pitch,
-                       image_row_pitch, image_slice_pitch);
-  
-  return CL_SUCCESS;
+  cl_uchar4 r;
+  FOR4
+    r.s[i] = (cl_uchar)max (0, min ((cl_long) (x.s[i]), CL_UCHAR_MAX));
+  return r;
+}
+
+cl_ushort4
+convert_ushort4_sat (cl_float4 x)
+{
+  cl_ushort4 r;
+  FOR4
+    r.s[i] = (cl_ushort)max (0, min ((cl_long) (x.s[i]), CL_USHRT_MAX));
+  return r;
+}
+
+/****************************************************/
+
+cl_char
+convert_char_sat (cl_float x)
+{
+  cl_int y = (cl_int)x;
+  return (cl_char)max (CL_CHAR_MIN, min (y, CL_CHAR_MAX));
+}
+
+cl_short
+convert_short_sat (cl_float x)
+{
+  cl_int y = (cl_int)x;
+  return (cl_short)max (CL_SHRT_MIN, min (y, CL_SHRT_MAX));
+}
+
+cl_uchar
+convert_uchar_sat (cl_float x)
+{
+  cl_long y = (cl_long)x;
+  return (cl_uchar)max (0, min (y, CL_UCHAR_MAX));
+}
+
+cl_ushort
+convert_ushort_sat (cl_float x)
+{
+  cl_long y = (cl_long)x;
+  return (cl_ushort)max (0, min (y, CL_USHRT_MAX));
+}
+
+/****************************************************/
+
+cl_char4
+convert_char4_sat_int (cl_int4 x)
+{
+  cl_char4 r;
+  FOR4
+    r.s[i] = (cl_char)max (CL_CHAR_MIN, min ((cl_int) (x.s[i]), CL_CHAR_MAX));
+  return r;
+}
+
+cl_short4
+convert_short4_sat_int (cl_int4 x)
+{
+  cl_short4 r;
+  FOR4
+    r.s[i] = (cl_short)max (CL_SHRT_MIN, min ((cl_int) (x.s[i]), CL_SHRT_MAX));
+  return r;
+}
+
+cl_uchar4
+convert_uchar4_sat_int (cl_uint4 x)
+{
+  cl_uchar4 r;
+  FOR4
+    r.s[i] = (cl_uchar)min (x.s[i], CL_UCHAR_MAX);
+  return r;
+}
+
+cl_ushort4
+convert_ushort4_sat_int (cl_uint4 x)
+{
+  cl_ushort4 r;
+  FOR4
+    r.s[i] = (cl_ushort)min (x.s[i], CL_USHRT_MAX);
+  return r;
+}
+
+/****************************************************/
+
+cl_char
+convert_char_sat_int (cl_int x)
+{
+  return (cl_char)max (CL_CHAR_MIN, min (x, CL_CHAR_MAX));
+}
+
+cl_short
+convert_short_sat_int (cl_int x)
+{
+  return (cl_short)max (CL_SHRT_MIN, min (x, CL_SHRT_MAX));
+}
+
+cl_uchar
+convert_uchar_sat_int (cl_uint x)
+{
+  return (cl_uchar)min (x, CL_UCHAR_MAX);
+}
+
+cl_ushort
+convert_ushort_sat_int (cl_uint x)
+{
+  return (cl_ushort)min (x, CL_USHRT_MAX);
+}
+
+/****************************************************/
+
+static cl_uint4
+map_channels (cl_uint4 color, int order)
+{
+  switch (order)
+    {
+    case CL_ARGB:
+      {
+        // return color.wxyz;
+        cl_uint4 ret;
+        ret.s[0] = color.s[3];
+        ret.s[1] = color.s[0];
+        ret.s[2] = color.s[1];
+        ret.s[3] = color.s[2];
+        return ret;
+      }
+    case CL_BGRA:
+      {
+        // return color.zyxw;
+        cl_uint4 ret;
+        ret.s[0] = color.s[2];
+        ret.s[1] = color.s[1];
+        ret.s[2] = color.s[0];
+        ret.s[3] = color.s[3];
+        return ret;
+      }
+    case CL_RGBA:
+    default:
+      return color;
+    }
+}
+
+/* only for CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8,
+ * CL_SNORM_INT16, CL_UNORM_INT16 channel types */
+static void
+write_float4_pixel (cl_float4 color, void *data, int type)
+{
+  if (type == CL_FLOAT)
+    {
+      cl_float4 *p = (cl_float4 *)data;
+      FOR4
+        p->s[i] = color.s[i];
+      return;
+    }
+  if (type == CL_HALF_FLOAT)
+    {
+      /* TODO: convert to builtins */
+      ((uint16_t *)data)[0] = float_to_half (color.s0);
+      ((uint16_t *)data)[1] = float_to_half (color.s1);
+      ((uint16_t *)data)[2] = float_to_half (color.s2);
+      ((uint16_t *)data)[3] = float_to_half (color.s3);
+      return;
+    }
+  const cl_float f127 = ((cl_float) (CL_CHAR_MAX));
+  const cl_float f32767 = ((cl_float) (CL_SHRT_MAX));
+  const cl_float f255 = ((cl_float) (CL_UCHAR_MAX));
+  const cl_float f65535 = ((cl_float) (CL_USHRT_MAX));
+  if (type == CL_SNORM_INT8)
+    {
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      cl_float4 colorf;
+      FOR4
+        colorf.s[i] = color.s[i] * f127;
+      cl_char4 final_color = convert_char4_sat (colorf);
+      *((cl_char4 *)data) = final_color;
+      return;
+    }
+  if (type == CL_SNORM_INT16)
+    {
+      cl_float4 colorf;
+      FOR4
+        colorf.s[i] = color.s[i] * f32767;
+      cl_short4 final_color = convert_short4_sat (colorf);
+      *((cl_short4 *)data) = final_color;
+      return;
+    }
+  if (type == CL_UNORM_INT8)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      cl_float4 colorf;
+      FOR4
+        colorf.s[i] = color.s[i] * f255;
+      cl_uchar4 final_color = convert_uchar4_sat (colorf);
+      *((cl_uchar4 *)data) = final_color;
+      return;
+    }
+  if (type == CL_UNORM_INT16)
+    {
+      cl_float4 colorf;
+      FOR4
+        colorf.s[i] = color.s[i] * f65535;
+      cl_ushort4 final_color = convert_ushort4_sat (colorf);
+      *((cl_ushort4 *)data) = final_color;
+      return;
+    }
+
+  return;
+}
+
+/* only for CL_FLOAT, CL_SNORM_INT8, CL_UNORM_INT8,
+ * CL_SNORM_INT16, CL_UNORM_INT16 channel types */
+static void
+write_float_pixel (cl_float color, void *data, int type)
+{
+  if (type == CL_FLOAT)
+    {
+      *((float *)data) = color;
+      return;
+    }
+  if (type == CL_HALF_FLOAT)
+    {
+      /* TODO: convert to builtins */
+      *((uint16_t *)data) = float_to_half (color);
+      return;
+    }
+  const cl_float f127 = ((cl_float)CL_CHAR_MAX);
+  const cl_float f32767 = ((cl_float)CL_SHRT_MAX);
+  const cl_float f255 = ((cl_float)CL_UCHAR_MAX);
+  const cl_float f65535 = ((cl_float)CL_USHRT_MAX);
+  if (type == CL_SNORM_INT8)
+    {
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      cl_float colorf = color * f127;
+      cl_char final_color = convert_char_sat (colorf);
+      *((cl_char *)data) = final_color;
+      return;
+    }
+  if (type == CL_SNORM_INT16)
+    {
+      cl_float colorf = color * f32767;
+      cl_short final_color = convert_short_sat (colorf);
+      *((cl_short *)data) = final_color;
+      return;
+    }
+  if (type == CL_UNORM_INT8)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      cl_float colorf = color * f255;
+      cl_uchar final_color = convert_uchar_sat (colorf);
+      *((cl_uchar *)data) = final_color;
+      return;
+    }
+  if (type == CL_UNORM_INT16)
+    {
+      cl_float colorf = color * f65535;
+      cl_ushort final_color = convert_ushort_sat (colorf);
+      *((cl_ushort *)data) = final_color;
+      return;
+    }
+
+  return;
+}
+
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+static void
+pocl_write_pixel_fast_ui (cl_uint4 color, int order, int elem_size, void *data)
+{
+  if (order == CL_A)
+    {
+      if (elem_size == 1)
+        *((cl_uchar *)data) = convert_uchar_sat_int (color.s[3]);
+      else if (elem_size == 2)
+        *((cl_ushort *)data) = convert_ushort_sat_int (color.s[3]);
+      else if (elem_size == 4)
+        *((cl_uint *)data) = color.s[3];
+      return;
+    }
+
+  if (elem_size == 1)
+    {
+      *((cl_uchar4 *)data) = convert_uchar4_sat_int (color);
+    }
+  else if (elem_size == 2)
+    {
+      *((cl_ushort4 *)data) = convert_ushort4_sat_int (color);
+    }
+  else if (elem_size == 4)
+    {
+      *((cl_uint4 *)data) = color;
+    }
+
+  return;
+}
+
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+static void
+pocl_write_pixel_fast_f (cl_float4 color, int channel_type, int order,
+                         void *data)
+{
+  if (order == CL_A)
+    {
+      write_float_pixel (color.s[3], data, channel_type);
+    }
+  else
+    {
+      write_float4_pixel (color, data, channel_type);
+    }
+
+  return;
+}
+
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+static void
+pocl_write_pixel_fast_i (cl_int4 color, int order, int elem_size, void *data)
+{
+  if (order == CL_A)
+    {
+      if (elem_size == 1)
+        *((cl_char *)data) = convert_char_sat_int (color.s[3]);
+      else if (elem_size == 2)
+        *((cl_short *)data) = convert_short_sat_int (color.s[3]);
+      else if (elem_size == 4)
+        *((cl_int *)data) = color.s[3];
+      return;
+    }
+
+  if (elem_size == 1)
+    {
+      *((cl_char4 *)data) = convert_char4_sat_int (color);
+    }
+  else if (elem_size == 2)
+    {
+      *((cl_short4 *)data) = convert_short4_sat_int (color);
+    }
+  else if (elem_size == 4)
+    {
+      *((cl_int4 *)data) = color;
+    }
+  return;
+}
+
+/* full write with channel map conversion etc
+ * Writes a four element pixel to an image pixel pointed by integer coords.
+ */
+void
+pocl_write_pixel_zero (void *data, const void *color_ptr, int order,
+                       int elem_size, int channel_type)
+{
+  cl_uint4 color;
+  FOR4
+    color.s[i] = ((cl_uint4 *)color_ptr)->s[i];
+
+  color = map_channels (color, order);
+
+  typedef union
+  {
+    cl_uint4 ui;
+    cl_int4 i;
+    cl_float4 f;
+  } u;
+
+  u ucolor;
+  ucolor.ui = color;
+
+  if ((channel_type == CL_SIGNED_INT8) || (channel_type == CL_SIGNED_INT16)
+      || (channel_type == CL_SIGNED_INT32))
+    pocl_write_pixel_fast_i (ucolor.i, order, elem_size, data);
+  else if ((channel_type == CL_UNSIGNED_INT8)
+           || (channel_type == CL_UNSIGNED_INT16)
+           || (channel_type == CL_UNSIGNED_INT32))
+    pocl_write_pixel_fast_ui (ucolor.ui, order, elem_size, data);
+  else // TODO unsupported channel types
+    pocl_write_pixel_fast_f (ucolor.f, channel_type, order, data);
 }
diff --git a/lib/CL/pocl_image_util.h b/lib/CL/pocl_image_util.h
index fc63a88..4eeaba1 100644
--- a/lib/CL/pocl_image_util.h
+++ b/lib/CL/pocl_image_util.h
@@ -41,27 +41,31 @@ pocl_get_image_information (cl_channel_order  ch_order,
                             cl_int*           host_channels,
                             cl_int*           host_elem_size);
 
-extern cl_int
-pocl_check_device_supports_image(const cl_mem image,
-                                 const cl_command_queue command_queue);
+extern cl_int pocl_check_device_supports_image (
+    cl_device_id device, const cl_image_format *image_format,
+    const cl_image_desc *image_desc, cl_image_format *supported_image_formats,
+    cl_uint num_entries);
 
-extern cl_int
-pocl_write_image(cl_mem               image,
-                 cl_device_id         device_id,
-                 const size_t *       origin_, /*[3]*/
-                 const size_t *       region_, /*[3]*/
-                 size_t               host_row_pitch,
-                 size_t               host_slice_pitch, 
-                 const void *         ptr);
+void pocl_write_pixel_zero (void *data, const void *color_ptr, int order,
+                            int elem_size, int channel_type);
 
-extern cl_int         
-pocl_read_image(cl_mem               image,
-                cl_device_id         device,
-                const size_t *       origin, /*[3]*/
-                const size_t *       region, /*[3]*/
-                size_t               host_row_pitch,
-                size_t               host_slice_pitch, 
-                void *               ptr);
+cl_char4 convert_char4_sat (cl_float4 x);
+cl_char convert_char_sat (cl_float x);
+cl_char4 convert_char4_sat_int (cl_int4 x);
+cl_char convert_char_sat_int (cl_int x);
+cl_uchar4 convert_uchar4_sat (cl_float4 x);
+cl_uchar convert_uchar_sat (cl_float x);
+cl_uchar4 convert_uchar4_sat_int (cl_uint4 x);
+cl_uchar convert_uchar_sat_int (cl_uint x);
+
+cl_short4 convert_short4_sat (cl_float4 x);
+cl_short convert_short_sat (cl_float x);
+cl_short4 convert_short4_sat_int (cl_int4 x);
+cl_short convert_short_sat_int (cl_int x);
+cl_ushort4 convert_ushort4_sat (cl_float4 x);
+cl_ushort convert_ushort_sat (cl_float x);
+cl_ushort4 convert_ushort4_sat_int (cl_uint4 x);
+cl_ushort convert_ushort_sat_int (cl_uint x);
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/CL/pocl_img_buf_cpy.c b/lib/CL/pocl_img_buf_cpy.c
index c15afea..281747d 100644
--- a/lib/CL/pocl_img_buf_cpy.c
+++ b/lib/CL/pocl_img_buf_cpy.c
@@ -76,9 +76,6 @@ cl_int pocl_rect_copy(cl_command_queue command_queue,
         CL_INVALID_MEM_OBJECT, "src_image is not an image\n");
       POCL_RETURN_ERROR_ON((src->type == CL_MEM_OBJECT_IMAGE2D && src_origin[2] != 0),
         CL_INVALID_VALUE, "src_origin[2] must be 0 for 2D src_image\n");
-      errcode = pocl_check_device_supports_image(src, command_queue);
-      if (errcode != CL_SUCCESS)
-        return errcode;
     }
   else
     {
@@ -92,9 +89,6 @@ cl_int pocl_rect_copy(cl_command_queue command_queue,
         CL_INVALID_MEM_OBJECT, "dst is not an image\n");
       POCL_RETURN_ERROR_ON((dst->type == CL_MEM_OBJECT_IMAGE2D && dst_origin[2] != 0),
         CL_INVALID_VALUE, "dst_origin[2] must be 0 for 2D dst_image\n");
-      errcode = pocl_check_device_supports_image(dst, command_queue);
-      if (errcode != CL_SUCCESS)
-        return errcode;
     }
   else
     {
@@ -133,16 +127,25 @@ cl_int pocl_rect_copy(cl_command_queue command_queue,
       mod_dst_origin[0] *= dst->image_elem_size * dst->image_channels;
     }
 
+  /* NOTE: 1D image array has row_pitch == slice_pitch;
+   * need to zero it for bufferbound checks.
+   */
   if (src_is_image)
     {
       src_row_pitch = src->image_row_pitch;
-      src_slice_pitch = src->image_slice_pitch;
+      if (src->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        src_slice_pitch = 0;
+      else
+        src_slice_pitch = src->image_slice_pitch;
     }
 
   if (dst_is_image)
     {
       dst_row_pitch = dst->image_row_pitch;
-      dst_slice_pitch = dst->image_slice_pitch;
+      if (dst->type == CL_MEM_OBJECT_IMAGE1D_ARRAY)
+        dst_slice_pitch = 0;
+      else
+        dst_slice_pitch = dst->image_slice_pitch;
     }
 
   POCL_RETURN_ERROR_ON(((command_queue->context != src->context)
@@ -194,6 +197,9 @@ cl_int pocl_rect_copy(cl_command_queue command_queue,
   if (errcode != CL_SUCCESS)
     return errcode;
 
+  HANDLE_IMAGE1D_BUFFER (src);
+  HANDLE_IMAGE1D_BUFFER (dst);
+
   cmd->command.copy_image.src_buffer = src;
   cmd->command.copy_image.src_device =
     (src->owning_device) ? src->owning_device : command_queue->device;
diff --git a/lib/CL/pocl_intfn.h b/lib/CL/pocl_intfn.h
index cfb6e71..8eedbb9 100644
--- a/lib/CL/pocl_intfn.h
+++ b/lib/CL/pocl_intfn.h
@@ -29,12 +29,12 @@
 #endif
 
 POdeclsym(clBuildProgram)
+POdeclsym(clLinkProgram)
+POdeclsym(clCompileProgram)
 POdeclsym(clCreateBuffer)
 POdeclsym(clCreateCommandQueue)
 POdeclsym(clCreateContext)
 POdeclsym(clCreateContextFromType)
-POdeclsym(clCreateFromGLTexture2D)
-POdeclsym(clCreateFromGLTexture3D)
 POdeclsym(clCreateImage2D) 
 POdeclsym(clCreateImage3D)
 POdeclsym(clCreateImage)
@@ -42,6 +42,7 @@ POdeclsym(clCreateKernel)
 POdeclsym(clCreateKernelsInProgram)
 POdeclsym(clCreatePipe)
 POdeclsym(clCreateProgramWithBinary)
+POdeclsym(clCreateProgramWithBuiltInKernels)
 POdeclsym(clCreateProgramWithSource)
 POdeclsym(clCreateSampler)
 POdeclsym(clCreateSubBuffer)
@@ -81,6 +82,7 @@ POdeclsym(clGetDeviceInfo)
 POdeclsym(clGetEventInfo)
 POdeclsym(clGetEventProfilingInfo)
 POdeclsym(clGetExtensionFunctionAddress)
+POdeclsym(clGetExtensionFunctionAddressForPlatform)
 POdeclsym(clGetImageInfo)
 POdeclsym(clGetKernelInfo)
 POdeclsym(clGetKernelArgInfo)
@@ -126,5 +128,15 @@ POdeclsym(clSVMAlloc)
 POdeclsym(clSetKernelArgSVMPointer)
 POdeclsym(clSetKernelExecInfo)
 POdeclsym(clCreateCommandQueueWithProperties)
+POdeclsym(clCreateFromGLBuffer)
+POdeclsym(clCreateFromGLTexture)
+POdeclsym(clCreateFromGLTexture2D)
+POdeclsym(clCreateFromGLTexture3D)
+POdeclsym(clCreateFromGLRenderbuffer)
+POdeclsym(clGetGLObjectInfo)
+POdeclsym(clGetGLTextureInfo)
+POdeclsym(clEnqueueAcquireGLObjects)
+POdeclsym(clEnqueueReleaseGLObjects)
+POdeclsym(clGetGLContextInfoKHR)
 
 #endif
diff --git a/lib/CL/pocl_llvm.h b/lib/CL/pocl_llvm.h
index 7f4b423..180c4bb 100644
--- a/lib/CL/pocl_llvm.h
+++ b/lib/CL/pocl_llvm.h
@@ -2,17 +2,17 @@
 
    Copyright (c) 2013 Kalle Raiskila and
                       Pekka Jääskeläinen
-   
+
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -35,47 +35,62 @@ extern "C" {
 
 /* Returns the cpu name as reported by LLVM. */
 char* get_cpu_name();
+/* Returns if the cpu supports FMA instruction (uses LLVM). */
+int cpu_has_fma();
+
+int bitcode_is_spir(const char *bitcode, size_t size);
+
+/* Sets up the native/preferred vector widths at runtime (using LLVM). */
+void cpu_setup_vector_widths(cl_device_id dev);
 
 /* Compiles an .cl file into LLVM IR.
  */
-int pocl_llvm_build_program
-(cl_program program,
- unsigned device_i,
- const char* user_options_cstr, char *program_bc_path);
-
+int pocl_llvm_build_program(cl_program program,
+                            unsigned device_i,
+                            const char *user_options_cstr,
+                            char *program_bc_path,
+                            cl_uint num_input_headers,
+                            const cl_program *input_headers,
+                            const char **header_include_names,
+                            int linking_program);
 
 /* Retrieve metadata of the given kernel in the program to populate the
  * cl_kernel object.
  */
-int pocl_llvm_get_kernel_metadata
-(cl_program program, 
- cl_kernel kernel,
- int device_i,     
- const char* kernel_name,
- int *errcode);
+int pocl_llvm_get_kernel_metadata(cl_program program, cl_kernel kernel,
+                                  int device_i, const char *kernel_name,
+                                  int *errcode);
 
 /* This function links the input kernel LLVM bitcode and the
  * OpenCL kernel runtime library into one LLVM module, then
- * runs pocl's kernel compiler passes on that module to produce 
+ * runs pocl's kernel compiler passes on that module to produce
  * a function that executes all work-items in a work-group.
  *
  * Output is a LLVM bitcode file that contains a work-group function
- * and its associated launchers. 
+ * and its associated launchers.
  *
  * TODO: this is not thread-safe, it changes the LLVM global options to
  * control the compilation. We should enforce only one compilations is done
  * at a time or control the options through thread safe methods.
  */
-int pocl_llvm_generate_workgroup_function
-(char* kernel_cachedir, cl_device_id device,
- cl_kernel kernel, size_t local_x, size_t local_y, size_t local_z
-);
+int pocl_llvm_generate_workgroup_function(cl_device_id device,
+                                          cl_kernel kernel, size_t local_x,
+                                          size_t local_y, size_t local_z);
 
+int pocl_llvm_generate_workgroup_function_nowrite(
+    cl_device_id device, cl_kernel kernel, size_t local_x, size_t local_y,
+    size_t local_z, void **output);
 /**
  * Free the LLVM IR of a program for a given device
  */
 void pocl_free_llvm_irs(cl_program program, int device_i);
 
+/* calls delete on the module. */
+void pocl_destroy_llvm_module(void *modp);
+
+int pocl_llvm_remove_file_on_signal (const char *file);
+
+void pocl_llvm_release();
 /**
  * Update the program->binaries[] representation of the kernels
  * from the program->llvm_irs[] representation.
@@ -105,10 +120,8 @@ unsigned pocl_llvm_get_kernel_names( cl_program program, char **knames, unsigned
 /** Compile the kernel in infile from LLVM bitcode to native object file for
  * device, into outfile.
  */
-int pocl_llvm_codegen ( cl_kernel kernel,
-                        cl_device_id device,
-                        const char *infile,
-                        const char *outfile);
+int pocl_llvm_codegen(cl_kernel kernel, cl_device_id device, void *modp,
+                      char **output, size_t *output_size);
 
 /* Parse program file and populate program's llvm_irs */
 int
@@ -116,6 +129,14 @@ pocl_update_program_llvm_irs(cl_program program, unsigned device_i,
                              cl_device_id device);
 
 
+int pocl_llvm_link_program(cl_program program,
+                           unsigned device_i,
+                           char *program_bc_path,
+                           cl_uint num_input_programs,
+                           unsigned char **cur_device_binaries,
+                           size_t *cur_device_binary_sizes,
+                           void **cur_llvm_irs, int create_library);
+
 #ifdef __GNUC__
 #pragma GCC visibility pop
 #endif
diff --git a/lib/CL/pocl_llvm_api.cc b/lib/CL/pocl_llvm_api.cc
deleted file mode 100644
index e92d5a5..0000000
--- a/lib/CL/pocl_llvm_api.cc
+++ /dev/null
@@ -1,1957 +0,0 @@
-/* pocl_llvm_api.cc: C wrappers for calling the LLVM/Clang C++ APIs to invoke
-   the different kernel compilation phases.
-
-   Copyright (c) 2013 Kalle Raiskila
-                 2013-2017 Pekka Jääskeläinen
-
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-*/
-
-#include "CompilerWarnings.h"
-IGNORE_COMPILER_WARNING("-Wunused-parameter")
-IGNORE_COMPILER_WARNING("-Wstrict-aliasing")
-
-#include "config.h"
-
-#include "clang/CodeGen/CodeGenAction.h"
-#include "clang/Frontend/CompilerInstance.h"
-#include "clang/Frontend/CompilerInvocation.h"
-#include "clang/Frontend/FrontendActions.h"
-#include "clang/Frontend/TextDiagnosticBuffer.h"
-
-#ifndef LLVM_OLDER_THAN_4_0
-#include "clang/Lex/PreprocessorOptions.h"
-#endif
-
-// For some reason including pocl.h before including CodeGenAction.h
-// causes an error. Some kind of macro definition issue. To investigate.
-#include "pocl.h"
-
-
-#include "llvm/LinkAllPasses.h"
-#ifdef LLVM_OLDER_THAN_3_7
-#include "llvm/PassManager.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#else
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/LegacyPassManager.h"
-using llvm::legacy::PassManager;
-#endif
-
-#ifdef LLVM_OLDER_THAN_4_0
-#include "llvm/Bitcode/ReaderWriter.h"
-#else
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#endif
-
-#include "llvm/Transforms/Utils/Cloning.h"
-
-#include "llvm/Linker/Linker.h"
-#include "llvm/PassAnalysisSupport.h"
-
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IRReader/IRReader.h"
-
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/MutexGuard.h"
-#include "llvm/Support/raw_os_ostream.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm/Support/Host.h"
-
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <sstream>
-#include <string>
-#include <cstdio>
-
-// Note - LLVM/Clang uses symbols defined in Khronos' headers in macros, 
-// causing compilation error if they are included before the LLVM headers.
-#include "pocl_llvm.h"
-#include "pocl_runtime_config.h"
-#include "install-paths.h"
-#include "LLVMUtils.h"
-#include "linker.h"
-#include "pocl_file_util.h"
-#include "pocl_cache.h"
-#include "TargetAddressSpaces.h"
-
-using namespace clang;
-using namespace llvm;
-
-
-POP_COMPILER_DIAGS
-
-/**
- * Use one global LLVMContext across all LLVM bitcodes. This is because
- * we want to cache the bitcode IR libraries and reuse them when linking
- * new kernels. The CloneModule etc. seem to assume we are linking
- * bitcodes with a same LLVMContext. Unfortunately, this requires serializing
- * all calls to the LLVM APIs with mutex.
- * Freeing/deleting the context crashes LLVM 3.2 (at program exit), as a
- * work-around, allocate this from heap.
- */
-static LLVMContext *globalContext = NULL;
-static LLVMContext *GlobalContext() {
-  if (globalContext == NULL) globalContext = new LLVMContext();
-  return globalContext;
-}
-
-/* The LLVM API interface functions are not at the moment not thread safe,
-   ensure only one thread is using this layer at the time with a mutex. */
-
-static llvm::sys::Mutex kernelCompilerLock;
-
-/* Global pocl device to be used by passes if needed */
-cl_device_id currentPoclDevice = NULL;
-
-static void InitializeLLVM();
-
-//#define DEBUG_POCL_LLVM_API
-
-#if defined(DEBUG_POCL_LLVM_API) && defined(NDEBUG)
-#undef NDEBUG
-#include <cassert>
-#endif
-
-
-// Read input source to clang::FrontendOptions.
-// The source is contained in the program->source array,
-// but if debugging option is enabled in the kernel compiler
-// we need to dump the file to disk first for the debugger
-// to find it.
-static inline int
-load_source(FrontendOptions &fe,
-            cl_program program)
-{
-  char source_file[POCL_FILENAME_LENGTH];
-  POCL_RETURN_ERROR_ON(pocl_cache_write_program_source(source_file, program),
-                       CL_OUT_OF_HOST_MEMORY, "Could not write program source");
-
-  fe.Inputs.push_back
-    (FrontendInputFile(source_file, clang::IK_OpenCL));
-
-  return 0;
-}
-
-// Unlink input sources
-static inline int
-unlink_source(FrontendOptions &fe)
-{
-  // don't unlink in debug mode
-  if (pocl_get_bool_option("POCL_DEBUG", 0))
-    return 0;
-
-  FrontendInputFile const& file = fe.Inputs.front();
-  if (file.isFile() && !file.isSystem()) {
-    return pocl_remove(file.getFile().str().c_str());
-  } else {
-    return 0; // nothing to do
-  }
-
-}
-
-#ifndef LLVM_OLDER_THAN_3_8
-#define PassManager legacy::PassManager
-#endif
-
-static llvm::Module*
-ParseIRFile(const char* fname, SMDiagnostic &Err, llvm::LLVMContext &ctx)
-{
-    return parseIRFile(fname, Err, ctx).release();
-}
-
-static void get_build_log(cl_program program,
-                         unsigned device_i,
-                         std::stringstream &ss_build_log,
-                         clang::TextDiagnosticBuffer *diagsBuffer,
-                         const SourceManager &sm)
-{
-    static const bool show_log = pocl_get_bool_option("POCL_VERBOSE", 0) ||
-      pocl_get_bool_option("POCL_DEBUG", 0);
-
-    for (TextDiagnosticBuffer::const_iterator i = diagsBuffer->err_begin(),
-         e = diagsBuffer->err_end(); i != e; ++i)
-      {
-        ss_build_log << "error: " << i->first.printToString(sm)
-                     << ": " << i->second << std::endl;
-      }
-    for (TextDiagnosticBuffer::const_iterator i = diagsBuffer->warn_begin(),
-         e = diagsBuffer->warn_end(); i != e; ++i)
-      {
-        ss_build_log << "warning: " << i->first.printToString(sm)
-                     << ": " << i->second << std::endl;
-      }
-
-    pocl_cache_append_to_buildlog(program, device_i,
-                                  ss_build_log.str().c_str(),
-                                  ss_build_log.str().size());
-
-    if (show_log)
-      std::cerr << ss_build_log.str();
-
-}
-
-
-int pocl_llvm_build_program(cl_program program, 
-                            unsigned device_i,
-                            const char* user_options_cstr,
-                            char* program_bc_path)
-
-{
-  void* write_lock = NULL;
-  char tempfile[POCL_FILENAME_LENGTH];
-  tempfile[0] = 0;
-  llvm::Module **mod = NULL;
-  std::string user_options(user_options_cstr ? user_options_cstr : "");
-  std::string content;
-  llvm::raw_string_ostream sos(content);
-  size_t n = 0;
-
-  llvm::MutexGuard lockHolder(kernelCompilerLock);
-  InitializeLLVM();
-
-  // Use CompilerInvocation::CreateFromArgs to initialize
-  // CompilerInvocation. This way we can reuse the Clang's
-  // command line parsing.
-  llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID =
-    new clang::DiagnosticIDs();
-  llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagOpts =
-    new clang::DiagnosticOptions();
-  clang::TextDiagnosticBuffer *diagsBuffer =
-    new clang::TextDiagnosticBuffer();
-
-  clang::DiagnosticsEngine diags(diagID, &*diagOpts, diagsBuffer);
-
-  CompilerInstance CI;
-  CompilerInvocation &pocl_build = CI.getInvocation();
-
-  std::stringstream ss;
-  std::stringstream ss_build_log;
-
-  // add device specific switches, if any
-  // TODO this currently passes NULL as device tmpdir
-  cl_device_id device = program->devices[device_i];
-  if (device->ops->init_build != NULL)
-    {
-      char *device_switches =
-        device->ops->init_build (device->data);
-      if (device_switches != NULL)
-        {
-          ss << device_switches << " ";
-        }
-      POCL_MEM_FREE(device_switches);
-    }
-
-  llvm::StringRef extensions(device->extensions);
-
-  if (extensions.size() > 0) {
-    size_t e_start = 0, e_end = 0;
-    while (e_end < std::string::npos) {
-      e_end = extensions.find(' ', e_start);
-      llvm::StringRef tok = extensions.slice(e_start, e_end);
-      e_start = e_end + 1;
-      ss << "-D" << tok.str() << " ";
-#ifndef LLVM_OLDER_THAN_4_0
-      ss << "-cl-ext=" << tok.str() << " ";
-#endif
-    }
-  }
-
-  // This can cause illegal optimizations when unaware
-  // of the barrier semantics. -O2 is the default opt level in
-  // Clang for OpenCL C and seems to affect the performance
-  // of the end result, even if we optimize the final WG
-  // func. TODO: There should be 'noduplicate' etc. flags in 
-  // the 'barrier' function to prevent them.
-  // ss << "-O2 ";
-
-  ss << "-x cl ";
-  // Remove the inline keywords to force the user functions
-  // to be included in the program. Otherwise they will
-  // be removed and not inlined due to -O0.
-  ss << "-Dinline= ";
-  // The current directory is a standard search path.
-  ss << "-I. ";
-
-  // required for clGetKernelArgInfo()
-  ss << "-cl-kernel-arg-info ";
-
-  ss << user_options << " ";
-
-  if (device->endian_little)
-    ss << "-D__ENDIAN_LITTLE__=1 ";
-
-  if (device->image_support)
-    ss << "-D__IMAGE_SUPPORT__=1 ";
-
-  ss << "-DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device->global_var_max_size << " ";
-
-  if (user_options.find("cl-fast-relaxed-math") != std::string::npos)
-    ss << "-D__FAST_RELAXED_MATH__=1 ";
-
-  ss << "-D__OPENCL_VERSION__=" << device->cl_version_int << " ";
-
-  if (user_options.find("-cl-std=") == std::string::npos)
-    ss << "-cl-std=" << device->cl_version_std << " ";
-
-  std::string temp(ss.str());
-  size_t pos = temp.find("-cl-std=CL");
-  pos += 10;
-  int cl_std_major = temp.c_str()[pos] - '0';
-  int cl_std_minor = temp.c_str()[pos+2] - '0';
-  int cl_std_i = cl_std_major * 100 + cl_std_minor * 10;
-  ss << "-D__OPENCL_C_VERSION__=" << cl_std_i << " ";
-
-  /* With fp-contract we get calls to fma with processors which do not
-     have fma instructions. These ruin the performance. Better to have
-     the mul+add separated in the IR. */
-  ss << "-fno-builtin -ffp-contract=off ";
-  // This is required otherwise the initialization fails with
-  // unknown triple ''
-  ss << "-triple=" << device->llvm_target_triplet << " ";
-  if (device->llvm_cpu != NULL)
-    ss << "-target-cpu " << device->llvm_cpu << " ";
-
-  POCL_MSG_PRINT_INFO("all build options: %s\n", ss.str().c_str());
-
-  std::istream_iterator<std::string> begin(ss);
-  std::istream_iterator<std::string> end;
-  std::istream_iterator<std::string> i = begin;
-  std::vector<const char*> itemcstrs;
-  std::vector<std::string> itemstrs;
-  while (i != end) 
-    {
-      itemstrs.push_back(*i);
-      ++i;
-    }
-  for (unsigned idx=0; idx<itemstrs.size(); idx++)
-    {
-      // note: if itemstrs is modified after this, itemcstrs will be full
-      // of invalid pointers! Could make copies, but would have to clean up then...
-      itemcstrs.push_back(itemstrs[idx].c_str());
-    }
-#ifdef DEBUG_POCL_LLVM_API
-  // TODO: for some reason the user_options are replicated,
-  // they appear twice in a row in the output
-  std::cerr << "### options: " << ss.str() 
-            << "user_options: " << user_options << std::endl;
-#endif
-
-  if (program->build_log[device_i])
-    POCL_MEM_FREE(program->build_log[device_i]);
-
-  if (!CompilerInvocation::CreateFromArgs
-      (pocl_build, itemcstrs.data(), itemcstrs.data() + itemcstrs.size(),
-       diags)) {
-    pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
-                                       program_bc_path);
-    get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
-    return CL_INVALID_BUILD_OPTIONS;
-  }
-
-  LangOptions *la = pocl_build.getLangOpts();
-  PreprocessorOptions &po = pocl_build.getPreprocessorOpts();
-
-#ifdef LLVM_OLDER_THAN_3_9
-  pocl_build.setLangDefaults
-    (*la, clang::IK_OpenCL, clang::LangStandard::lang_opencl12);
-#else
-  llvm::Triple triple(device->llvm_target_triplet);
-  pocl_build.setLangDefaults
-    (*la, clang::IK_OpenCL, triple, po, clang::LangStandard::lang_opencl12);
-#endif
-
-  // LLVM 3.3 and older do not set that char is signed which is
-  // defined by the OpenCL C specs (but not by C specs).
-  la->CharIsSigned = true;
-
-  // the per-file types don't seem to override this
-  la->OpenCLVersion = cl_std_i;
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-  la->FakeAddressSpaceMap = true;
-#else
-  la->FakeAddressSpaceMap = false;
-#endif
-  la->Blocks = true; //-fblocks
-  la->MathErrno = false; // -fno-math-errno
-  la->NoBuiltin = true;  // -fno-builtin
-  la->AsmBlocks = true;  // -fasm (?)
-
-  std::string kernelh;
-  if (pocl_get_bool_option("POCL_BUILDING", 0)) {
-    kernelh  = SRCDIR;
-    kernelh += "/include/_kernel.h";
-  } else {
-    kernelh = PKGDATADIR;
-    kernelh += "/include/_kernel.h";
-  }
-  po.Includes.push_back(kernelh);
-
-  clang::TargetOptions &ta = pocl_build.getTargetOpts();
-  ta.Triple = device->llvm_target_triplet;
-  if (device->llvm_cpu != NULL)
-    ta.CPU = device->llvm_cpu;
-
-#ifdef DEBUG_POCL_LLVM_API
-  std::cout << "### Triple: " << ta.Triple.c_str() <<  ", CPU: " << ta.CPU.c_str();
-#endif
-  CI.createDiagnostics(diagsBuffer, false);
-
-  FrontendOptions &fe = pocl_build.getFrontendOpts();
-  // The CreateFromArgs created an stdin input which we should remove first.
-  fe.Inputs.clear();
-  if (load_source(fe, program) != 0)
-    return CL_OUT_OF_HOST_MEMORY;
-
-  CodeGenOptions &cg = pocl_build.getCodeGenOpts();
-  cg.EmitOpenCLArgMetadata = true;
-  cg.StackRealignment = true;
-  // Let the vectorizer or another optimization pass unroll the loops,
-  // in case it sees beneficial.
-  cg.UnrollLoops = false;
-  // Lets leave vectorization to later compilation phase
-  cg.VectorizeLoop = false;
-  cg.VectorizeSLP = false;
-  cg.VectorizeBB = false;
-  // This workarounds a Frontend codegen issues with an illegal address
-  // space cast which is later flattened (and thus implicitly fixed) in
-  // the TargetAddressSpaces. See:  https://github.com/pocl/pocl/issues/195
-  cg.VerifyModule = false;
-
-  PreprocessorOutputOptions &poo = pocl_build.getPreprocessorOutputOpts();
-  poo.ShowCPP = 1;
-  poo.ShowComments = 0;
-  poo.ShowLineMarkers = 0;
-  poo.ShowMacroComments = 0;
-  poo.ShowMacros = 1;
-  poo.RewriteIncludes = 0;
-
-  std::string saved_output(fe.OutputFile);
-  pocl_cache_mk_temp_name(tempfile);
-  fe.OutputFile = tempfile;
-
-  bool success = true;
-  clang::PrintPreprocessedAction Preprocess;
-  success = CI.ExecuteAction(Preprocess);
-  char *PreprocessedOut = nullptr;
-  uint64_t PreprocessedSize = 0;
-
-  if (success) {
-    pocl_read_file(tempfile, &PreprocessedOut, &PreprocessedSize);
-    pocl_remove(tempfile);
-    fe.OutputFile = saved_output;
-  }
-
-  if (PreprocessedOut == nullptr) {
-    pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
-                                       program_bc_path);
-    get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
-    return CL_BUILD_PROGRAM_FAILURE;
-  }
-
-  pocl_cache_create_program_cachedir(program, device_i, PreprocessedOut,
-                                     static_cast<size_t>(PreprocessedSize), program_bc_path);
-
-  POCL_MEM_FREE(PreprocessedOut);
-
-  if (pocl_exists(program_bc_path)) {
-    unlink_source(fe);
-    return CL_SUCCESS;
-  }
-
-  // TODO: use pch: it is possible to disable the strict checking for
-  // the compilation flags used to compile it and the current translation
-  // unit via the preprocessor options directly.
-  clang::EmitLLVMOnlyAction EmitLLVM(GlobalContext());
-  success = CI.ExecuteAction(EmitLLVM);
-
-  unlink_source(fe);
-
-  get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
-
-  if (!success)
-    return CL_BUILD_PROGRAM_FAILURE;
-
-  mod = (llvm::Module **)&program->llvm_irs[device_i];
-  if (*mod != NULL)
-    delete (llvm::Module*)*mod;
-
-  *mod = EmitLLVM.takeModule().release();
-
-  if (*mod == NULL)
-    return CL_BUILD_PROGRAM_FAILURE;
-
-  write_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
-  assert(write_lock);
-
-  /* Always retain program.bc. Its required in clBuildProgram */
-  pocl_write_module(*mod, program_bc_path, 0);
-
-  POCL_MSG_PRINT_INFO("Wrote program.bc to %s.\n", program_bc_path);
-
-  /* To avoid writing & reading the same back,
-   * save program->binaries[i]
-   */
-  WriteBitcodeToFile(*mod, sos);
-  sos.str(); // flush
-
-  if (program->binaries[device_i])
-    POCL_MEM_FREE(program->binaries[device_i]);
-
-  n = content.size();
-  program->binary_sizes[device_i] = n;
-  program->binaries[device_i] = (unsigned char *) malloc(n);
-  std::memcpy(program->binaries[device_i], content.c_str(), n);
-
-  pocl_cache_release_lock(write_lock);
-
-  return CL_SUCCESS;
-}
-
-// The old way of getting kernel metadata from "opencl.kernels" module meta.
-// LLVM < 3.9 and SPIR
-static int pocl_get_kernel_arg_module_metadata(const char* kernel_name,
-                                               llvm::Module *input,
-                                               cl_kernel kernel)
-{
-  // find the right kernel in "opencl.kernels" metadata
-  llvm::NamedMDNode *opencl_kernels = input->getNamedMetadata("opencl.kernels");
-  llvm::MDNode *kernel_metadata = NULL;
-
-  assert(opencl_kernels && opencl_kernels->getNumOperands());
-
-  for (unsigned i = 0, e = opencl_kernels->getNumOperands(); i != e; ++i) {
-    llvm::MDNode *kernel_iter = opencl_kernels->getOperand(i);
-
-    llvm::Value *meta =
-      dyn_cast<llvm::ValueAsMetadata>(kernel_iter->getOperand(0))->getValue();
-    llvm::Function *kernel_prototype = llvm::cast<llvm::Function>(meta);
-    std::string name = kernel_prototype->getName().str();
-    if (name == kernel_name) {
-      kernel_metadata = kernel_iter;
-      break;
-    }
-  }
-
-  kernel->arg_info =
-    (struct pocl_argument_info*)calloc(
-      kernel->num_args, sizeof(struct pocl_argument_info));
-  memset(
-    kernel->arg_info, 0, sizeof(struct pocl_argument_info) * kernel->num_args);
-
-  kernel->has_arg_metadata = 0;
-
-  assert(kernel_metadata && "kernel NOT found in opencl.kernels metadata");
-
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-  int BitcodeIsSPIR = input->getTargetTriple().find("spir") == 0;
-#endif
-
-  unsigned e = kernel_metadata->getNumOperands();
-  for (unsigned i = 1; i != e; ++i) {
-    llvm::MDNode *meta_node =
-      llvm::cast<MDNode>(kernel_metadata->getOperand(i));
-
-    // argument num
-    unsigned arg_num = meta_node->getNumOperands();
-#ifndef NDEBUG
-    int has_meta_for_every_arg = ((arg_num-1) == kernel->num_args);
-#endif
-
-    llvm::MDString *meta_name_node = llvm::cast<MDString>(meta_node->getOperand(0));
-    std::string meta_name = meta_name_node->getString().str();
-
-    for (unsigned j = 1; j != arg_num; ++j) {
-      llvm::Value *meta_arg_value = NULL;
-      if (isa<ValueAsMetadata>(meta_node->getOperand(j)))
-        meta_arg_value =
-          dyn_cast<ValueAsMetadata>(meta_node->getOperand(j))->getValue();
-      else if (isa<ConstantAsMetadata>(meta_node->getOperand(j)))
-        meta_arg_value =
-          dyn_cast<ConstantAsMetadata>(meta_node->getOperand(j))->getValue();
-      struct pocl_argument_info* current_arg = &kernel->arg_info[j-1];
-
-      if (meta_arg_value != NULL && isa<ConstantInt>(meta_arg_value) &&
-          meta_name == "kernel_arg_addr_space") {
-        assert(has_meta_for_every_arg && "kernel_arg_addr_space meta incomplete");
-        kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ADDRESS_QUALIFIER;
-        //std::cout << "is ConstantInt /  kernel_arg_addr_space" << std::endl;
-        llvm::ConstantInt *m = llvm::cast<ConstantInt>(meta_arg_value);
-        uint64_t val = m->getLimitedValue(UINT_MAX);
-        bool SPIRAddressSpaceIDs;
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-        SPIRAddressSpaceIDs = BitcodeIsSPIR;
-#else
-        // We have an LLVM fixed to produce always SPIR AS ids for the argument
-        // info metadata.
-        SPIRAddressSpaceIDs = true;
-#endif
-
-        if (SPIRAddressSpaceIDs) {
-          switch(val) {
-            case 0:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
-            case 1:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL; break;
-            case 3:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL; break;
-            case 2:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT; break;
-          }
-        } else {
-          switch(val) {
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-            case POCL_FAKE_AS_PRIVATE:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
-            case POCL_FAKE_AS_GLOBAL:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL; break;
-            case POCL_FAKE_AS_LOCAL:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL; break;
-            case POCL_FAKE_AS_CONSTANT:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT; break;
-            case POCL_FAKE_AS_GENERIC:
-              current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
-#endif
-          default:
-            POCL_MSG_ERR("Unknown address space ID %lu\n", val);
-            break;
-          }
-        }
-      }
-      else if (isa<MDString>(meta_node->getOperand(j))) {
-        //std::cout << "is MDString" << std::endl;
-        llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
-        std::string val = m->getString().str();
-
-        if (meta_name == "kernel_arg_access_qual") {
-          assert(has_meta_for_every_arg && "kernel_arg_access_qual meta incomplete");
-          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ACCESS_QUALIFIER;
-          if (val == "read_write")
-            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_WRITE;
-          else if (val == "read_only")
-            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_ONLY;
-          else if (val == "write_only")
-            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
-          else if (val == "none")
-            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_NONE;
-          else
-            std::cout << "UNKNOWN kernel_arg_access_qual value: " << val << std::endl;
-        } else if (meta_name == "kernel_arg_type") {
-          assert(has_meta_for_every_arg && "kernel_arg_type meta incomplete");
-          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_NAME;
-          current_arg->type_name = new char[val.size() + 1];
-          std::strcpy(current_arg->type_name, val.c_str());
-        } else if (meta_name == "kernel_arg_base_type") {
-          // may or may not be present even in SPIR
-        } else if (meta_name == "kernel_arg_type_qual") {
-          assert(has_meta_for_every_arg && "kernel_arg_type_qual meta incomplete");
-          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_QUALIFIER;
-          current_arg->type_qualifier = 0;
-          if (val.find("const") != std::string::npos)
-            current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_CONST;
-          if (val.find("restrict") != std::string::npos)
-            current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_RESTRICT;
-          if (val.find("volatile") != std::string::npos)
-            current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_VOLATILE;
-        } else if (meta_name == "kernel_arg_name") {
-          assert(has_meta_for_every_arg && "kernel_arg_name meta incomplete");
-          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_NAME;
-          current_arg->name = new char[val.size() + 1];
-          std::strcpy(current_arg->name, val.c_str());
-        } else
-          std::cout << "UNKNOWN opencl metadata name: " << meta_name << std::endl;
-      }
-      else if (meta_name != "reqd_work_group_size")
-        std::cout << "UNKNOWN opencl metadata class for: " << meta_name << std::endl;
-
-    }
-  }
-  return 0;
-}
-
-#ifndef LLVM_OLDER_THAN_3_9
-// Clang 3.9 uses function metadata instead of module metadata for presenting
-// OpenCL kernel information.
-static int pocl_get_kernel_arg_function_metadata(const char* kernel_name,
-                                                 llvm::Module *input,
-                                                 cl_kernel kernel)
-{
-  llvm::Function *Kernel = NULL;
-  int bitcode_is_spir = input->getTargetTriple().find("spir") == 0;
-
-  // SPIR still uses the "opencl.kernels" MD.
-  if(bitcode_is_spir)
-    return pocl_get_kernel_arg_module_metadata(kernel_name, input, kernel);
-
-  for (llvm::Module::iterator i = input->begin(), e = input->end();
-       i != e; ++i) {
-    if (i->getMetadata("kernel_arg_access_qual")
-        && i->getName() == kernel_name)
-      {
-        Kernel = &*i;
-        break;
-      }
-  }
-  assert(Kernel);
-  kernel->has_arg_metadata = 0;
-
-  llvm::MDNode *meta_node;
-  llvm::Value *meta_arg_value = NULL;
-  struct pocl_argument_info* current_arg = NULL;
-
-  kernel->arg_info =
-    (struct pocl_argument_info*)calloc(
-      kernel->num_args, sizeof(struct pocl_argument_info));
-  memset(
-    kernel->arg_info, 0, sizeof(struct pocl_argument_info) * kernel->num_args);
-
-  // kernel_arg_addr_space
-  meta_node = Kernel->getMetadata("kernel_arg_addr_space");
-  assert(meta_node != nullptr);
-  unsigned arg_num = meta_node->getNumOperands();
-#ifndef NDEBUG
-  int has_meta_for_every_arg = (arg_num == kernel->num_args);
-#endif
-  for (unsigned j = 0; j < arg_num; ++j) {
-    assert(has_meta_for_every_arg && "kernel_arg_addr_space meta incomplete");
-
-    current_arg = &kernel->arg_info[j];
-    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ADDRESS_QUALIFIER;
-    //std::cout << "is ConstantInt /  kernel_arg_addr_space" << std::endl;
-     meta_arg_value =
-          dyn_cast<ConstantAsMetadata>(meta_node->getOperand(j))->getValue();
-    llvm::ConstantInt *m = llvm::cast<ConstantInt>(meta_arg_value);
-    uint64_t val = m->getLimitedValue(UINT_MAX);
-
-    bool SPIRAddressSpaceIDs;
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-    SPIRAddressSpaceIDs = bitcode_is_spir;
-#else
-    // We have an LLVM fixed to produce always SPIR AS ids for the argument
-    // info metadata.
-    SPIRAddressSpaceIDs = true;
-#endif
-    if (SPIRAddressSpaceIDs) {
-      switch(val) {
-      case 0:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
-      case 1:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL; break;
-      case 3:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL; break;
-      case 2:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT; break;
-      default:
-        POCL_MSG_ERR("Unknown address space ID %lu\n", val);
-        break;
-      }
-    } else {
-      switch(val) {
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-      case POCL_FAKE_AS_PRIVATE:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
-      case POCL_FAKE_AS_GLOBAL:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL; break;
-      case POCL_FAKE_AS_LOCAL:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL; break;
-      case POCL_FAKE_AS_CONSTANT:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT; break;
-      case POCL_FAKE_AS_GENERIC:
-        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE; break;
-#endif
-      default:
-        POCL_MSG_ERR("Unknown address space ID %lu\n", val);
-        break;
-      }
-    }
-  }
-
-  // kernel_arg_access_qual
-  meta_node = Kernel->getMetadata("kernel_arg_access_qual");
-  arg_num = meta_node->getNumOperands();
-#ifndef NDEBUG
-  has_meta_for_every_arg = (arg_num == kernel->num_args);
-#endif
-  assert(has_meta_for_every_arg && "kernel_arg_access_qual meta incomplete");
-
-  for (unsigned j= 0; j < meta_node->getNumOperands(); ++j) {
-    current_arg = &kernel->arg_info[j];
-    //std::cout << "is MDString" << std::endl;
-    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
-    std::string val = m->getString().str();
-
-    assert(has_meta_for_every_arg && "kernel_arg_access_qual meta incomplete");
-    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ACCESS_QUALIFIER;
-    if (val == "read_write")
-      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_WRITE;
-    else if (val == "read_only")
-      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_ONLY;
-    else if (val == "write_only")
-      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
-    else if (val == "none")
-      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_NONE;
-    else
-      std::cout << "UNKNOWN kernel_arg_access_qual value: " << val << std::endl;
-  }
-
-  // kernel_arg_type
-  meta_node = Kernel->getMetadata("kernel_arg_type");
-  assert(meta_node != nullptr);
-  arg_num = meta_node->getNumOperands();
-#ifndef NDEBUG
-  has_meta_for_every_arg = (arg_num == kernel->num_args);
-#endif
-  assert(has_meta_for_every_arg && "kernel_arg_type meta incomplete");
-
-  for (unsigned j= 0; j < meta_node->getNumOperands(); ++j) {
-    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
-    std::string val = m->getString().str();
-
-    current_arg = &kernel->arg_info[j];
-    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_NAME;
-    current_arg->type_name = new char[val.size() + 1];
-    std::strcpy(current_arg->type_name, val.c_str());
-  }
-
-  // kernel_arg_type_qual
-  meta_node = Kernel->getMetadata("kernel_arg_type_qual");
-  arg_num = meta_node->getNumOperands();
-#ifndef NDEBUG
-  has_meta_for_every_arg = (arg_num == kernel->num_args);
-#endif
-  assert(has_meta_for_every_arg && "kernel_arg_type_qual meta incomplete");
-  for (unsigned j= 0; j < meta_node->getNumOperands(); ++j) {
-    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
-    std::string val = m->getString().str();
-
-    current_arg = &kernel->arg_info[j];
-    assert(has_meta_for_every_arg && "kernel_arg_type_qual meta incomplete");
-    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_QUALIFIER;
-    current_arg->type_qualifier = 0;
-    if (val.find("const") != std::string::npos)
-      current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_CONST;
-    if (val.find("restrict") != std::string::npos)
-      current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_RESTRICT;
-    if (val.find("volatile") != std::string::npos)
-      current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_VOLATILE;
-  }
-
-  //kernel_arg_name
-  meta_node = Kernel->getMetadata("kernel_arg_name");
-  arg_num = meta_node->getNumOperands();
-#ifndef NDEBUG
-  has_meta_for_every_arg = (arg_num == kernel->num_args);
-#endif
-  assert(has_meta_for_every_arg && "kernel_arg_name meta incomplete");
-  for (unsigned j= 0; j < meta_node->getNumOperands(); ++j) {
-    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
-    std::string val = m->getString().str();
-
-    current_arg = &kernel->arg_info[j];
-    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_NAME;
-    current_arg->name = new char[val.size() + 1];
-    std::strcpy(current_arg->name, val.c_str());
-  }
-
-  return 0;
-}
-#endif
-
-int pocl_llvm_get_kernel_metadata(cl_program program,
-                                  cl_kernel kernel,
-                                  int device_i,
-                                  const char* kernel_name,
-                                  int * errcode)
-{
-
-  int i;
-  llvm::Module *input = NULL;
-  cl_device_id Device = program->devices[device_i];
-
-  assert(Device->llvm_target_triplet &&
-         "Device has no target triple set");
-
-  if (program->llvm_irs != NULL &&
-      program->llvm_irs[device_i] != NULL)
-    input = (llvm::Module*)program->llvm_irs[device_i];
-  else {
-    *errcode = CL_INVALID_PROGRAM_EXECUTABLE;
-    return 1;
-  }
-
-  llvm::Function *KernelFunction = input->getFunction(kernel_name);
-  if (!KernelFunction) {
-    *errcode = CL_INVALID_KERNEL_NAME;
-    return 1;
-  }
-  kernel->num_args = KernelFunction->getArgumentList().size();
-
-#if defined(LLVM_OLDER_THAN_3_9)
-  if (pocl_get_kernel_arg_module_metadata(kernel_name, input, kernel)) {
-    *errcode = CL_INVALID_KERNEL;
-    return 1;
-  }
-#else
-  if (pocl_get_kernel_arg_function_metadata(kernel_name, input, kernel)) {
-    *errcode = CL_INVALID_KERNEL;
-    return 1;
-  }
-#endif
-
-#ifdef DEBUG_POCL_LLVM_API
-  printf("### fetching kernel metadata for kernel %s program %p input llvm::Module %p\n",
-         kernel_name, program, input);
-#endif
-
-  DataLayout *TD = 0;
-#ifdef LLVM_OLDER_THAN_3_7
-  const std::string &ModuleDataLayout =
-    input->getDataLayout()->getStringRepresentation();
-#else
-  const std::string &ModuleDataLayout =
-    input->getDataLayout().getStringRepresentation();
-#endif
-  if (!ModuleDataLayout.empty())
-    TD = new DataLayout(ModuleDataLayout);
-
-  SmallVector<GlobalVariable *, 8> locals;
-  for (llvm::Module::global_iterator i = input->global_begin(),
-         e = input->global_end();
-       i != e; ++i) {
-    std::string funcName = "";
-    funcName = KernelFunction->getName().str();
-    if (pocl::isAutomaticLocal(funcName, *i)) {
-      POCL_MSG_PRINT_INFO("Automatic local detected: %s\n",
-                          i->getName().str().c_str());
-      locals.push_back(&*i);
-    }
-  }
-
-  kernel->num_locals = locals.size();
-
-  /* Temporary store for the arguments that are set with clSetKernelArg. */
-  kernel->dyn_arguments =
-    (struct pocl_argument *) malloc ((kernel->num_args + kernel->num_locals) *
-                                     sizeof (struct pocl_argument));
-  /* Initialize kernel "dynamic" arguments (in case the user doesn't). */
-  for (unsigned i = 0; i < kernel->num_args; ++i)
-    {
-      kernel->dyn_arguments[i].value = NULL;
-      kernel->dyn_arguments[i].size = 0;
-    }
-
-  /* Fill up automatic local arguments. */
-  for (unsigned i = 0; i < kernel->num_locals; ++i)
-    {
-      unsigned auto_local_size =
-        TD->getTypeAllocSize(locals[i]->getInitializer()->getType());
-      kernel->dyn_arguments[kernel->num_args + i].value = NULL;
-      kernel->dyn_arguments[kernel->num_args + i].size = auto_local_size;
-#ifdef DEBUG_POCL_LLVM_API
-      printf("### automatic local %d size %u\n", i, auto_local_size);
-#endif
-    }
-
-  const llvm::Function::ArgumentListType &ArgList =
-    KernelFunction->getArgumentList();
-
-  i = 0;
-  for (llvm::Function::const_arg_iterator ii = ArgList.begin(),
-                                          ee = ArgList.end();
-       ii != ee ; ii++) {
-    llvm::Type *t = ii->getType();
-    struct pocl_argument_info &ArgInfo = kernel->arg_info[i];
-    ArgInfo.type = POCL_ARG_TYPE_NONE;
-    ArgInfo.is_local = false;
-    const llvm::PointerType *p = dyn_cast<llvm::PointerType>(t);
-    if (p && !ii->hasByValAttr()) {
-      ArgInfo.type = POCL_ARG_TYPE_POINTER;
-      // index 0 is for function attributes, parameters start at 1.
-      // TODO: detect the address space from MD.
-
-#ifndef POCL_USE_FAKE_ADDR_SPACE_IDS
-      if (ArgInfo.address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)
-        ArgInfo.is_local = true;
-#else
-      if (p->getAddressSpace() == POCL_FAKE_AS_GLOBAL ||
-          p->getAddressSpace() == POCL_FAKE_AS_CONSTANT ||
-          pocl::is_image_type(*t) || pocl::is_sampler_type(*t))
-        {
-          kernel->arg_info[i].is_local = false;
-        }
-      else
-        {
-          if (p->getAddressSpace() != POCL_FAKE_AS_LOCAL)
-            {
-              p->dump();
-              assert(p->getAddressSpace() == POCL_FAKE_AS_LOCAL);
-            }
-          kernel->arg_info[i].is_local = true;
-        }
-#endif
-    }
-
-    if (pocl::is_image_type(*t)) {
-      kernel->arg_info[i].type = POCL_ARG_TYPE_IMAGE;
-    } else if (pocl::is_sampler_type(*t)) {
-      kernel->arg_info[i].type = POCL_ARG_TYPE_SAMPLER;
-    }
-    i++;
-  }
-  // fill 'kernel->reqd_wg_size'
-  kernel->reqd_wg_size = (int*)malloc(3*sizeof(int));
-
-  unsigned reqdx = 0, reqdy = 0, reqdz = 0;
-
-#ifdef LLVM_OLDER_THAN_3_9
-  llvm::NamedMDNode *size_info =
-    KernelFunction->getParent()->getNamedMetadata("opencl.kernel_wg_size_info");
-  if (size_info) {
-    for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
-      llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
-      if (dyn_cast<ValueAsMetadata>(
-        KernelSizeInfo->getOperand(0).get())->getValue() != KernelFunction)
-        continue;
-      reqdx = (llvm::cast<ConstantInt>(
-                 llvm::dyn_cast<ConstantAsMetadata>(
-                   KernelSizeInfo->getOperand(1))->getValue()))->getLimitedValue();
-      reqdy = (llvm::cast<ConstantInt>(
-                 llvm::dyn_cast<ConstantAsMetadata>(
-                   KernelSizeInfo->getOperand(2))->getValue()))->getLimitedValue();
-      reqdz = (llvm::cast<ConstantInt>(
-                 llvm::dyn_cast<ConstantAsMetadata>(
-                   KernelSizeInfo->getOperand(3))->getValue()))->getLimitedValue();
-      break;
-    }
-  }
-#else
-  llvm::MDNode *ReqdWGSize =
-    KernelFunction->getMetadata("reqd_work_group_size");
-  if (ReqdWGSize != NULL) {
-    reqdx = (llvm::cast<ConstantInt>(
-               llvm::dyn_cast<ConstantAsMetadata>(
-                 ReqdWGSize->getOperand(0))->getValue()))->getLimitedValue();
-    reqdy = (llvm::cast<ConstantInt>(
-               llvm::dyn_cast<ConstantAsMetadata>(
-                 ReqdWGSize->getOperand(1))->getValue()))->getLimitedValue();
-    reqdz = (llvm::cast<ConstantInt>(
-               llvm::dyn_cast<ConstantAsMetadata>(
-                 ReqdWGSize->getOperand(2))->getValue()))->getLimitedValue();
-  }
-#endif
-
-  kernel->reqd_wg_size[0] = reqdx;
-  kernel->reqd_wg_size[1] = reqdy;
-  kernel->reqd_wg_size[2] = reqdz;
-
-#ifndef POCL_ANDROID
-  // Generate the kernel_obj.c file. This should be optional
-  // and generated only for the heterogeneous standalone devices which
-  // need the definitions to accompany the kernels, for the launcher
-  // code.
-  // TODO: the scripts use a generated kernel.h header file that
-  // gets added to this file. No checks seem to fail if that file
-  // is missing though, so it is left out from there for now
-
-  std::stringstream content;
-
-  content << std::endl << "#include <pocl_device.h>" << std::endl
-          << "void _pocl_launcher_" << kernel_name
-          << "_workgroup(void** args, struct pocl_context*);" << std::endl
-          << "void _pocl_launcher_" << kernel_name
-          << "_workgroup_fast(void** args, struct pocl_context*);" << std::endl;
-
-  if (Device->global_as_id != 0)
-    content << "__attribute__((address_space(" << Device->global_as_id << ")))"
-            << std::endl;
-
-  content << "__kernel_metadata _" << kernel_name << "_md = {" << std::endl
-          << "     \"" << kernel_name << "\"," << std::endl
-          << "     " << kernel->num_args << "," << std::endl
-          << "     " << kernel->num_locals << "," << std::endl
-          << "     _pocl_launcher_" << kernel_name << "_workgroup_fast" << std::endl
-          << " };" << std::endl;
-
-  pocl_cache_write_descriptor(program, device_i,
-                              kernel_name, content.str().c_str(),
-                              content.str().size());
-#endif
-
-  *errcode = CL_SUCCESS;
-  return 0;
-}
-
-char* get_cpu_name() {
-#ifdef __mips__
-  // The MIPS backend isn't able to automatically detect the host yet and the
-  // value returned by llvm::sys::getHostCPUName() isn't usable in the
-  // -target-cpu option so we must use the CPU detected by CMake.
-  StringRef r = OCL_KERNEL_TARGET_CPU;
-#else
-  StringRef r = llvm::sys::getHostCPUName();
-#endif
-
-#ifdef LLVM_3_8
-  // https://github.com/pocl/pocl/issues/413
-  if (r.str() == "skylake") {
-    r = llvm::StringRef("haswell");
-  }
-#endif
-
-  assert(r.size() > 0);
-  char* cpu_name = (char*) malloc (r.size()+1);
-  strncpy(cpu_name, r.data(), r.size());
-  cpu_name[r.size()] = 0;
-  return cpu_name;
-}
-
-/* helpers copied from LLVM opt START */
-
-/* FIXME: these options should come from the cl_device, and
- * cl_program's options. */
-static llvm::TargetOptions GetTargetOptions() {
-  llvm::TargetOptions Options;
-#ifdef LLVM_OLDER_THAN_3_9
-  Options.PositionIndependentExecutable = true;
-#endif
-  #ifdef HOST_FLOAT_SOFT_ABI
-  Options.FloatABIType = FloatABI::Soft;
-  #else
-  Options.FloatABIType = FloatABI::Hard;
-  #endif
-#if 0
-  Options.LessPreciseFPMADOption = EnableFPMAD;
-  Options.NoFramePointerElim = DisableFPElim;
-  Options.NoFramePointerElimNonLeaf = DisableFPElimNonLeaf;
-  Options.AllowFPOpFusion = FuseFPOps;
-  Options.UnsafeFPMath = EnableUnsafeFPMath;
-  Options.NoInfsFPMath = EnableNoInfsFPMath;
-  Options.NoNaNsFPMath = EnableNoNaNsFPMath;
-  Options.HonorSignDependentRoundingFPMathOption =
-  EnableHonorSignDependentRoundingFPMath;
-  Options.UseSoftFloat = GenerateSoftFloatCalls;
-  if (FloatABIForCalls != FloatABI::Default)
-    Options.FloatABIType = FloatABIForCalls;
-  Options.NoZerosInBSS = DontPlaceZerosInBSS;
-  Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
-  Options.DisableTailCalls = DisableTailCalls;
-  Options.StackAlignmentOverride = OverrideStackAlignment;
-  Options.RealignStack = EnableRealignStack;
-  Options.TrapFuncName = TrapFuncName;
-  Options.EnableSegmentedStacks = SegmentedStacks;
-  Options.UseInitArray = UseInitArray;
-  Options.SSPBufferSize = SSPBufferSize;
-#endif
-  return Options;
-}
-
-/* for "distro" style kernel libs, return which kernellib to use, at runtime */
-#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
-static const char* getX86KernelLibName() {
-  StringMap<bool> Features;
-  llvm::sys::getHostCPUFeatures(Features);
-  const char *res = NULL;
-
-  if (Features["sse2"])
-    res = "sse2";
-  else
-    POCL_ABORT("Pocl on x86_64 requires at least SSE2");
-  if (Features["ssse3"] && Features["cx16"])
-    res = "ssse3";
-  if (Features["sse4.1"] && Features["cx16"])
-    res = "sse41";
-  if (Features["avx"] && Features["cx16"] && Features["popcnt"])
-    res = "avx";
-  if (Features["avx"] && Features["cx16"] && Features["popcnt"]
-      && Features["xop"] && Features["fma4"])
-    res = "avx_fma4";
-  if (Features["avx"] && Features["avx2"] && Features["cx16"]
-      && Features["popcnt"] && Features["lzcnt"] && Features["f16c"]
-      && Features["fma"] && Features["bmi"] && Features["bmi2"])
-    res = "avx2";
-  if (Features["avx512f"] )
-    res = "avx512";
-
-  return res;
-}
-#endif
-
-// Returns the TargetMachine instance or zero if no triple is provided.
-static TargetMachine* GetTargetMachine(cl_device_id device,
- const std::vector<std::string>& MAttrs=std::vector<std::string>()) {
-
-  std::string Error;
-  Triple TheTriple(device->llvm_target_triplet);
-
-  std::string MCPU =  device->llvm_cpu ? device->llvm_cpu : "";
-
-  const Target *TheTarget = 
-    TargetRegistry::lookupTarget("", TheTriple, Error);
-  
-  // In LLVM 3.4 and earlier, the target registry falls back to 
-  // the cpp backend in case a proper match was not found. In 
-  // that case simply do not use target info in the compilation 
-  // because it can be an off-tree target not registered at
-  // this point (read: TCE).
-  if (!TheTarget || TheTarget->getName() == std::string("cpp")) {
-    return 0;
-  }
-  
-  // Package up features to be passed to target/subtarget
-  std::string FeaturesStr;
-  if (MAttrs.size()) {
-    SubtargetFeatures Features;
-    for (unsigned i = 0; i != MAttrs.size(); ++i)
-      Features.AddFeature(MAttrs[i]);
-    FeaturesStr = Features.getString();
-  }
-
-  TargetMachine* TM = TheTarget->createTargetMachine(TheTriple.getTriple(),
-                                                     MCPU, FeaturesStr, 
-                                                     GetTargetOptions(),
-                                                     Reloc::PIC_, 
-                                                     CodeModel::Default,
-                                                     CodeGenOpt::Aggressive);
-  assert (TM != NULL && "llvm target has no targetMachine constructor"); 
-  if (device->ops->init_target_machine)
-    device->ops->init_target_machine(device->data, TM);
-
-  return TM;
-}
-/* helpers copied from LLVM opt END */
-
-static void InitializeLLVM() {
-  
-  static bool LLVMInitialized = false;
-  if (LLVMInitialized) return;
-  // We have not initialized any pass managers for any device yet.
-  // Run the global LLVM pass initialization functions.
-  InitializeAllTargets();
-  InitializeAllTargetMCs();
-  InitializeAllAsmPrinters();
-  InitializeAllAsmParsers();
-
-  LLVMInitialized = true;
-}
-
-/**
- * Prepare the kernel compiler passes.
- *
- * The passes are created only once per program run per device.
- * The returned pass manager should not be modified, only the Module
- * should be optimized using it.
- */
-static PassManager& kernel_compiler_passes
-(cl_device_id device, const std::string& module_data_layout)
-{
-  static std::map<cl_device_id, PassManager*> kernel_compiler_passes;
-
-  bool SPMDDevice = device->spmd;
-
-  if (kernel_compiler_passes.find(device) != 
-      kernel_compiler_passes.end())
-    {
-      return *kernel_compiler_passes[device];
-    }
-  
-  Triple triple(device->llvm_target_triplet);
-
-  PassRegistry &Registry = *PassRegistry::getPassRegistry();
-
-  const bool first_initialization_call = kernel_compiler_passes.size() == 0;
-
-  if (first_initialization_call) {
-    // TODO: do this globally, and just once per program
-    initializeCore(Registry);
-    initializeScalarOpts(Registry);
-    initializeVectorization(Registry);
-    initializeIPO(Registry);
-    initializeAnalysis(Registry);
-#ifdef LLVM_OLDER_THAN_3_8
-    initializeIPA(Registry);
-#endif
-    initializeTransformUtils(Registry);
-    initializeInstCombine(Registry);
-    initializeInstrumentation(Registry);
-    initializeTarget(Registry);
-  }
-
-# ifdef LLVM_OLDER_THAN_3_7
-  StringMap<llvm::cl::Option*> opts;
-  llvm::cl::getRegisteredOptions(opts);
-# else
-  StringMap<llvm::cl::Option *>& opts = llvm::cl::getRegisteredOptions();
-# endif
-
-  PassManager *Passes = new PassManager();
-
-#ifdef LLVM_OLDER_THAN_3_7
-  // Need to setup the target info for target specific passes. */
-  TargetMachine *Machine = GetTargetMachine(device);
-
-  // Add internal analysis passes from the target machine.
-  if (Machine != NULL)
-    Machine->addAnalysisPasses(*Passes);
-#else 
-  TargetMachine *Machine = GetTargetMachine(device);
-  if (Machine != NULL)
-    Passes->add(createTargetTransformInfoWrapperPass(Machine->getTargetIRAnalysis()));
-#endif
-
-
-  if (module_data_layout != "") {
-#if (defined LLVM_OLDER_THAN_3_7)
-    Passes->add(new DataLayoutPass());
-#endif
-  }
-
-  /* Disables automated generation of libcalls from code patterns. 
-     TCE doesn't have a runtime linker which could link the libs later on.
-     Also the libcalls might be harmful for WG autovectorization where we 
-     want to try to vectorize the code it converts to e.g. a memset or 
-     a memcpy */
-#ifdef LLVM_OLDER_THAN_3_7
-  TargetLibraryInfo *TLI = new TargetLibraryInfo(triple);
-  TLI->disableAllFunctions();
-  Passes->add(TLI);
-#else
-  TargetLibraryInfoImpl TLII(triple);
-  TLII.disableAllFunctions();
-  Passes->add(new TargetLibraryInfoWrapperPass(TLII));
-#endif
-
-  /* The kernel compiler passes to run, in order.
-
-     Notes about the kernel compiler phase ordering:
-     -mem2reg first because we get unoptimized output from Clang where all
-     variables are allocas. Avoid context saving the allocas and make the
-     more readable by calling -mem2reg at the beginning.
-
-     -implicit-cond-barriers after -implicit-loop-barriers because the latter can inject
-     barriers to loops inside conditional regions after which the peeling should be 
-     avoided by injecting the implicit conditional barriers
-
-     -loop-barriers, -barriertails, and -barriers should be ran after the implicit barrier 
-     injection passes so they "normalize" the implicit barriers also
-
-     -phistoallocas before -workitemloops as otherwise it cannot inject context
-     restore code (PHIs need to be at the beginning of the BB and so one cannot
-     context restore them with non-PHI code if the value is needed in another PHI). */
-
-  std::vector<std::string> passes;
-  passes.push_back("handle-samplers");
-  passes.push_back("workitem-handler-chooser");
-  passes.push_back("mem2reg");
-  passes.push_back("domtree");
-  passes.push_back("break-constgeps");
-  if (device->autolocals_to_args)
-	  passes.push_back("automatic-locals");
-  passes.push_back("flatten");
-  passes.push_back("always-inline");
-  passes.push_back("globaldce");
-  if (!SPMDDevice) {
-    passes.push_back("simplifycfg");
-    passes.push_back("loop-simplify");
-    passes.push_back("uniformity");
-    passes.push_back("phistoallocas");
-    passes.push_back("isolate-regions");
-    passes.push_back("implicit-loop-barriers");
-    passes.push_back("implicit-cond-barriers");
-    passes.push_back("loop-barriers");
-    passes.push_back("barriertails");
-    passes.push_back("barriers");
-    passes.push_back("isolate-regions");
-    passes.push_back("wi-aa");
-    passes.push_back("workitemrepl");
-    //passes.push_back("print-module");
-    passes.push_back("workitemloops");
-    // Remove the (pseudo) barriers.   They have no use anymore due to the
-    // work-item loop control taking care of them.
-    passes.push_back("remove-barriers");
-  }
-  // Add the work group launcher functions and privatize the pseudo variable
-  // (local id) accesses.
-  passes.push_back("workgroup");
-
-  // Attempt to move all allocas to the entry block to avoid the need for
-  // dynamic stack which is problematic for some architectures.
-  passes.push_back("allocastoentry");
-
-#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
-  // Convert the semantical OpenCL address space IDs to the ones of the target.
-  passes.push_back("target-address-spaces");
-#endif
-
-  // Later passes might get confused (and expose possible bugs in them) due to
-  // UNREACHABLE blocks left by repl. So let's clean up the CFG before running
-  // the standard LLVM optimizations.
-  passes.push_back("simplifycfg");
-
-#if 0
-  passes.push_back("print-module");
-  passes.push_back("dot-cfg");
-#endif
-
-  const std::string wg_method =
-    pocl_get_string_option("POCL_WORK_GROUP_METHOD", "loopvec");
-
-  if (kernel_compiler_passes.size() == 0) {
-    // Set the options only once. TODO: fix it so that each
-    // device can reset their own options. Now one cannot compile
-    // with different options to different devices at one run.
-
-    llvm::cl::Option *O = nullptr;
-    if (wg_method == "loopvec") {
-
-      passes.push_back("scalarizer");
-
-      O = opts["scalarize-load-store"];
-      assert(O && "could not find LLVM option 'scalarize-load-store'");
-      O->addOccurrence(1, StringRef("scalarize-load-store"),
-                       StringRef("1"), false);
-
-      // LLVM inner loop vectorizer does not check whether the loop inside
-      // another loop, in which case even a small trip count loops might be
-      // worthwhile to vectorize.
-      O = opts["vectorizer-min-trip-count"];
-      assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
-      O->addOccurrence(1, StringRef("vectorizer-min-trip-count"),
-                       StringRef("2"), false);
-
-      if (pocl_get_bool_option("POCL_VECTORIZER_REMARKS", 0) == 1) {
-        // Enable diagnostics from the loop vectorizer.
-        O = opts["pass-remarks-missed"];
-        assert(O && "could not find LLVM option 'pass-remarks-missed'");
-        O->addOccurrence(1, StringRef("pass-remarks-missed"),
-                         StringRef("loop-vectorize"), false);
-
-        O = opts["pass-remarks-analysis"];
-        assert(O && "could not find LLVM option 'pass-remarks-analysis'");
-        O->addOccurrence(1, StringRef("pass-remarks-analysis"),
-                         StringRef("loop-vectorize"), false);
-
-        O = opts["pass-remarks"];
-        assert(O && "could not find LLVM option 'pass-remarks'");
-        O->addOccurrence(1, StringRef("pass-remarks"),
-                         StringRef("loop-vectorize"), false);
-      }
-
-    }
-    if (pocl_get_bool_option("POCL_DEBUG_LLVM_PASSES", 0) == 1) {
-      O = opts["debug"];
-      assert(O && "could not find LLVM option 'debug'");
-      O->addOccurrence(1, StringRef("debug"), StringRef("true"), false);
-    }
-
-    O = opts["unroll-threshold"];
-    assert(O && "could not find LLVM option 'unroll-threshold'");
-    O->addOccurrence(1, StringRef("unroll-threshold"), StringRef("1"), false);
-  }
-
-  passes.push_back("instcombine");
-  passes.push_back("STANDARD_OPTS");
-  passes.push_back("instcombine");
-
-  // Now actually add the listed passes to the PassManager.
-  for(unsigned i = 0; i < passes.size(); ++i) {
-      // This is (more or less) -O3.
-      if (passes[i] == "STANDARD_OPTS")
-        {
-          PassManagerBuilder Builder;
-          Builder.OptLevel = 3;
-          Builder.SizeLevel = 0;
-
-          // These need to be setup in addition to invoking the passes
-          // to get the vectorizers initialized properly.
-          if (wg_method == "loopvec") {
-            Builder.LoopVectorize = true;
-            Builder.SLPVectorize = true;
-#ifdef LLVM_OLDER_THAN_3_7
-            Builder.BBVectorize = pocl_get_bool_option ("POCL_BBVECTORIZE", 1);
-#else
-            // In LLVM 3.7 the BB vectorizer crashes with some of the
-            // the shuffle tests, but gives performance improvements in
-            // some (see https://github.com/pocl/pocl/issues/251).
-            // Disable by default because of
-            // https://llvm.org/bugs/show_bug.cgi?id=25077
-            Builder.BBVectorize = pocl_get_bool_option ("POCL_BBVECTORIZE", 0);
-#endif
-          }
-          Builder.populateModulePassManager(*Passes);
-          continue;
-        }
-
-      const PassInfo *PIs = Registry.getPassInfo(StringRef(passes[i]));
-      if(PIs)
-        {
-          //std::cout << "-"<<passes[i] << " ";
-          Pass *thispass = PIs->createPass();
-          Passes->add(thispass);
-        }
-      else
-        {
-          std::cerr << "Failed to create kernel compiler pass " << passes[i] << std::endl;
-          POCL_ABORT("FAIL");
-        }
-    }
-
-
-  kernel_compiler_passes[device] = Passes;
-  return *Passes;
-}
-
-// Defined in llvmopencl/WorkitemHandler.cc
-namespace pocl {
-    extern size_t WGLocalSizeX;
-    extern size_t WGLocalSizeY;
-    extern size_t WGLocalSizeZ;
-    extern bool WGDynamicLocalSize;
-} 
-
-/**
- * Return the OpenCL C built-in function library bitcode
- * for the given device.
- */
-static llvm::Module*
-kernel_library
-(cl_device_id device)
-{
-  llvm::MutexGuard lockHolder(kernelCompilerLock);
-  InitializeLLVM();
-
-  static std::map<cl_device_id, llvm::Module*> libs;
-
-  Triple triple(device->llvm_target_triplet);
-
-  if (libs.find(device) != libs.end())
-    return libs[device];
-
-  const char *subdir = "host";
-  bool is_host = true;
-#ifdef TCE_AVAILABLE
-  if (triple.getArch() == Triple::tce) {
-    subdir = "tce";
-    is_host = false;
-  }
-#endif
-#ifdef BUILD_HSA
-  if (triple.getArch() == Triple::hsail64) {
-    subdir = "hsail64";
-    is_host = false;
-  }
-#endif
-#ifdef AMDGCN_ENABLED
-  if (triple.getArch == Triple::amdgcn) {
-    subdir = "amdgcn";
-    is_host = false;
-  }
-#endif
-
-  // TODO sync with Nat Ferrus' indexed linking
-  std::string kernellib;
-  std::string kernellib_fallback;
-  if (pocl_get_bool_option("POCL_BUILDING", 0)) {
-    kernellib = BUILDDIR;
-    kernellib += "/lib/kernel/";
-    kernellib += subdir;
-    // TODO: get this from the TCE target triplet
-    kernellib += "/kernel-";
-    kernellib += device->llvm_target_triplet;
-    if (is_host) {
-#ifdef POCL_BUILT_WITH_CMAKE
-    kernellib += '-';
-    kernellib_fallback = kernellib;
-    kernellib_fallback += OCL_KERNEL_TARGET_CPU;
-    kernellib_fallback += ".bc";
-#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
-    if (triple.getArch() == Triple::x86_64 ||
-        triple.getArch() == Triple::x86)
-      kernellib += getX86KernelLibName();
-    else
-#endif
-      kernellib += device->llvm_cpu;
-#endif
-    }
-  } else { // POCL_BUILDING == 0, use install dir
-    kernellib = PKGDATADIR;
-    kernellib += "/kernel-";
-    kernellib += device->llvm_target_triplet;
-    if (is_host) {
-#ifdef POCL_BUILT_WITH_CMAKE
-    kernellib += '-';
-    kernellib_fallback = kernellib;
-    kernellib_fallback += OCL_KERNEL_TARGET_CPU;
-    kernellib_fallback += ".bc";
-#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
-    if (triple.getArch() == Triple::x86_64 ||
-        triple.getArch() == Triple::x86)
-      kernellib += getX86KernelLibName();
-    else
-#endif
-      kernellib += device->llvm_cpu;
-#endif
-    }
-  }
-  kernellib += ".bc";
-
-  llvm::Module *lib;
-  SMDiagnostic Err;
-
-  if (pocl_exists(kernellib.c_str()))
-    {
-      POCL_MSG_PRINT_INFO("Using %s as the built-in lib.\n", kernellib.c_str());
-      lib = ParseIRFile(kernellib.c_str(), Err, *GlobalContext());
-    }
-  else
-    {
-      if (is_host && pocl_exists(kernellib_fallback.c_str()))
-        {
-          POCL_MSG_WARN("Using fallback %s as the built-in lib.\n",
-                        kernellib_fallback.c_str());
-          lib = ParseIRFile(kernellib_fallback.c_str(), Err, *GlobalContext());
-        }
-      else
-        POCL_ABORT("Kernel library file %s doesn't exist.", kernellib.c_str());
-    }
-  assert (lib != NULL);
-  libs[device] = lib;
-
-  return lib;
-}
-
-/* This is used to control the kernel we want to process in the kernel compilation. */
-extern cl::opt<std::string> KernelName;
-
-int pocl_llvm_generate_workgroup_function(char* kernel_cachedir, cl_device_id device,
-                                          cl_kernel kernel, size_t local_x,
-                                          size_t local_y, size_t local_z) {
-
-  pocl::WGDynamicLocalSize = (local_x == 0 && local_y == 0 && local_z == 0);
-
-  currentPoclDevice = device;
-
-  cl_program program = kernel->program;
-  int device_i = pocl_cl_device_to_index(program, device);
-  assert(device_i >= 0);
-
-  char parallel_bc_path[POCL_FILENAME_LENGTH];
-  pocl_cache_work_group_function_path(parallel_bc_path, program, device_i, kernel, local_x, local_y, local_z);
-
-  if (pocl_exists(parallel_bc_path))
-    return CL_SUCCESS;
-
-  char final_binary_path[POCL_FILENAME_LENGTH];
-  pocl_cache_final_binary_path(final_binary_path, program, device_i, kernel, local_x, local_y, local_z);
-
-  if (pocl_exists(final_binary_path))
-    return CL_SUCCESS;
-
-  pocl_mkdir_p(kernel_cachedir);
-
-  llvm::MutexGuard lockHolder(kernelCompilerLock);
-  InitializeLLVM();
-
-#ifdef DEBUG_POCL_LLVM_API
-  printf("### calling the kernel compiler for kernel %s local_x %zu "
-         "local_y %zu local_z %zu parallel_filename: %s\n",
-         kernel->name, local_x, local_y, local_z, parallel_bc_path);
-#endif
-
-  Triple triple(device->llvm_target_triplet);
-
-  SMDiagnostic Err;
-  std::string errmsg;
-
-  // Link the kernel and runtime library
-  llvm::Module *input = NULL;
-  if (kernel->program->llvm_irs != NULL &&
-      kernel->program->llvm_irs[device_i] != NULL)
-    {
-#ifdef DEBUG_POCL_LLVM_API
-      printf("### cloning the preloaded LLVM IR\n");
-#endif
-      llvm::Module* p = (llvm::Module*)kernel->program->llvm_irs[device_i];
-#ifdef LLVM_OLDER_THAN_3_8
-      input = llvm::CloneModule(p);
-#else
-      input = (llvm::CloneModule(p)).release();
-#endif
-    }
-  else
-    {
-#ifdef DEBUG_POCL_LLVM_API
-      printf("### loading the kernel bitcode from disk\n");
-#endif
-      char program_bc_path[POCL_FILENAME_LENGTH];
-      pocl_cache_program_bc_path(program_bc_path, program, device_i);
-      input = ParseIRFile(program_bc_path, Err, *GlobalContext());
-    }
-
-  /* Note this is a hack to get SPIR working. We'll be linking the
-   * host kernel library (plain LLVM IR) to the SPIR program.bc,
-   * so LLVM complains about incompatible DataLayouts. The proper solution
-   * would be to generate a SPIR kernel library
-   */
-  if (triple.getArch() == Triple::x86 || triple.getArch() == Triple::x86_64) {
-      if (input->getTargetTriple().substr(0, 6) == std::string("spir64")) {
-          input->setTargetTriple(triple.getTriple());
-          input->setDataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
-      } else if (input->getTargetTriple().substr(0, 4) == std::string("spir")) {
-          input->setTargetTriple(triple.getTriple());
-          input->setDataLayout("e-m:e-p:32:32-i64:64-f80:32-n8:16:32-S32");
-      }
-  }
-
-  // Later this should be replaced with indexed linking of source code
-  // and/or bitcode for each kernel.
-  llvm::Module *libmodule = kernel_library(device);
-  assert (libmodule != NULL);
-  link(input, libmodule);
-
-  /* Now finally run the set of passes assembled above */
-  // TODO pass these as parameters instead, this is not thread safe!
-  pocl::WGLocalSizeX = local_x;
-  pocl::WGLocalSizeY = local_y;
-  pocl::WGLocalSizeZ = local_z;
-  KernelName = kernel->name;
-
-#ifdef LLVM_OLDER_THAN_3_7
-  kernel_compiler_passes(
-      device,
-      input->getDataLayout()->getStringRepresentation()).run(*input);
-#else
-  kernel_compiler_passes(
-      device,
-      input->getDataLayout().getStringRepresentation())
-      .run(*input);
-#endif
-  // TODO: don't write this once LLC is called via API, not system()
-  pocl_cache_write_kernel_parallel_bc(input, program, device_i, kernel,
-                                  local_x, local_y, local_z);
-
-  delete input;
-  return 0;
-}
-
-int
-pocl_update_program_llvm_irs(cl_program program,
-                             unsigned device_i,
-                             cl_device_id device)
-{
-  SMDiagnostic Err;
-  char program_bc_path[POCL_FILENAME_LENGTH];
-  llvm::MutexGuard lockHolder(kernelCompilerLock);
-  pocl_cache_program_bc_path(program_bc_path, program, device_i);
-
-  if (!pocl_exists(program_bc_path))
-    return -1;
-
-  program->llvm_irs[device_i] =
-              ParseIRFile(program_bc_path, Err, *GlobalContext());
-  return 0;
-}
-
-void pocl_free_llvm_irs(cl_program program, int device_i)
-{
-    if (program->llvm_irs[device_i]) {
-        llvm::Module *mod = (llvm::Module *)program->llvm_irs[device_i];
-        delete mod;
-        program->llvm_irs[device_i] = NULL;
-    }
-}
-
-void pocl_llvm_update_binaries (cl_program program) {
-
-  llvm::MutexGuard lockHolder(kernelCompilerLock);
-  InitializeLLVM();
-  char program_bc_path[POCL_FILENAME_LENGTH];
-  void* cache_lock = NULL;
-
-  // Dump the LLVM IR Modules to memory buffers. 
-  assert (program->llvm_irs != NULL);
-#ifdef DEBUG_POCL_LLVM_API        
-  printf("### refreshing the binaries of the program %p\n", program);
-#endif
-
-   for (size_t i = 0; i < program->num_devices; ++i)
-    {
-      assert (program->llvm_irs[i] != NULL);
-      if (program->binaries[i])
-          continue;
-
-      cache_lock = pocl_cache_acquire_writer_lock_i(program, i);
-
-      pocl_cache_program_bc_path(program_bc_path, program, i);
-      pocl_write_module((llvm::Module*)program->llvm_irs[i], program_bc_path, 1);
-
-      std::string content;
-      llvm::raw_string_ostream sos(content);
-      WriteBitcodeToFile((llvm::Module*)program->llvm_irs[i], sos);
-      sos.str(); // flush
-
-      size_t n = content.size();
-      if (n < program->binary_sizes[i])
-        POCL_ABORT("binary size doesn't match the expected value");
-      if (program->binaries[i])
-          POCL_MEM_FREE(program->binaries[i]);
-      program->binaries[i] = (unsigned char *) malloc(n);
-      std::memcpy(program->binaries[i], content.c_str(), n);
-
-      pocl_cache_release_lock(cache_lock);
-#ifdef DEBUG_POCL_LLVM_API        
-      printf("### binary for device %zi was of size %zu\n", i, program->binary_sizes[i]);
-#endif
-
-    }
-}
-
-/* This is the implementation of the public pocl_llvm_get_kernel_count(),
- * and is used internally also by pocl_llvm_get_kernel_names to
- */
-static unsigned
-pocl_llvm_get_kernel_count(cl_program program, char **knames,
-                           unsigned max_num_krn)
-{
-  llvm::MutexGuard lockHolder(kernelCompilerLock);
-  InitializeLLVM();
-
-  // TODO: is it safe to assume every device (i.e. the index 0 here)
-  // has the same set of programs & kernels?
-  llvm::Module *mod = (llvm::Module *) program->llvm_irs[0];
-
-  llvm::NamedMDNode *md = mod->getNamedMetadata("opencl.kernels");
-  if (md) {
-
-    if (knames) {
-      for (unsigned i=0; i<max_num_krn; i++) {
-        assert( md->getOperand(i)->getOperand(0) != NULL);
-        llvm::ValueAsMetadata *value =
-          dyn_cast<llvm::ValueAsMetadata>(md->getOperand(i)->getOperand(0));
-        llvm::Function *k = cast<Function>(value->getValue());
-        knames[i] = strdup(k->getName().data());
-      }
-    }
-    return md->getNumOperands();
-  }
-  // LLVM 3.9 does not use opencl.kernels meta, but kernel_arg_* function meta
-  else {
-    unsigned kernel_count = 0;
-    for (llvm::Module::iterator i = mod->begin(), e = mod->end();
-           i != e; ++i) {
-      if (i->getMetadata("kernel_arg_access_qual")) {
-        if (knames && kernel_count < max_num_krn) {
-          knames[kernel_count] = strdup(i->getName().str().c_str());
-        }
-        ++kernel_count;
-      }
-    }
-    return kernel_count;
-  }
-}
-
-unsigned
-pocl_llvm_get_kernel_count(cl_program program)
-{
-  return pocl_llvm_get_kernel_count(program, NULL, 0);
-}
-
-unsigned
-pocl_llvm_get_kernel_names(cl_program program, char **knames,
-                           unsigned max_num_krn)
-{
-  unsigned n = pocl_llvm_get_kernel_count(program, knames, max_num_krn);
-
-  return n;
-}
-
-/* Run LLVM codegen on input file (parallel-optimized).
- *
- * Output native object file. */
-int
-pocl_llvm_codegen(cl_kernel kernel,
-                  cl_device_id device,
-                  const char *infilename,
-                  const char *outfilename)
-{
-    llvm::MutexGuard lockHolder(kernelCompilerLock);
-
-    SMDiagnostic Err;
-
-    if (pocl_exists(outfilename))
-      return 0;
-
-    llvm::Triple triple(device->llvm_target_triplet);
-    llvm::TargetMachine *target = GetTargetMachine(device);
-
-    llvm::Module *input = ParseIRFile(infilename, Err, *GlobalContext());
-    assert(input);
-
-    PassManager PM;
-#ifdef LLVM_OLDER_THAN_3_7
-    llvm::TargetLibraryInfo *TLI = new TargetLibraryInfo(triple);
-    PM.add(TLI);
-#else
-    llvm::TargetLibraryInfoWrapperPass *TLIPass = new TargetLibraryInfoWrapperPass(triple);
-    PM.add(TLIPass);
-#endif
-#ifdef LLVM_OLDER_THAN_3_7
-    if (target != NULL) {
-      target->addAnalysisPasses(PM);
-    }
-#endif
-
-    // TODO: get DataLayout from the 'device'
-    // TODO: better error check
-#ifdef LLVM_OLDER_THAN_3_7
-    std::string data;
-    llvm::raw_string_ostream sos(data);
-    llvm::MCContext *mcc;
-    if (target && target->addPassesToEmitMC(PM, mcc, sos))
-      return 1;
-#else
-    SmallVector<char, 4096> data;
-    llvm::raw_svector_ostream sos(data);
-    if (target && target->addPassesToEmitFile(
-        PM, sos, TargetMachine::CGFT_ObjectFile))
-      return 1;
-#endif
-
-    PM.run(*input);
-    std::string o = sos.str(); // flush
-    POCL_MSG_PRINT_INFO("Writing code gen output to %s.\n", outfilename);
-
-    return pocl_write_file(outfilename, o.c_str(), o.size(), 0, 0);
-}
-/* vim: set ts=4 expandtab: */
diff --git a/lib/CL/pocl_llvm_api.h b/lib/CL/pocl_llvm_api.h
new file mode 100644
index 0000000..5ab697c
--- /dev/null
+++ b/lib/CL/pocl_llvm_api.h
@@ -0,0 +1,84 @@
+/* pocl_llvm_api.cc: internally used header for pocl's LLVM API sources.
+
+   Copyright (c) 2013 Kalle Raiskila
+                 2013-2017 Pekka Jääskeläinen
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "pocl_llvm.h"
+
+#include <llvm/IR/Module.h>
+#include <map>
+#include <string>
+
+#ifdef __GNUC__
+#pragma GCC visibility push(hidden)
+#endif
+
+/* The LLVM API interface functions are not thread safe at the moment;
+ * Pocl needs to ensure only one thread is using this layer at the time.
+ *
+ * Pocl used a llvm::sys::Mutex class variable before, unfortunately,
+ * using llvm::sys::Mutex is not safe. Reason:
+ *
+ * if pocl is dlopened from a C++ program, pocl's C++ object destructors
+ * are called before the program's dtors. This causes the Mutex to be destroyed,
+ * and if the program's dtors call clReleaseProgram()
+ * -> pocl_free_llvm_irs() -> llvm::PoclMutexGuard guard_variable(Mutex)
+ * ... the program will freeze/segfault.
+ *
+ * This happens with many ViennaCL examples.
+ *
+ * This class is a replacement that uses a simple pthread lock
+ */
+
+class PoclCompilerMutexGuard {
+  PoclCompilerMutexGuard(const PoclCompilerMutexGuard &) = delete;
+  void operator=(const PoclCompilerMutexGuard &) = delete;
+
+public:
+  // an unused argument is required, otherwise compiler optimizes out the object
+  PoclCompilerMutexGuard(void *unused);
+  ~PoclCompilerMutexGuard();
+};
+
+
+typedef struct _cl_device_id *cl_device_id;
+extern cl_device_id currentPoclDevice;
+
+void InitializeLLVM();
+llvm::LLVMContext &GlobalContext();
+extern long numberOfIRs;
+
+llvm::Module *parseModuleIR(const char *path);
+void writeModuleIR(const llvm::Module *mod, std::string &str);
+llvm::Module *parseModuleIRMem(const char *input_stream, size_t size);
+int getModuleTriple(const char *input_stream, size_t size, std::string &triple);
+std::string getDiagString();
+
+void clearKernelPasses();
+void clearTargetMachines();
+void cleanKernelLibrary();
+
+extern std::string currentWgMethod;
+
+#ifdef __GNUC__
+#pragma GCC visibility pop
+#endif
diff --git a/lib/CL/pocl_llvm_build.cc b/lib/CL/pocl_llvm_build.cc
new file mode 100644
index 0000000..0d67d22
--- /dev/null
+++ b/lib/CL/pocl_llvm_build.cc
@@ -0,0 +1,880 @@
+/* pocl_llvm_build.cc: part of pocl's LLVM API which deals with
+   producing program.bc
+
+   Copyright (c) 2013 Kalle Raiskila
+                 2013-2017 Pekka Jääskeläinen
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "CompilerWarnings.h"
+IGNORE_COMPILER_WARNING("-Wunused-parameter")
+IGNORE_COMPILER_WARNING("-Wstrict-aliasing")
+
+#include "config.h"
+
+#include "clang/CodeGen/CodeGenAction.h"
+#include "clang/Frontend/CompilerInstance.h"
+#include "clang/Frontend/CompilerInvocation.h"
+#include "clang/Frontend/FrontendActions.h"
+#include "clang/Frontend/TextDiagnosticBuffer.h"
+
+#ifndef LLVM_OLDER_THAN_4_0
+#include "clang/Lex/PreprocessorOptions.h"
+#endif
+
+#include "llvm/LinkAllPasses.h"
+#include "llvm/Linker/Linker.h"
+
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+
+#include "llvm/Support/MutexGuard.h"
+
+#include <iostream>
+#include <sstream>
+
+// For some reason including pocl.h before including CodeGenAction.h
+// causes an error. Some kind of macro definition issue. To investigate.
+#include "pocl.h"
+// Note - LLVM/Clang uses symbols defined in Khronos' headers in macros,
+// causing compilation error if they are included before the LLVM headers.
+#include "pocl_llvm_api.h"
+#include "pocl_runtime_config.h"
+#include "linker.h"
+#include "pocl_file_util.h"
+#include "pocl_cache.h"
+#include "LLVMUtils.h"
+
+using namespace clang;
+using namespace llvm;
+
+POP_COMPILER_DIAGS
+
+
+/* Global pocl device to be used by passes if needed */
+cl_device_id currentPoclDevice = NULL;
+
+
+//#define DEBUG_POCL_LLVM_API
+
+#if defined(DEBUG_POCL_LLVM_API) && defined(NDEBUG)
+#undef NDEBUG
+#include <cassert>
+#endif
+
+
+// Read input source to clang::FrontendOptions.
+// The source is contained in the program->source array,
+// but if debugging option is enabled in the kernel compiler
+// we need to dump the file to disk first for the debugger
+// to find it.
+static inline int
+load_source(FrontendOptions &fe,
+            cl_program program)
+{
+  char source_file[POCL_FILENAME_LENGTH];
+  POCL_RETURN_ERROR_ON(pocl_cache_write_program_source(source_file, program),
+                       CL_OUT_OF_HOST_MEMORY, "Could not write program source");
+
+  fe.Inputs.push_back
+#if LLVM_OLDER_THAN_5_0
+      (FrontendInputFile(source_file, clang::IK_OpenCL));
+#else
+      (FrontendInputFile(source_file, clang::InputKind::OpenCL));
+#endif
+
+  return 0;
+}
+
+// Unlink input sources
+static inline int
+unlink_source(FrontendOptions &fe)
+{
+  // don't unlink in debug mode
+  if (pocl_get_bool_option("POCL_DEBUG", 0))
+    return 0;
+
+  FrontendInputFile const& file = fe.Inputs.front();
+  if (file.isFile() && !file.isSystem()) {
+    return pocl_remove(file.getFile().str().c_str());
+  } else {
+    return 0; // nothing to do
+  }
+
+}
+
+static void appendToProgramBuildLog(cl_program program, unsigned device_i,
+                                    std::string &s) {
+  if (!s.empty()) {
+    POCL_MSG_ERR(s.c_str());
+    /* this may not actually write anything if the buildhash is invalid,
+     * but program->build_log still gets written.  */
+    pocl_cache_append_to_buildlog(program, device_i, s.c_str(), s.size());
+    if (program->build_log[device_i]) {
+      size_t len = strlen(program->build_log[device_i]);
+      size_t len2 = strlen(s.c_str());
+      char *newlog = (char *)malloc(len + len2 + 1);
+      memcpy(newlog, program->build_log[device_i], len);
+      memcpy(newlog + len, s.c_str(), len2);
+      newlog[len + len2] = 0;
+      POCL_MEM_FREE(program->build_log[device_i]);
+      program->build_log[device_i] = newlog;
+    } else
+      program->build_log[device_i] = strdup(s.c_str());
+  }
+}
+
+static void get_build_log(cl_program program,
+                         unsigned device_i,
+                         std::stringstream &ss_build_log,
+                         clang::TextDiagnosticBuffer *diagsBuffer,
+                         const SourceManager &sm)
+{
+    for (TextDiagnosticBuffer::const_iterator i = diagsBuffer->err_begin(),
+         e = diagsBuffer->err_end(); i != e; ++i)
+      {
+        ss_build_log << "error: " << i->first.printToString(sm)
+                     << ": " << i->second << std::endl;
+      }
+    for (TextDiagnosticBuffer::const_iterator i = diagsBuffer->warn_begin(),
+         e = diagsBuffer->warn_end(); i != e; ++i)
+      {
+        ss_build_log << "warning: " << i->first.printToString(sm)
+                     << ": " << i->second << std::endl;
+      }
+
+    std::string log = ss_build_log.str();
+    appendToProgramBuildLog(program, device_i, log);
+}
+
+static llvm::Module *kernel_library(cl_device_id device);
+
+int pocl_llvm_build_program(cl_program program,
+                            unsigned device_i,
+                            const char *user_options_cstr,
+                            char *program_bc_path,
+                            cl_uint num_input_headers,
+                            const cl_program *input_headers,
+                            const char **header_include_names,
+                            int linking_program)
+
+{
+  void* write_lock = NULL;
+  char tempfile[POCL_FILENAME_LENGTH];
+  tempfile[0] = 0;
+  llvm::Module **mod = NULL;
+  char temp_include_dir[POCL_FILENAME_LENGTH];
+  std::string user_options(user_options_cstr ? user_options_cstr : "");
+  size_t n = 0;
+  int error;
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  if (num_input_headers > 0) {
+    error = pocl_cache_create_tempdir(temp_include_dir);
+    if(error)
+      {
+        POCL_MSG_ERR ("pocl_cache_create_tempdir (%s)"
+                      " failed with %i\n", temp_include_dir, error);
+        return error;
+      }
+    std::string tempdir(temp_include_dir);
+
+    for (n = 0; n < num_input_headers; n++) {
+      char *input_header = input_headers[n]->source;
+      size_t input_header_size = strlen(input_header);
+      const char *header_name = header_include_names[n];
+      std::string header(header_name);
+      /* TODO this path stuff should be in utils */
+      std::string path(tempdir);
+      path.append("/");
+      path.append(header_name);
+      size_t last_slash = header.rfind('/');
+      if (last_slash != std::string::npos) {
+        std::string dir(path, 0, (tempdir.size() + 1 + last_slash));
+        pocl_mkdir_p(dir.c_str());
+      }
+      pocl_write_file(path.c_str(), input_header, input_header_size, 0, 1);
+    }
+  }
+  // Use CompilerInvocation::CreateFromArgs to initialize
+  // CompilerInvocation. This way we can reuse the Clang's
+  // command line parsing.
+  llvm::IntrusiveRefCntPtr<clang::DiagnosticIDs> diagID =
+    new clang::DiagnosticIDs();
+  llvm::IntrusiveRefCntPtr<clang::DiagnosticOptions> diagOpts =
+    new clang::DiagnosticOptions();
+  clang::TextDiagnosticBuffer *diagsBuffer =
+    new clang::TextDiagnosticBuffer();
+
+  clang::DiagnosticsEngine diags(diagID, &*diagOpts, diagsBuffer);
+
+  CompilerInstance CI;
+  CompilerInvocation &pocl_build = CI.getInvocation();
+
+  std::stringstream ss;
+  std::stringstream ss_build_log;
+
+  // add device specific switches, if any
+  // TODO this currently passes NULL as device tmpdir
+  cl_device_id device = program->devices[device_i];
+  if (device->ops->init_build != NULL)
+    {
+      char *device_switches =
+        device->ops->init_build (device->data);
+      if (device_switches != NULL)
+        {
+          ss << device_switches << " ";
+        }
+      POCL_MEM_FREE(device_switches);
+    }
+
+  llvm::StringRef extensions(device->extensions);
+
+  std::string cl_ext;
+  if (extensions.size() > 0) {
+    size_t e_start = 0, e_end = 0;
+    while (e_end < std::string::npos) {
+      e_end = extensions.find(' ', e_start);
+      llvm::StringRef tok = extensions.slice(e_start, e_end);
+      e_start = e_end + 1;
+      ss << "-D" << tok.str() << " ";
+#ifndef LLVM_OLDER_THAN_4_0
+      cl_ext += "+";
+      cl_ext += tok.str();
+      cl_ext += ",";
+#endif
+    }
+  }
+#ifndef LLVM_OLDER_THAN_4_0
+  if (!cl_ext.empty()) {
+    cl_ext.back() = ' '; // replace last "," with space
+    ss << "-cl-ext=-all," << cl_ext;
+  }
+#endif
+  /* temp dir takes preference */
+  if (num_input_headers > 0)
+    ss << "-I" << temp_include_dir << " ";
+
+  if (device->has_64bit_long)
+    ss << "-Dcl_khr_int64 ";
+  // This can cause illegal optimizations when unaware
+  // of the barrier semantics. -O2 is the default opt level in
+  // Clang for OpenCL C and seems to affect the performance
+  // of the end result, even if we optimize the final WG
+  // func. TODO: There should be 'noduplicate' etc. flags in
+  // the 'barrier' function to prevent them.
+  // ss << "-O2 ";
+
+  ss << "-x cl ";
+  // Remove the inline keywords to force the user functions
+  // to be included in the program. Otherwise they will
+  // be removed and not inlined due to -O0.
+  ss << "-Dinline= ";
+  // The current directory is a standard search path.
+  ss << "-I. ";
+  // required for clGetKernelArgInfo()
+  ss << "-cl-kernel-arg-info ";
+
+  ss << user_options << " ";
+
+  if (device->endian_little)
+    ss << "-D__ENDIAN_LITTLE__=1 ";
+
+  if (device->image_support)
+    ss << "-D__IMAGE_SUPPORT__=1 ";
+
+  ss << "-DCL_DEVICE_MAX_GLOBAL_VARIABLE_SIZE=" << device->global_var_max_size << " ";
+
+  if (user_options.find("cl-fast-relaxed-math") != std::string::npos)
+    ss << "-D__FAST_RELAXED_MATH__=1 ";
+
+  ss << "-D__OPENCL_VERSION__=" << device->cl_version_int << " ";
+
+  if (user_options.find("-cl-std=") == std::string::npos)
+    ss << "-cl-std=" << device->cl_version_std << " ";
+
+  std::string temp(ss.str());
+  size_t pos = temp.find("-cl-std=CL");
+  pos += 10;
+  int cl_std_major = temp.c_str()[pos] - '0';
+  int cl_std_minor = temp.c_str()[pos+2] - '0';
+  int cl_std_i = cl_std_major * 100 + cl_std_minor * 10;
+  ss << "-D__OPENCL_C_VERSION__=" << cl_std_i << " ";
+
+  ss << "-fno-builtin ";
+  /* with fp-contract=on we get calls to fma with processors which do not
+   * have fma instructions. These ruin the performance.
+   *
+   * TODO find out which processors. Seems to be at least TCE
+   *
+   * default fp-contract is "on" which means "enable if enabled by a pragma".
+   */
+  llvm::Triple triple (device->llvm_target_triplet);
+  if (triple.getArch () == Triple::tce)
+    ss << "-ffp-contract=off ";
+
+  // This is required otherwise the initialization fails with
+  // unknown triple ''
+  ss << "-triple=" << device->llvm_target_triplet << " ";
+  if (device->llvm_cpu != NULL)
+    ss << "-target-cpu " << device->llvm_cpu << " ";
+
+  POCL_MSG_PRINT_LLVM("all build options: %s\n", ss.str().c_str());
+
+  std::istream_iterator<std::string> begin(ss);
+  std::istream_iterator<std::string> end;
+  std::istream_iterator<std::string> i = begin;
+  std::vector<const char*> itemcstrs;
+  std::vector<std::string> itemstrs;
+  while (i != end) {
+    itemstrs.push_back(*i);
+    ++i;
+  }
+
+  for (unsigned idx = 0; idx < itemstrs.size(); idx++) {
+      // note: if itemstrs is modified after this, itemcstrs will be full
+      // of invalid pointers! Could make copies, but would have to clean up then...
+      itemcstrs.push_back(itemstrs[idx].c_str());
+  }
+
+#ifdef DEBUG_POCL_LLVM_API
+  // TODO: for some reason the user_options are replicated,
+  // they appear twice in a row in the output
+  std::cerr << "### options: " << ss.str()
+            << "user_options: " << user_options << std::endl;
+#endif
+
+  if (program->build_log[device_i])
+    POCL_MEM_FREE(program->build_log[device_i]);
+
+  if (!CompilerInvocation::CreateFromArgs
+      (pocl_build, itemcstrs.data(), itemcstrs.data() + itemcstrs.size(),
+       diags)) {
+    pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
+                                       program_bc_path);
+    get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
+    return CL_INVALID_BUILD_OPTIONS;
+  }
+
+  LangOptions *la = pocl_build.getLangOpts();
+  PreprocessorOptions &po = pocl_build.getPreprocessorOpts();
+
+#ifdef LLVM_OLDER_THAN_3_9
+  pocl_build.setLangDefaults
+    (*la, clang::IK_OpenCL, clang::LangStandard::lang_opencl12);
+#else
+  pocl_build.setLangDefaults
+#if LLVM_OLDER_THAN_5_0
+      (*la, clang::IK_OpenCL, triple, po, clang::LangStandard::lang_opencl12);
+#else
+      (*la, clang::InputKind::OpenCL, triple, po,
+       clang::LangStandard::lang_opencl12);
+#endif
+#endif
+
+  // LLVM 3.3 and older do not set that char is signed which is
+  // defined by the OpenCL C specs (but not by C specs).
+  la->CharIsSigned = true;
+
+  // the per-file types don't seem to override this
+  la->OpenCLVersion = cl_std_i;
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+  la->FakeAddressSpaceMap = true;
+#else
+  la->FakeAddressSpaceMap = false;
+#endif
+  la->Blocks = true; //-fblocks
+  la->MathErrno = false; // -fno-math-errno
+  la->NoBuiltin = true;  // -fno-builtin
+  la->AsmBlocks = true;  // -fasm (?)
+  la->setStackProtector(LangOptions::StackProtectorMode::SSPOff);
+  la->PICLevel = 0;
+#ifdef LLVM_OLDER_THAN_3_9
+  la->PIELevel = 0;
+#else
+  la->PIE = 0;
+#endif
+
+  std::string kernelh;
+  std::string BuiltinRenamesH;
+
+#ifdef ENABLE_POCL_BUILDING
+  if (pocl_get_bool_option("POCL_BUILDING", 0)) {
+    kernelh  = SRCDIR;
+#else
+  if (0) {
+#endif
+  } else {
+    kernelh = POCL_INSTALL_PRIVATE_DATADIR;
+  }
+  BuiltinRenamesH = kernelh;
+  kernelh += "/include/_kernel.h";
+  BuiltinRenamesH += "/include/_builtin_renames.h";
+
+  po.Includes.push_back(BuiltinRenamesH);
+#ifndef LLVM_OLDER_THAN_4_0
+  // Use Clang's opencl-c.h header.
+  po.Includes.push_back(CLANG_RESOURCE_DIR "/include/opencl-c.h");
+#endif
+  po.Includes.push_back(kernelh);
+  clang::TargetOptions &ta = pocl_build.getTargetOpts();
+  ta.Triple = device->llvm_target_triplet;
+  if (device->llvm_cpu != NULL)
+    ta.CPU = device->llvm_cpu;
+
+#ifdef DEBUG_POCL_LLVM_API
+  std::cout << "### Triple: " << ta.Triple.c_str() <<  ", CPU: " << ta.CPU.c_str();
+#endif
+  CI.createDiagnostics(diagsBuffer, false);
+
+  FrontendOptions &fe = pocl_build.getFrontendOpts();
+  // The CreateFromArgs created an stdin input which we should remove first.
+  fe.Inputs.clear();
+  if (load_source(fe, program) != 0)
+    return CL_OUT_OF_HOST_MEMORY;
+
+  CodeGenOptions &cg = pocl_build.getCodeGenOpts();
+  cg.EmitOpenCLArgMetadata = true;
+  cg.StackRealignment = true;
+  // Let the vectorizer or another optimization pass unroll the loops,
+  // in case it sees beneficial.
+  cg.UnrollLoops = false;
+  // Lets leave vectorization to later compilation phase
+  cg.VectorizeLoop = false;
+  cg.VectorizeSLP = false;
+  // This workarounds a Frontend codegen issues with an illegal address
+  // space cast which is later flattened (and thus implicitly fixed) in
+  // the TargetAddressSpaces. See:  https://github.com/pocl/pocl/issues/195
+  cg.VerifyModule = false;
+
+  PreprocessorOutputOptions &poo = pocl_build.getPreprocessorOutputOpts();
+  poo.ShowCPP = 1;
+  poo.ShowComments = 0;
+  poo.ShowLineMarkers = 0;
+  poo.ShowMacroComments = 0;
+  poo.ShowMacros = 1;
+  poo.RewriteIncludes = 0;
+
+  pocl_cache_tempname(tempfile, ".cl", NULL);
+  fe.OutputFile.assign(tempfile);
+
+  bool success = true;
+  clang::PrintPreprocessedAction Preprocess;
+  success = CI.ExecuteAction(Preprocess);
+  char *PreprocessedOut = nullptr;
+  uint64_t PreprocessedSize = 0;
+
+  if (success) {
+    pocl_read_file(tempfile, &PreprocessedOut, &PreprocessedSize);
+  }
+  if (pocl_get_bool_option("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES", 0) == 0) {
+    if (num_input_headers > 0)
+      pocl_rm_rf(temp_include_dir);
+    pocl_remove(tempfile);
+  }
+
+  if (PreprocessedOut == nullptr) {
+    pocl_cache_create_program_cachedir(program, device_i, NULL, 0,
+                                       program_bc_path);
+    get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
+    return CL_BUILD_PROGRAM_FAILURE;
+  }
+
+  pocl_cache_create_program_cachedir(program, device_i, PreprocessedOut,
+                                     static_cast<size_t>(PreprocessedSize), program_bc_path);
+
+  POCL_MEM_FREE(PreprocessedOut);
+
+  if (pocl_exists(program_bc_path)) {
+    unlink_source(fe);
+    return CL_SUCCESS;
+  }
+
+  // TODO: use pch: it is possible to disable the strict checking for
+  // the compilation flags used to compile it and the current translation
+  // unit via the preprocessor options directly.
+  llvm::LLVMContext &c = GlobalContext();
+  clang::EmitLLVMOnlyAction EmitLLVM(&c);
+  success = CI.ExecuteAction(EmitLLVM);
+
+  unlink_source(fe);
+
+  get_build_log(program, device_i, ss_build_log, diagsBuffer, CI.getSourceManager());
+
+  if (!success)
+    return CL_BUILD_PROGRAM_FAILURE;
+
+  mod = (llvm::Module **)&program->llvm_irs[device_i];
+  if (*mod != NULL) {
+    delete *mod;
+    --numberOfIRs;
+  }
+
+  *mod = EmitLLVM.takeModule().release();
+
+  if (*mod == NULL)
+    return CL_BUILD_PROGRAM_FAILURE;
+
+  ++numberOfIRs;
+
+  // a workaround for errors with PIC + variables in constant addrspace
+  // fails on the test_convert_type_X tests with this error:
+  //   relocation R_X86_64_PC32 against symbol `char_values' can not be used
+  //   when making a shared object; recompile with -fPIC
+#ifdef LLVM_OLDER_THAN_3_9
+  (*mod)->setPICLevel(PICLevel::Default);
+#else
+  (*mod)->setPICLevel(PICLevel::NotPIC);
+  (*mod)->setPIELevel(PIELevel::Default);
+#endif
+
+
+  // link w kernel lib, but not if we're called from clCompileProgram()
+  // Later this should be replaced with indexed linking of source code
+  // and/or bitcode for each kernel.
+  if (linking_program) {
+    currentPoclDevice = device;
+    llvm::Module *libmodule = kernel_library(device);
+    assert(libmodule != NULL);
+    std::string log("Error(s) while linking: \n");
+    if (link(*mod, libmodule, log)) {
+      appendToProgramBuildLog(program, device_i, log);
+      std::string msg = getDiagString();
+      appendToProgramBuildLog(program, device_i, msg);
+      delete *mod;
+      *mod = nullptr;
+      --numberOfIRs;
+      return CL_BUILD_PROGRAM_FAILURE;
+    }
+  }
+
+  write_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+  assert(write_lock);
+
+  POCL_MSG_PRINT_LLVM("Writing program.bc to %s.\n", program_bc_path);
+
+  /* Always retain program.bc. Its required in clBuildProgram */
+  error = pocl_write_module(*mod, program_bc_path, 0);
+  if(error)
+    return error;
+
+  /* To avoid writing & reading the same back,
+   * save program->binaries[i]
+   */
+  std::string content;
+  writeModuleIR(*mod, content);
+
+  if (program->binaries[device_i])
+    POCL_MEM_FREE(program->binaries[device_i]);
+
+  n = content.size();
+  program->binary_sizes[device_i] = n;
+  program->binaries[device_i] = (unsigned char *) malloc(n);
+  std::memcpy(program->binaries[device_i], content.c_str(), n);
+
+  pocl_cache_release_lock(write_lock);
+
+  return CL_SUCCESS;
+}
+
+
+int pocl_llvm_link_program(cl_program program,
+                           unsigned device_i,
+                           char *program_bc_path,
+                           cl_uint num_input_programs,
+                           unsigned char **cur_device_binaries,
+                           size_t *cur_device_binary_sizes,
+                           void **cur_llvm_irs,
+                           int create_library) {
+
+  void *write_lock;
+  std::string concated_binaries;
+  size_t n = 0, i;
+  cl_device_id device = program->devices[device_i];
+  llvm::Module **modptr = (llvm::Module **)&program->llvm_irs[device_i];
+  int error;
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+#ifdef LLVM_OLDER_THAN_3_8
+  llvm::Module *mod =
+      new llvm::Module(StringRef("linked_program"), GlobalContext());
+#else
+  std::unique_ptr<llvm::Module> mod(
+      new llvm::Module(StringRef("linked_program"), GlobalContext()));
+#endif
+
+  for (i = 0; i < num_input_programs; i++) {
+    assert(cur_device_binaries[i]);
+    assert(cur_device_binary_sizes[i]);
+    concated_binaries.append(std::string((char *)cur_device_binaries[i],
+                                         cur_device_binary_sizes[i]));
+
+    llvm::Module *p = (llvm::Module *)cur_llvm_irs[i];
+    assert(p);
+
+#ifdef LLVM_OLDER_THAN_3_8
+    if (Linker::LinkModules(mod, llvm::CloneModule(p))) {
+      delete mod;
+#else
+    if (Linker::linkModules(*mod, llvm::CloneModule(p))) {
+#endif
+      std::string msg = getDiagString();
+      appendToProgramBuildLog(program, device_i, msg);
+      return CL_LINK_PROGRAM_FAILURE;
+    }
+  }
+
+#ifdef LLVM_OLDER_THAN_3_8
+  llvm::Module *linked_module = mod;
+#else
+  llvm::Module *linked_module = mod.release();
+#endif
+
+  if (linked_module == nullptr)
+    return CL_LINK_PROGRAM_FAILURE;
+
+  if (*modptr != nullptr) {
+    delete *modptr;
+    --numberOfIRs;
+    *modptr = nullptr;
+  }
+
+  if (!create_library) {
+    // linked all the programs together, now link in the kernel library
+    currentPoclDevice = device;
+    llvm::Module *libmodule = kernel_library(device);
+    assert(libmodule != NULL);
+    std::string log("Error(s) while linking: \n");
+    if (link(linked_module, libmodule, log)) {
+      appendToProgramBuildLog(program, device_i, log);
+      std::string msg = getDiagString();
+      appendToProgramBuildLog(program, device_i, msg);
+      delete linked_module;
+      return CL_BUILD_PROGRAM_FAILURE;
+    }
+  }
+
+  *modptr = linked_module;
+  ++numberOfIRs;
+
+  /* TODO currently cached on concated binary contents (in undefined order),
+     this is not terribly useful (but we have to store it somewhere..) */
+  error = pocl_cache_create_program_cachedir(program, device_i,
+                                     concated_binaries.c_str(),
+                                     concated_binaries.size(),
+                                     program_bc_path);
+  if (error)
+    {
+      POCL_MSG_ERR ("pocl_cache_create_program_cachedir(%s)"
+                    " failed with %i\n", program_bc_path, error);
+      return error;
+    }
+
+  write_lock = pocl_cache_acquire_writer_lock_i(program, device_i);
+  assert(write_lock);
+
+  POCL_MSG_PRINT_LLVM("Writing program.bc to %s.\n", program_bc_path);
+
+  /* Always retain program.bc. Its required in clBuildProgram */
+  error = pocl_write_module(linked_module, program_bc_path, 0);
+  if (error)
+    return error;
+
+  /* To avoid writing & reading the same back,
+   * save program->binaries[i]
+   */
+  std::string content;
+  writeModuleIR(linked_module, content);
+
+  if (program->binaries[device_i])
+    POCL_MEM_FREE(program->binaries[device_i]);
+
+  n = content.size();
+  program->binary_sizes[device_i] = n;
+  program->binaries[device_i] = (unsigned char *)malloc(n);
+  std::memcpy(program->binaries[device_i], content.c_str(), n);
+
+  pocl_cache_release_lock(write_lock);
+
+  return CL_SUCCESS;
+}
+
+/* for "distro" style kernel libs, return which kernellib to use, at runtime */
+#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
+const char *getX86KernelLibName() {
+  StringMap<bool> Features;
+  llvm::sys::getHostCPUFeatures(Features);
+  const char *res = NULL;
+
+  if (Features["sse2"])
+    res = "sse2";
+  else
+    POCL_ABORT("Pocl on x86_64 requires at least SSE2");
+  if (Features["ssse3"] && Features["cx16"])
+    res = "ssse3";
+  if (Features["sse4.1"] && Features["cx16"])
+    res = "sse41";
+  if (Features["avx"] && Features["cx16"] && Features["popcnt"])
+    res = "avx";
+  if (Features["avx"] && Features["cx16"] && Features["popcnt"] && Features["f16c"])
+    res = "avx_f16c";
+  if (Features["avx"] && Features["cx16"] && Features["popcnt"]
+      && Features["xop"] && Features["fma4"])
+    res = "avx_fma4";
+  if (Features["avx"] && Features["avx2"] && Features["cx16"]
+      && Features["popcnt"] && Features["lzcnt"] && Features["f16c"]
+      && Features["fma"] && Features["bmi"] && Features["bmi2"])
+    res = "avx2";
+  if (Features["avx512f"] )
+    res = "avx512";
+
+  return res;
+}
+#endif
+
+
+static std::map<cl_device_id, llvm::Module *> kernelLibraryMap;
+
+/**
+ * Return the OpenCL C built-in function library bitcode
+ * for the given device.
+ */
+static llvm::Module*
+kernel_library
+(cl_device_id device)
+{
+  Triple triple(device->llvm_target_triplet);
+
+  if (kernelLibraryMap.find(device) != kernelLibraryMap.end())
+    return kernelLibraryMap[device];
+
+  const char *subdir = "host";
+  bool is_host = true;
+#ifdef TCE_AVAILABLE
+  if (triple.getArch() == Triple::tce) {
+    subdir = "tce";
+    is_host = false;
+  }
+#endif
+#ifdef BUILD_HSA
+  if (triple.getArch() == Triple::hsail64) {
+    subdir = "hsail64";
+    is_host = false;
+  }
+#endif
+#ifdef AMDGCN_ENABLED
+  if (triple.getArch == Triple::amdgcn) {
+    subdir = "amdgcn";
+    is_host = false;
+  }
+#endif
+#ifdef BUILD_CUDA
+  if (triple.getArch() == Triple::nvptx ||
+      triple.getArch() == Triple::nvptx64) {
+    subdir = "cuda";
+    is_host = false;
+  }
+#endif
+
+  // TODO sync with Nat Ferrus' indexed linking
+  std::string kernellib;
+  std::string kernellib_fallback;
+#ifdef ENABLE_POCL_BUILDING
+  if (pocl_get_bool_option("POCL_BUILDING", 0)) {
+    kernellib = BUILDDIR;
+#else
+  if (0) {
+#endif
+    kernellib += "/lib/kernel/";
+    kernellib += subdir;
+    // TODO: get this from the TCE target triplet
+    kernellib += "/kernel-";
+    kernellib += device->llvm_target_triplet;
+    if (is_host) {
+      kernellib += '-';
+      kernellib_fallback = kernellib;
+      kernellib_fallback += OCL_KERNEL_TARGET_CPU;
+      kernellib_fallback += ".bc";
+#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
+      if (triple.getArch() == Triple::x86_64 ||
+          triple.getArch() == Triple::x86)
+        kernellib += getX86KernelLibName();
+      else
+#endif
+        kernellib += device->llvm_cpu;
+    }
+  } else { // POCL_BUILDING == 0, use install dir
+    kernellib = POCL_INSTALL_PRIVATE_DATADIR;
+    kernellib += "/kernel-";
+    kernellib += device->llvm_target_triplet;
+    if (is_host) {
+      kernellib += '-';
+      kernellib_fallback = kernellib;
+      kernellib_fallback += OCL_KERNEL_TARGET_CPU;
+      kernellib_fallback += ".bc";
+#ifdef KERNELLIB_HOST_DISTRO_VARIANTS
+      if (triple.getArch() == Triple::x86_64 ||
+          triple.getArch() == Triple::x86)
+        kernellib += getX86KernelLibName();
+      else
+#endif
+        kernellib += device->llvm_cpu;
+    }
+  }
+  kernellib += ".bc";
+
+  llvm::Module *lib;
+  SMDiagnostic Err;
+
+  if (pocl_exists(kernellib.c_str()))
+    {
+      POCL_MSG_PRINT_LLVM("Using %s as the built-in lib.\n", kernellib.c_str());
+      lib = parseModuleIR(kernellib.c_str());
+    }
+  else
+    {
+      if (is_host && pocl_exists(kernellib_fallback.c_str()))
+        {
+          POCL_MSG_WARN("Using fallback %s as the built-in lib.\n",
+                        kernellib_fallback.c_str());
+          lib = parseModuleIR(kernellib_fallback.c_str());
+        }
+      else
+        POCL_ABORT("Kernel library file %s doesn't exist.", kernellib.c_str());
+    }
+  assert (lib != NULL);
+  kernelLibraryMap[device] = lib;
+
+  return lib;
+}
+
+void cleanKernelLibrary() {
+  for (auto i = kernelLibraryMap.begin(), e = kernelLibraryMap.end();
+       i != e; ++i) {
+    delete (llvm::Module *)i->second;
+  }
+  kernelLibraryMap.clear();
+}
diff --git a/lib/CL/pocl_llvm_metadata.cc b/lib/CL/pocl_llvm_metadata.cc
new file mode 100644
index 0000000..097353b
--- /dev/null
+++ b/lib/CL/pocl_llvm_metadata.cc
@@ -0,0 +1,818 @@
+/* pocl_llvm_metadata.cc: part of pocl LLVM API dealing with kernel metadata.
+
+   Copyright (c) 2013 Kalle Raiskila
+                 2013-2017 Pekka Jääskeläinen
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/MutexGuard.h>
+#include <llvm/IR/Value.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Type.h>
+#include <llvm/IR/Metadata.h>
+#include <llvm/IR/DebugInfoMetadata.h>
+#include <llvm/IR/Function.h>
+#include <llvm/IR/Module.h>
+#include <llvm/IR/DataLayout.h>
+#include <llvm/ADT/SmallVector.h>
+
+#include <string>
+#include <map>
+#include <iostream>
+#include <sstream>
+
+#include "pocl_cl.h"
+#include "pocl_llvm_api.h"
+#include "pocl_cache.h"
+#include "LLVMUtils.h"
+
+using namespace llvm;
+
+// The old way of getting kernel metadata from "opencl.kernels" module meta.
+// LLVM < 3.9 and SPIR
+static int pocl_get_kernel_arg_module_metadata(const char *kernel_name,
+                                               llvm::Module *input,
+                                               cl_kernel kernel) {
+  // find the right kernel in "opencl.kernels" metadata
+  llvm::NamedMDNode *opencl_kernels = input->getNamedMetadata("opencl.kernels");
+  llvm::MDNode *kernel_metadata = NULL;
+
+#ifdef LLVM_OLDER_THAN_3_9
+  assert(opencl_kernels && opencl_kernels->getNumOperands());
+#else
+  if (!(opencl_kernels && opencl_kernels->getNumOperands()))
+    // Perhaps it is a SPIR kernel without the "opencl.kernels" metadata
+    return 1;
+#endif
+
+  for (unsigned i = 0, e = opencl_kernels->getNumOperands(); i != e; ++i) {
+    llvm::MDNode *kernel_iter = opencl_kernels->getOperand(i);
+
+    llvm::Value *meta =
+        dyn_cast<llvm::ValueAsMetadata>(kernel_iter->getOperand(0))->getValue();
+    llvm::Function *kernel_prototype = llvm::cast<llvm::Function>(meta);
+    std::string name = kernel_prototype->getName().str();
+    if (name == kernel_name) {
+      kernel_metadata = kernel_iter;
+      break;
+    }
+  }
+
+  kernel->arg_info = (struct pocl_argument_info *)calloc(
+      kernel->num_args, sizeof(struct pocl_argument_info));
+  memset(kernel->arg_info, 0,
+         sizeof(struct pocl_argument_info) * kernel->num_args);
+
+  kernel->has_arg_metadata = 0;
+
+  assert(kernel_metadata && "kernel NOT found in opencl.kernels metadata");
+
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+  int BitcodeIsSPIR = input->getTargetTriple().find("spir") == 0;
+#endif
+
+  unsigned e = kernel_metadata->getNumOperands();
+  for (unsigned i = 1; i != e; ++i) {
+    llvm::MDNode *meta_node =
+        llvm::cast<MDNode>(kernel_metadata->getOperand(i));
+
+    // argument num
+    unsigned arg_num = meta_node->getNumOperands();
+#ifndef NDEBUG
+    int has_meta_for_every_arg = ((arg_num - 1) == kernel->num_args);
+#endif
+
+    llvm::MDString *meta_name_node =
+        llvm::cast<MDString>(meta_node->getOperand(0));
+    std::string meta_name = meta_name_node->getString().str();
+
+    for (unsigned j = 1; j != arg_num; ++j) {
+      llvm::Value *meta_arg_value = NULL;
+      if (isa<ValueAsMetadata>(meta_node->getOperand(j)))
+        meta_arg_value =
+            dyn_cast<ValueAsMetadata>(meta_node->getOperand(j))->getValue();
+      else if (isa<ConstantAsMetadata>(meta_node->getOperand(j)))
+        meta_arg_value =
+            dyn_cast<ConstantAsMetadata>(meta_node->getOperand(j))->getValue();
+      struct pocl_argument_info *current_arg = &kernel->arg_info[j - 1];
+
+      if (meta_arg_value != NULL && isa<ConstantInt>(meta_arg_value) &&
+          meta_name == "kernel_arg_addr_space") {
+        assert(has_meta_for_every_arg &&
+               "kernel_arg_addr_space meta incomplete");
+        kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ADDRESS_QUALIFIER;
+        // std::cout << "is ConstantInt /  kernel_arg_addr_space" << std::endl;
+        llvm::ConstantInt *m = llvm::cast<ConstantInt>(meta_arg_value);
+        uint64_t val = m->getLimitedValue(UINT_MAX);
+        bool SPIRAddressSpaceIDs;
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+        SPIRAddressSpaceIDs = BitcodeIsSPIR;
+#else
+        // We have an LLVM fixed to produce always SPIR AS ids for the argument
+        // info metadata.
+        SPIRAddressSpaceIDs = true;
+#endif
+
+        if (SPIRAddressSpaceIDs) {
+          switch (val) {
+          case 0:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+            break;
+          case 1:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+            break;
+          case 3:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL;
+            break;
+          case 2:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+            break;
+          }
+        } else {
+          switch (val) {
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+          case POCL_FAKE_AS_PRIVATE:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+            break;
+          case POCL_FAKE_AS_GLOBAL:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+            break;
+          case POCL_FAKE_AS_LOCAL:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL;
+            break;
+          case POCL_FAKE_AS_CONSTANT:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+            break;
+          case POCL_FAKE_AS_GENERIC:
+            current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+            break;
+#endif
+          default:
+            POCL_MSG_ERR("Unknown address space ID %lu\n", val);
+            break;
+          }
+        }
+      } else if (isa<MDString>(meta_node->getOperand(j))) {
+        // std::cout << "is MDString" << std::endl;
+        llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
+        std::string val = m->getString().str();
+
+        if (meta_name == "kernel_arg_access_qual") {
+          assert(has_meta_for_every_arg &&
+                 "kernel_arg_access_qual meta incomplete");
+          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ACCESS_QUALIFIER;
+          if (val == "read_write")
+            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_WRITE;
+          else if (val == "read_only")
+            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_ONLY;
+          else if (val == "write_only")
+            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+          else if (val == "none")
+            current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_NONE;
+          else
+            std::cout << "UNKNOWN kernel_arg_access_qual value: " << val
+                      << std::endl;
+        } else if (meta_name == "kernel_arg_type") {
+          assert(has_meta_for_every_arg && "kernel_arg_type meta incomplete");
+          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_NAME;
+          current_arg->type_name = (char *)malloc(val.size() + 1);
+          std::strcpy(current_arg->type_name, val.c_str());
+        } else if (meta_name == "kernel_arg_base_type") {
+          // may or may not be present even in SPIR
+        } else if (meta_name == "kernel_arg_type_qual") {
+          assert(has_meta_for_every_arg &&
+                 "kernel_arg_type_qual meta incomplete");
+          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_QUALIFIER;
+          current_arg->type_qualifier = 0;
+          if (val.find("const") != std::string::npos)
+            current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_CONST;
+          if (val.find("restrict") != std::string::npos)
+            current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_RESTRICT;
+          if (val.find("volatile") != std::string::npos)
+            current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_VOLATILE;
+        } else if (meta_name == "kernel_arg_name") {
+          assert(has_meta_for_every_arg && "kernel_arg_name meta incomplete");
+          kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_NAME;
+          current_arg->name = (char *)malloc(val.size() + 1);
+          std::strcpy(current_arg->name, val.c_str());
+        } else
+          std::cout << "UNKNOWN opencl metadata name: " << meta_name
+                    << std::endl;
+      } else if (meta_name != "reqd_work_group_size")
+        std::cout << "UNKNOWN opencl metadata class for: " << meta_name
+                  << std::endl;
+    }
+  }
+  return 0;
+}
+
+static std::map<std::string, unsigned> type_size_map = {
+  {std::string("char"), (1)},
+  {std::string("uchar"), (1)},
+  {std::string("short"), (2)},
+  {std::string("ushort"), (2)},
+  {std::string("int"), (4)},
+  {std::string("uint"), (4)},
+  {std::string("long"), (8)},
+  {std::string("ulong"), (8)},
+
+
+  {std::string("char2"), (1*2)},
+  {std::string("uchar2"), (1*2)},
+  {std::string("short2"), (2*2)},
+  {std::string("ushort2"), (2*2)},
+  {std::string("int2"), (4*2)},
+  {std::string("uint2"), (4*2)},
+  {std::string("long2"), (8*2)},
+  {std::string("ulong2"), (8*2)},
+
+
+  {std::string("char3"), (1*4)},
+  {std::string("uchar3"), (1*4)},
+  {std::string("short3"), (2*4)},
+  {std::string("ushort3"), (2*4)},
+  {std::string("int3"), (4*4)},
+  {std::string("uint3"), (4*4)},
+  {std::string("long3"), (8*4)},
+  {std::string("ulong3"), (8*4)},
+
+
+  {std::string("char4"), (1*4)},
+  {std::string("uchar4"), (1*4)},
+  {std::string("short4"), (2*4)},
+  {std::string("ushort4"), (2*4)},
+  {std::string("int4"), (4*4)},
+  {std::string("uint4"), (4*4)},
+  {std::string("long4"), (8*4)},
+  {std::string("ulong4"), (8*4)},
+
+
+  {std::string("char8"), (1*8)},
+  {std::string("uchar8"), (1*8)},
+  {std::string("short8"), (2*8)},
+  {std::string("ushort8"), (2*8)},
+  {std::string("int8"), (4*8)},
+  {std::string("uint8"), (4*8)},
+  {std::string("long8"), (8*8)},
+  {std::string("ulong8"), (8*8)},
+
+  {std::string("char16"), (1*16)},
+  {std::string("uchar16"), (1*16)},
+  {std::string("short16"), (2*16)},
+  {std::string("ushort16"), (2*16)},
+  {std::string("int16"), (4*16)},
+  {std::string("uint16"), (4*16)},
+  {std::string("long16"), (8*16)},
+  {std::string("ulong16"), (8*16)},
+
+  {std::string("half"), (2)},
+  {std::string("float"), (4)},
+  {std::string("double"), (8)},
+
+  {std::string("half2"), (2*2)},
+  {std::string("float2"), (4*2)},
+  {std::string("double2"), (8*2)},
+
+  {std::string("half3"), (2*4)},
+  {std::string("float3"), (4*4)},
+  {std::string("double3"), (8*4)},
+
+  {std::string("half4"), (2*4)},
+  {std::string("float4"), (4*4)},
+  {std::string("double4"), (8*4)},
+
+  {std::string("half8"), (2*8)},
+  {std::string("float8"), (4*8)},
+  {std::string("double8"), (8*8)},
+
+  {std::string("half16"), (2*16)},
+  {std::string("float16"), (4*16)},
+  {std::string("double16"), (8*16)}
+};
+
+#ifndef LLVM_OLDER_THAN_3_9
+// Clang 3.9 uses function metadata instead of module metadata for presenting
+// OpenCL kernel information.
+static int pocl_get_kernel_arg_function_metadata(const char *kernel_name,
+                                                 llvm::Module *input,
+                                                 cl_kernel kernel) {
+  llvm::Function *Kernel = NULL;
+  int bitcode_is_spir = input->getTargetTriple().find("spir") == 0;
+
+  // SPIR still uses the "opencl.kernels" MD.
+  if (bitcode_is_spir) {
+    auto status =
+      pocl_get_kernel_arg_module_metadata(kernel_name, input, kernel);
+    if (status == 0)
+      return status;
+    // Else go on, because it might be SPIR encoded with modern LLVM
+  }
+
+  for (llvm::Module::iterator i = input->begin(), e = input->end();
+       i != e; ++i) {
+    if (i->getMetadata("kernel_arg_access_qual") &&
+        i->getName() == kernel_name) {
+      Kernel = &*i;
+      break;
+    }
+  }
+  assert(Kernel);
+  kernel->has_arg_metadata = 0;
+
+  llvm::MDNode *meta_node;
+  llvm::Value *meta_arg_value = NULL;
+  struct pocl_argument_info *current_arg = NULL;
+
+  kernel->arg_info = (struct pocl_argument_info *)calloc(
+      kernel->num_args, sizeof(struct pocl_argument_info));
+  memset(kernel->arg_info, 0,
+         sizeof(struct pocl_argument_info) * kernel->num_args);
+
+  // kernel_arg_addr_space
+  meta_node = Kernel->getMetadata("kernel_arg_addr_space");
+  assert(meta_node != nullptr);
+  unsigned arg_num = meta_node->getNumOperands();
+#ifndef NDEBUG
+  int has_meta_for_every_arg = (arg_num == kernel->num_args);
+#endif
+  for (unsigned j = 0; j < arg_num; ++j) {
+    assert(has_meta_for_every_arg && "kernel_arg_addr_space meta incomplete");
+
+    current_arg = &kernel->arg_info[j];
+    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ADDRESS_QUALIFIER;
+    // std::cout << "is ConstantInt /  kernel_arg_addr_space" << std::endl;
+    meta_arg_value =
+        dyn_cast<ConstantAsMetadata>(meta_node->getOperand(j))->getValue();
+    llvm::ConstantInt *m = llvm::cast<ConstantInt>(meta_arg_value);
+    uint64_t val = m->getLimitedValue(UINT_MAX);
+
+    bool SPIRAddressSpaceIDs;
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+    SPIRAddressSpaceIDs = bitcode_is_spir;
+#else
+    // We have an LLVM fixed to produce always SPIR AS ids for the argument
+    // info metadata.
+    SPIRAddressSpaceIDs = true;
+#endif
+    if (SPIRAddressSpaceIDs) {
+      switch (val) {
+      case 0:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+        break;
+      case 1:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+        break;
+      case 3:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL;
+        break;
+      case 2:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+        break;
+      default:
+        POCL_MSG_ERR("Unknown address space ID %lu\n", val);
+        break;
+      }
+    } else {
+      switch (val) {
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+      case POCL_FAKE_AS_PRIVATE:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+        break;
+      case POCL_FAKE_AS_GLOBAL:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_GLOBAL;
+        break;
+      case POCL_FAKE_AS_LOCAL:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_LOCAL;
+        break;
+      case POCL_FAKE_AS_CONSTANT:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_CONSTANT;
+        break;
+      case POCL_FAKE_AS_GENERIC:
+        current_arg->address_qualifier = CL_KERNEL_ARG_ADDRESS_PRIVATE;
+        break;
+#endif
+      default:
+        POCL_MSG_ERR("Unknown address space ID %lu\n", val);
+        break;
+      }
+    }
+  }
+
+  // kernel_arg_access_qual
+  meta_node = Kernel->getMetadata("kernel_arg_access_qual");
+  arg_num = meta_node->getNumOperands();
+#ifndef NDEBUG
+  has_meta_for_every_arg = (arg_num == kernel->num_args);
+#endif
+  assert(has_meta_for_every_arg && "kernel_arg_access_qual meta incomplete");
+
+  for (unsigned j = 0; j < meta_node->getNumOperands(); ++j) {
+    current_arg = &kernel->arg_info[j];
+    // std::cout << "is MDString" << std::endl;
+    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
+    std::string val = m->getString().str();
+
+    assert(has_meta_for_every_arg && "kernel_arg_access_qual meta incomplete");
+    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_ACCESS_QUALIFIER;
+    if (val == "read_write")
+      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_WRITE;
+    else if (val == "read_only")
+      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_READ_ONLY;
+    else if (val == "write_only")
+      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_WRITE_ONLY;
+    else if (val == "none")
+      current_arg->access_qualifier = CL_KERNEL_ARG_ACCESS_NONE;
+    else
+      std::cout << "UNKNOWN kernel_arg_access_qual value: " << val << std::endl;
+  }
+
+  // kernel_arg_type
+  meta_node = Kernel->getMetadata("kernel_arg_type");
+  assert(meta_node != nullptr);
+  arg_num = meta_node->getNumOperands();
+#ifndef NDEBUG
+  has_meta_for_every_arg = (arg_num == kernel->num_args);
+#endif
+  assert(has_meta_for_every_arg && "kernel_arg_type meta incomplete");
+
+  for (unsigned j = 0; j < meta_node->getNumOperands(); ++j) {
+    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
+    std::string val = m->getString().str();
+
+    current_arg = &kernel->arg_info[j];
+    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_NAME;
+    current_arg->type_name = (char *)malloc(val.size() + 1);
+    if (type_size_map.find(val) != type_size_map.end())
+      current_arg->type_size = type_size_map[val];
+    else
+      current_arg->type_size = 0;
+    std::strcpy(current_arg->type_name, val.c_str());
+  }
+
+  // kernel_arg_type_qual
+  meta_node = Kernel->getMetadata("kernel_arg_type_qual");
+  arg_num = meta_node->getNumOperands();
+#ifndef NDEBUG
+  has_meta_for_every_arg = (arg_num == kernel->num_args);
+#endif
+  assert(has_meta_for_every_arg && "kernel_arg_type_qual meta incomplete");
+  for (unsigned j = 0; j < meta_node->getNumOperands(); ++j) {
+    llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
+    std::string val = m->getString().str();
+
+    current_arg = &kernel->arg_info[j];
+    assert(has_meta_for_every_arg && "kernel_arg_type_qual meta incomplete");
+    kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_TYPE_QUALIFIER;
+    current_arg->type_qualifier = 0;
+    if (val.find("const") != std::string::npos)
+      current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_CONST;
+    if (val.find("restrict") != std::string::npos)
+      current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_RESTRICT;
+    if (val.find("volatile") != std::string::npos)
+      current_arg->type_qualifier |= CL_KERNEL_ARG_TYPE_VOLATILE;
+  }
+
+  // kernel_arg_name
+  meta_node = Kernel->getMetadata("kernel_arg_name");
+  if (meta_node) {
+    arg_num = meta_node->getNumOperands();
+#ifndef NDEBUG
+    has_meta_for_every_arg = (arg_num == kernel->num_args);
+#endif
+    assert(has_meta_for_every_arg && "kernel_arg_name meta incomplete");
+    for (unsigned j = 0; j < meta_node->getNumOperands(); ++j) {
+      llvm::MDString *m = llvm::cast<MDString>(meta_node->getOperand(j));
+      std::string val = m->getString().str();
+
+      current_arg = &kernel->arg_info[j];
+      kernel->has_arg_metadata |= POCL_HAS_KERNEL_ARG_NAME;
+      current_arg->name = (char *)malloc(val.size() + 1);
+      std::strcpy(current_arg->name, val.c_str());
+    }
+  }
+  else {
+    // With SPIR 2.0 generated by more modern (3.9) Clang there is no more
+    // "kernel_arg_name" metadata, so retrieve the name in another way
+    // \todo Implement walking on the arguments themselves to get the names
+  }
+
+  return 0;
+}
+#endif
+
+int pocl_llvm_get_kernel_metadata(cl_program program,
+                                  cl_kernel kernel,
+                                  int device_i,
+                                  const char* kernel_name,
+                                  int * errcode)
+{
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  int i;
+  llvm::Module *input = NULL;
+  cl_device_id Device = program->devices[device_i];
+
+  assert(Device->llvm_target_triplet && "Device has no target triple set");
+
+  if (program->llvm_irs != NULL && program->llvm_irs[device_i] != NULL)
+    input = (llvm::Module *)program->llvm_irs[device_i];
+  else {
+    *errcode = CL_INVALID_PROGRAM_EXECUTABLE;
+    return 1;
+  }
+
+  llvm::Function *KernelFunction = input->getFunction(kernel_name);
+  if (!KernelFunction) {
+    *errcode = CL_INVALID_KERNEL_NAME;
+    return 1;
+  }
+  kernel->num_args = KernelFunction->arg_size();
+
+#if defined(LLVM_OLDER_THAN_3_9)
+  if (pocl_get_kernel_arg_module_metadata(kernel_name, input, kernel)) {
+    *errcode = CL_INVALID_KERNEL;
+    return 1;
+  }
+#else
+  if (pocl_get_kernel_arg_function_metadata(kernel_name, input, kernel)) {
+    *errcode = CL_INVALID_KERNEL;
+    return 1;
+  }
+#endif
+
+#ifdef DEBUG_POCL_LLVM_API
+  printf("### fetching kernel metadata for kernel %s program %p "
+         "input llvm::Module %p\n",
+         kernel_name, program, input);
+#endif
+
+  DataLayout *TD = nullptr;
+#ifdef LLVM_OLDER_THAN_3_7
+  const std::string &ModuleDataLayout =
+      input->getDataLayout()->getStringRepresentation();
+#else
+  const std::string &ModuleDataLayout =
+      input->getDataLayout().getStringRepresentation();
+#endif
+  assert(!ModuleDataLayout.empty());
+  TD = new DataLayout(ModuleDataLayout);
+
+  SmallVector<GlobalVariable *, 8> locals;
+  for (llvm::Module::global_iterator i = input->global_begin(),
+                                     e = input->global_end();
+       i != e; ++i) {
+    std::string funcName = "";
+    funcName = KernelFunction->getName().str();
+    if (pocl::isAutomaticLocal(funcName, *i)) {
+      POCL_MSG_PRINT_LLVM("Automatic local detected: %s\n",
+                          i->getName().str().c_str());
+      locals.push_back(&*i);
+    }
+  }
+
+  kernel->num_locals = locals.size();
+
+  /* Temporary store for the arguments that are set with clSetKernelArg. */
+  kernel->dyn_arguments = (struct pocl_argument *)malloc(
+      (kernel->num_args + kernel->num_locals) * sizeof(struct pocl_argument));
+  /* Initialize kernel "dynamic" arguments (in case the user doesn't). */
+  for (unsigned i = 0; i < kernel->num_args; ++i) {
+    kernel->dyn_arguments[i].value = NULL;
+    kernel->dyn_arguments[i].size = 0;
+  }
+
+  /* Fill up automatic local arguments. */
+  for (unsigned i = 0; i < kernel->num_locals; ++i) {
+    unsigned auto_local_size =
+        TD->getTypeAllocSize(locals[i]->getInitializer()->getType());
+    kernel->dyn_arguments[kernel->num_args + i].value = NULL;
+    kernel->dyn_arguments[kernel->num_args + i].size = auto_local_size;
+#ifdef DEBUG_POCL_LLVM_API
+    printf("### automatic local %d size %u\n", i, auto_local_size);
+#endif
+  }
+
+  i = 0;
+  for (llvm::Function::const_arg_iterator ii = KernelFunction->arg_begin(),
+                                          ee = KernelFunction->arg_end();
+       ii != ee; ii++) {
+    llvm::Type *t = ii->getType();
+    struct pocl_argument_info &ArgInfo = kernel->arg_info[i];
+    ArgInfo.type = POCL_ARG_TYPE_NONE;
+    ArgInfo.is_local = false;
+    const llvm::PointerType *p = dyn_cast<llvm::PointerType>(t);
+    if (p && !ii->hasByValAttr()) {
+      ArgInfo.type = POCL_ARG_TYPE_POINTER;
+      // index 0 is for function attributes, parameters start at 1.
+      // TODO: detect the address space from MD.
+
+#ifndef POCL_USE_FAKE_ADDR_SPACE_IDS
+      if (ArgInfo.address_qualifier == CL_KERNEL_ARG_ADDRESS_LOCAL)
+        ArgInfo.is_local = true;
+#else
+      if (p->getAddressSpace() == POCL_FAKE_AS_GLOBAL ||
+          p->getAddressSpace() == POCL_FAKE_AS_CONSTANT ||
+          pocl::is_image_type(*t) || pocl::is_sampler_type(*t)) {
+        kernel->arg_info[i].is_local = false;
+      } else {
+        if (p->getAddressSpace() != POCL_FAKE_AS_LOCAL) {
+          p->dump();
+          assert(p->getAddressSpace() == POCL_FAKE_AS_LOCAL);
+        }
+        kernel->arg_info[i].is_local = true;
+      }
+#endif
+    }
+
+    if (pocl::is_image_type(*t)) {
+      ArgInfo.type = POCL_ARG_TYPE_IMAGE;
+    } else if (pocl::is_sampler_type(*t)) {
+      ArgInfo.type = POCL_ARG_TYPE_SAMPLER;
+    }
+    i++;
+  }
+
+  std::stringstream attrstr;
+  std::string vectypehint;
+  // fill 'kernel->reqd_wg_size'
+  kernel->reqd_wg_size = (size_t *)malloc(3 * sizeof(size_t));
+
+  size_t reqdx = 0, reqdy = 0, reqdz = 0;
+
+#ifdef LLVM_OLDER_THAN_3_9
+  llvm::NamedMDNode *size_info =
+    KernelFunction->getParent()->getNamedMetadata("opencl.kernel_wg_size_info");
+  if (size_info) {
+    for (unsigned i = 0, e = size_info->getNumOperands(); i != e; ++i) {
+      llvm::MDNode *KernelSizeInfo = size_info->getOperand(i);
+      if (dyn_cast<ValueAsMetadata>(
+        KernelSizeInfo->getOperand(0).get())->getValue() != KernelFunction)
+        continue;
+      reqdx = (llvm::cast<ConstantInt>(
+                 llvm::dyn_cast<ConstantAsMetadata>(
+                   KernelSizeInfo->getOperand(1))->getValue()))->getLimitedValue();
+      reqdy = (llvm::cast<ConstantInt>(
+                 llvm::dyn_cast<ConstantAsMetadata>(
+                   KernelSizeInfo->getOperand(2))->getValue()))->getLimitedValue();
+      reqdz = (llvm::cast<ConstantInt>(
+                 llvm::dyn_cast<ConstantAsMetadata>(
+                   KernelSizeInfo->getOperand(3))->getValue()))->getLimitedValue();
+      break;
+    }
+  }
+#else
+  llvm::MDNode *ReqdWGSize =
+      KernelFunction->getMetadata("reqd_work_group_size");
+  if (ReqdWGSize != NULL) {
+    reqdx = (llvm::cast<ConstantInt>(
+               llvm::dyn_cast<ConstantAsMetadata>(
+                 ReqdWGSize->getOperand(0))->getValue()))->getLimitedValue();
+    reqdy = (llvm::cast<ConstantInt>(
+               llvm::dyn_cast<ConstantAsMetadata>(
+                 ReqdWGSize->getOperand(1))->getValue()))->getLimitedValue();
+    reqdz = (llvm::cast<ConstantInt>(
+               llvm::dyn_cast<ConstantAsMetadata>(
+                 ReqdWGSize->getOperand(2))->getValue()))->getLimitedValue();
+  }
+#endif
+
+  // TODO: implement vec_type_hint / work_group_size_hint attributes
+  kernel->reqd_wg_size[0] = reqdx;
+  kernel->reqd_wg_size[1] = reqdy;
+  kernel->reqd_wg_size[2] = reqdz;
+  if (reqdx || reqdy || reqdz)
+    attrstr << "__attribute__((reqd_work_group_size("
+            << reqdx << ", " << reqdy
+            << ", " << reqdz << " )))";
+  if (vectypehint.size() > 0) {
+    if (reqdx || reqdy || reqdz)
+      attrstr << " ";
+    attrstr << "__attribute__ ((vec_type_hint (" << vectypehint << ")))";
+  }
+
+  std::string r = attrstr.str();
+  if (r.size() > 0) {
+    kernel->attributes = (char *)malloc(r.size() + 1);
+    std::memcpy(kernel->attributes, r.c_str(), r.size());
+    kernel->attributes[r.size()] = 0;
+  } else
+    kernel->attributes = NULL;
+
+#ifndef POCL_ANDROID
+  // Generate the kernel_obj.c file. This should be optional
+  // and generated only for the heterogeneous standalone devices which
+  // need the definitions to accompany the kernels, for the launcher
+  // code.
+  // TODO: the scripts use a generated kernel.h header file that
+  // gets added to this file. No checks seem to fail if that file
+  // is missing though, so it is left out from there for now
+
+  std::stringstream content;
+
+  content << std::endl
+          << "#include <pocl_device.h>" << std::endl
+          << "void _pocl_launcher_" << kernel_name
+          << "_workgroup(void** args, struct pocl_context*);" << std::endl
+          << "void _pocl_launcher_" << kernel_name
+          << "_workgroup_fast(void** args, struct pocl_context*);" << std::endl;
+
+  if (Device->global_as_id != 0)
+    content << "__attribute__((address_space(" << Device->global_as_id << ")))"
+            << std::endl;
+
+  content << "__kernel_metadata _" << kernel_name << "_md = {" << std::endl
+          << "     \"" << kernel_name << "\"," << std::endl
+          << "     " << kernel->num_args << "," << std::endl
+          << "     " << kernel->num_locals << "," << std::endl
+          << "     _pocl_launcher_" << kernel_name << "_workgroup_fast"
+          << std::endl
+          << " };" << std::endl;
+
+  pocl_cache_write_descriptor(program, device_i, kernel_name,
+                              content.str().c_str(), content.str().size());
+#endif
+
+  delete TD;
+  *errcode = CL_SUCCESS;
+  return 0;
+}
+
+
+
+/* This is the implementation of the public pocl_llvm_get_kernel_count(),
+ * and is used internally also by pocl_llvm_get_kernel_names to
+ */
+static unsigned pocl_llvm_get_kernel_count(cl_program program,
+                                           char **knames,
+                                           unsigned max_num_krn)
+{
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  /* any device's module will do for metadata, just use first non-NULL */
+  llvm::Module *mod = NULL;
+  unsigned i;
+  for (i = 0; i < program->num_devices; i++)
+    if (program->llvm_irs[i]) {
+      mod = (llvm::Module *)program->llvm_irs[i];
+      break;
+    }
+
+  llvm::NamedMDNode *md = mod->getNamedMetadata("opencl.kernels");
+  if (md) {
+
+    if (knames) {
+      for (unsigned i = 0; i < max_num_krn; i++) {
+        assert(md->getOperand(i)->getOperand(0) != NULL);
+        llvm::ValueAsMetadata *value =
+            dyn_cast<llvm::ValueAsMetadata>(md->getOperand(i)->getOperand(0));
+        llvm::Function *k = cast<Function>(value->getValue());
+        knames[i] = strdup(k->getName().data());
+      }
+    }
+    return md->getNumOperands();
+  }
+  // LLVM 3.9 does not use opencl.kernels meta, but kernel_arg_* function meta
+  else {
+    unsigned kernel_count = 0;
+    for (llvm::Module::iterator i = mod->begin(), e = mod->end(); i != e; ++i) {
+      if (i->getMetadata("kernel_arg_access_qual")) {
+        if (knames && kernel_count < max_num_krn) {
+          knames[kernel_count] = strdup(i->getName().str().c_str());
+        }
+        ++kernel_count;
+      }
+    }
+    return kernel_count;
+  }
+}
+
+unsigned pocl_llvm_get_kernel_count(cl_program program) {
+  return pocl_llvm_get_kernel_count(program, NULL, 0);
+}
+
+unsigned pocl_llvm_get_kernel_names(cl_program program, char **knames,
+                                    unsigned max_num_krn) {
+  unsigned n = pocl_llvm_get_kernel_count(program, knames, max_num_krn);
+
+  return n;
+}
diff --git a/lib/CL/pocl_llvm_utils.cc b/lib/CL/pocl_llvm_utils.cc
new file mode 100644
index 0000000..83353d5
--- /dev/null
+++ b/lib/CL/pocl_llvm_utils.cc
@@ -0,0 +1,370 @@
+/* pocl_llvm_utils.cc: various helpers for pocl LLVM API.
+
+   Copyright (c) 2013 Kalle Raiskila
+                 2013-2017 Pekka Jääskeläinen
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "config.h"
+#include "pocl_runtime_config.h"
+#include "pocl_llvm_api.h"
+#include "pocl_debug.h"
+
+#include <llvm/ADT/StringRef.h>
+#include <llvm/ADT/StringMap.h>
+
+#include <llvm/Support/MutexGuard.h>
+#include <llvm/Support/Host.h>
+#include <llvm/Support/TargetSelect.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/Signals.h>
+
+#include <llvm/IR/DiagnosticPrinter.h>
+#include <llvm/IR/LLVMContext.h>
+#include <llvm/IR/Module.h>
+
+#include <llvm/Target/TargetMachine.h>
+
+#include <llvm/IRReader/IRReader.h>
+
+#ifdef LLVM_OLDER_THAN_4_0
+#include <llvm/Bitcode/ReaderWriter.h>
+#else
+#include <llvm/Bitcode/BitcodeReader.h>
+#include <llvm/Bitcode/BitcodeWriter.h>
+#endif
+
+#include <llvm/Support/raw_os_ostream.h>
+#include <llvm/PassRegistry.h>
+
+#ifdef LLVM_OLDER_THAN_3_7
+#include <llvm/PassManager.h>
+#else
+#include <llvm/IR/LegacyPassManager.h>
+#define PassManager legacy::PassManager
+#endif
+
+
+
+
+using namespace llvm;
+
+#include <string>
+#include <map>
+
+llvm::Module *parseModuleIR(const char *path) {
+  SMDiagnostic Err;
+  return parseIRFile(path, Err, GlobalContext()).release();
+}
+
+
+void writeModuleIR(const Module *mod, std::string &str) {
+  llvm::raw_string_ostream sos(str);
+  WriteBitcodeToFile(mod, sos);
+  sos.str(); // flush
+}
+
+llvm::Module *parseModuleIRMem(const char *input_stream, size_t size) {
+  StringRef input_stream_ref(input_stream, size);
+  std::unique_ptr<MemoryBuffer> buffer =
+      MemoryBuffer::getMemBufferCopy(input_stream_ref);
+
+#ifdef LLVM_OLDER_THAN_3_8
+  llvm::ErrorOr<std::unique_ptr<llvm::Module>> parsed_module =
+      parseBitcodeFile(buffer->getMemBufferRef(), GlobalContext());
+  if (std::error_code ec = parsed_module.getError())
+    return nullptr;
+#else
+  auto parsed_module =
+      parseBitcodeFile(buffer->getMemBufferRef(), GlobalContext());
+  if (!parsed_module)
+    return nullptr;
+#endif
+  return parsed_module.get().release();
+}
+
+int getModuleTriple(const char *input_stream, size_t size,
+                    std::string &triple) {
+  StringRef input_stream_ref(input_stream, size);
+  std::unique_ptr<MemoryBuffer> buffer =
+      MemoryBuffer::getMemBufferCopy(input_stream_ref);
+
+#ifdef LLVM_OLDER_THAN_4_0
+  triple = getBitcodeTargetTriple(buffer->getMemBufferRef(), GlobalContext());
+#else
+  auto triple_e = getBitcodeTargetTriple(buffer->getMemBufferRef());
+  if (!triple_e)
+    return -1;
+  triple = triple_e.get();
+#endif
+  return 0;
+}
+
+char *get_cpu_name() {
+#ifdef __mips__
+  // TODO test: this may be handled by the if-"generic"-then-"HOST_CPU" below.
+  //
+  // The MIPS backend isn't able to automatically detect the host yet and the
+  // value returned by llvm::sys::getHostCPUName() isn't usable in the
+  // -target-cpu option so we must use the CPU detected by CMake.
+  StringRef r = OCL_KERNEL_TARGET_CPU;
+#else
+  StringRef r = llvm::sys::getHostCPUName();
+#endif
+
+#ifdef LLVM_3_8
+  // https://github.com/pocl/pocl/issues/413
+  if (r.str() == "skylake") {
+    r = llvm::StringRef("haswell");
+  }
+#endif
+
+  if (r.str() == "generic") {
+    POCL_MSG_WARN("LLVM does not recognize your cpu, trying to use "
+                   HOST_CPU " for -target-cpu\n");
+    r = llvm::StringRef(HOST_CPU);
+  }
+
+  assert(r.size() > 0);
+  char *cpu_name = (char *)malloc(r.size() + 1);
+  strncpy(cpu_name, r.data(), r.size());
+  cpu_name[r.size()] = 0;
+  return cpu_name;
+}
+
+int bitcode_is_spir(const char *bitcode, size_t size) {
+  std::string triple;
+  int err = getModuleTriple(bitcode, size, triple);
+  if (!err)
+    return triple.find("spir") == 0;
+  else
+    return 0;
+}
+
+// TODO this should be fixed to not require LLVM eventually,
+// so that LLVM-less builds also report FMA correctly.
+int cpu_has_fma() {
+  StringMap<bool> features;
+  bool res = llvm::sys::getHostCPUFeatures(features);
+  assert(res);
+  return ((features["fma"] || features["fma4"]) ? 1 : 0);
+}
+
+#define VECWIDTH(x)                                                            \
+  std::min(std::max((lane_width / (unsigned)(sizeof(x))), 1U), 16U)
+
+void cpu_setup_vector_widths(cl_device_id dev) {
+  StringMap<bool> features;
+  bool res = llvm::sys::getHostCPUFeatures(features);
+  assert(res);
+  unsigned lane_width = 1;
+  if ((features["sse"]) || (features["neon"]))
+    lane_width = 16;
+  if (features["avx"])
+    lane_width = 32;
+  if (features["avx512f"])
+    lane_width = 64;
+
+  dev->native_vector_width_char = dev->preferred_vector_width_char =
+      VECWIDTH(char);
+  dev->native_vector_width_short = dev->preferred_vector_width_short =
+      VECWIDTH(short);
+  dev->native_vector_width_int = dev->preferred_vector_width_int =
+      VECWIDTH(int);
+  dev->native_vector_width_long = dev->preferred_vector_width_long =
+      VECWIDTH(long);
+  dev->native_vector_width_float = dev->preferred_vector_width_float =
+      VECWIDTH(float);
+  dev->native_vector_width_double = dev->preferred_vector_width_double =
+      VECWIDTH(double);
+  dev->native_vector_width_half = dev->preferred_vector_width_half =
+      VECWIDTH(short);
+}
+
+int pocl_llvm_remove_file_on_signal(const char *file) {
+  return llvm::sys::RemoveFileOnSignal(
+            StringRef(file)) ? 0 : -1;
+}
+
+/*
+ * Use one global LLVMContext across all LLVM bitcodes. This is because
+ * we want to cache the bitcode IR libraries and reuse them when linking
+ * new kernels. The CloneModule etc. seem to assume we are linking
+ * bitcodes with a same LLVMContext. Unfortunately, this requires serializing
+ * all calls to the LLVM APIs with mutex.
+ * Freeing/deleting the context crashes LLVM 3.2 (at program exit), as a
+ * work-around, allocate this from heap.
+ */
+static LLVMContext *globalContext = NULL;
+static bool LLVMInitialized = false;
+
+static std::string poclDiagString;
+static llvm::raw_string_ostream poclDiagStream(poclDiagString);
+static DiagnosticPrinterRawOStream poclDiagPrinter(poclDiagStream);
+
+static void diagHandler(const DiagnosticInfo &DI, void *Context) {
+  DI.print(poclDiagPrinter);
+}
+
+std::string getDiagString() {
+  poclDiagStream.flush();
+  std::string ret(std::move(poclDiagString));
+  poclDiagString.clear();
+  return ret;
+}
+
+llvm::LLVMContext &GlobalContext() {
+  if (globalContext == NULL) {
+    globalContext = new LLVMContext();
+    globalContext->setDiagnosticHandler(diagHandler, globalContext);
+  }
+  return *globalContext;
+}
+
+/* The LLVM API interface functions are not at the moment not thread safe,
+ * Pocl needs to ensure only one thread is using this layer at the time.
+ */
+static pocl_lock_t kernelCompilerLock = POCL_LOCK_INITIALIZER;
+
+PoclCompilerMutexGuard::PoclCompilerMutexGuard(void *unused) {
+  POCL_LOCK(kernelCompilerLock);
+}
+
+PoclCompilerMutexGuard::~PoclCompilerMutexGuard() {
+  POCL_UNLOCK(kernelCompilerLock);
+}
+
+std::string currentWgMethod;
+
+/* must be called with kernelCompilerLock locked */
+void InitializeLLVM() {
+
+  if (LLVMInitialized)
+    return;
+  // We have not initialized any pass managers for any device yet.
+  // Run the global LLVM pass initialization functions.
+  InitializeAllTargets();
+  InitializeAllTargetMCs();
+  InitializeAllAsmPrinters();
+  InitializeAllAsmParsers();
+
+  PassRegistry &Registry = *PassRegistry::getPassRegistry();
+
+  initializeCore(Registry);
+  initializeScalarOpts(Registry);
+  initializeVectorization(Registry);
+  initializeIPO(Registry);
+  initializeAnalysis(Registry);
+#ifdef LLVM_OLDER_THAN_3_8
+  initializeIPA(Registry);
+#endif
+  initializeTransformUtils(Registry);
+  initializeInstCombine(Registry);
+  initializeInstrumentation(Registry);
+  initializeTarget(Registry);
+
+// Set the options only once. TODO: fix it so that each
+// device can reset their own options. Now one cannot compile
+// with different options to different devices at one run.
+
+#ifdef LLVM_OLDER_THAN_3_7
+  StringMap<llvm::cl::Option *> opts;
+  llvm::cl::getRegisteredOptions(opts);
+#else
+  StringMap<llvm::cl::Option *> &opts = llvm::cl::getRegisteredOptions();
+#endif
+
+  llvm::cl::Option *O = nullptr;
+
+  currentWgMethod = pocl_get_string_option("POCL_WORK_GROUP_METHOD", "loopvec");
+
+  if (currentWgMethod == "loopvec") {
+
+    O = opts["scalarize-load-store"];
+    assert(O && "could not find LLVM option 'scalarize-load-store'");
+    O->addOccurrence(1, StringRef("scalarize-load-store"), StringRef("1"),
+                     false);
+
+    // LLVM inner loop vectorizer does not check whether the loop inside
+    // another loop, in which case even a small trip count loops might be
+    // worthwhile to vectorize.
+    O = opts["vectorizer-min-trip-count"];
+    assert(O && "could not find LLVM option 'vectorizer-min-trip-count'");
+    O->addOccurrence(1, StringRef("vectorizer-min-trip-count"), StringRef("2"),
+                     false);
+
+    if (pocl_get_bool_option("POCL_VECTORIZER_REMARKS", 0) == 1) {
+      // Enable diagnostics from the loop vectorizer.
+      O = opts["pass-remarks-missed"];
+      assert(O && "could not find LLVM option 'pass-remarks-missed'");
+      O->addOccurrence(1, StringRef("pass-remarks-missed"),
+                       StringRef("loop-vectorize"), false);
+
+      O = opts["pass-remarks-analysis"];
+      assert(O && "could not find LLVM option 'pass-remarks-analysis'");
+      O->addOccurrence(1, StringRef("pass-remarks-analysis"),
+                       StringRef("loop-vectorize"), false);
+
+      O = opts["pass-remarks"];
+      assert(O && "could not find LLVM option 'pass-remarks'");
+      O->addOccurrence(1, StringRef("pass-remarks"),
+                       StringRef("loop-vectorize"), false);
+    }
+  }
+  if (pocl_get_bool_option("POCL_DEBUG_LLVM_PASSES", 0) == 1) {
+    O = opts["debug"];
+    assert(O && "could not find LLVM option 'debug'");
+    O->addOccurrence(1, StringRef("debug"), StringRef("true"), false);
+  }
+
+  O = opts["unroll-threshold"];
+  assert(O && "could not find LLVM option 'unroll-threshold'");
+  O->addOccurrence(1, StringRef("unroll-threshold"), StringRef("1"), false);
+
+  LLVMInitialized = true;
+}
+
+
+// TODO FIXME currently pocl_llvm_release() only works when
+// there are zero programs with IRs, because
+// programs hold references to LLVM IRs
+long numberOfIRs = 0;
+
+void pocl_llvm_release() {
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+
+  assert(numberOfIRs >= 0);
+
+  if (numberOfIRs > 0) {
+    POCL_MSG_PRINT_LLVM("still have references to IRs - not releasing LLVM\n");
+    return;
+  } else {
+    POCL_MSG_PRINT_LLVM("releasing LLVM\n");
+  }
+
+  clearKernelPasses();
+  clearTargetMachines();
+  cleanKernelLibrary();
+
+  delete globalContext;
+  globalContext = nullptr;
+  LLVMInitialized = false;
+}
diff --git a/lib/CL/pocl_llvm_wg.cc b/lib/CL/pocl_llvm_wg.cc
new file mode 100644
index 0000000..7d9a435
--- /dev/null
+++ b/lib/CL/pocl_llvm_wg.cc
@@ -0,0 +1,658 @@
+/* pocl_llvm_wg.cc: part of pocl LLVM API dealing with parallel.bc,
+   optimization passes and codegen.
+
+   Copyright (c) 2013 Kalle Raiskila
+                 2013-2017 Pekka Jääskeläinen
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "config.h"
+#include "pocl.h"
+#include "pocl_cache.h"
+#include "pocl_llvm_api.h"
+#include "pocl_file_util.h"
+
+#include <string>
+#include <map>
+#include <vector>
+#include <iostream>
+
+#include <llvm/Support/Casting.h>
+#include <llvm/Support/MutexGuard.h>
+#include <llvm/Support/TargetRegistry.h>
+#include <llvm/Support/SourceMgr.h>
+#include <llvm/Support/CommandLine.h>
+
+#include <llvm/ADT/Triple.h>
+#include <llvm/ADT/StringRef.h>
+
+#include <llvm/IR/Module.h>
+#include <llvm/IR/Function.h>
+
+#include <llvm/Target/TargetOptions.h>
+#include <llvm/Target/TargetMachine.h>
+
+#include <llvm/Transforms/IPO/PassManagerBuilder.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+
+#include <llvm/PassRegistry.h>
+#include <llvm/PassInfo.h>
+
+#ifdef LLVM_OLDER_THAN_3_7
+#include <llvm/PassManager.h>
+#include <llvm/Target/TargetLibraryInfo.h>
+#else
+#include <llvm/Analysis/TargetLibraryInfo.h>
+#include <llvm/Analysis/TargetTransformInfo.h>
+#include <llvm/IR/LegacyPassManager.h>
+#define PassManager legacy::PassManager
+#endif
+
+using namespace llvm;
+
+/**
+ * Prepare the kernel compiler passes.
+ *
+ * The passes are created only once per program run per device.
+ * The returned pass manager should not be modified, only the Module
+ * should be optimized using it.
+ */
+
+static std::map<cl_device_id, llvm::TargetMachine *> targetMachines;
+static std::map<cl_device_id, PassManager *> kernelPasses;
+
+// This is used to control the kernel we process in the kernel compilation.
+extern cl::opt<std::string> KernelName;
+
+/* FIXME: these options should come from the cl_device, and
+ * cl_program's options. */
+static llvm::TargetOptions GetTargetOptions() {
+  llvm::TargetOptions Options;
+#ifdef LLVM_OLDER_THAN_3_9
+  Options.PositionIndependentExecutable = true;
+#endif
+#ifdef HOST_FLOAT_SOFT_ABI
+  Options.FloatABIType = FloatABI::Soft;
+#else
+  Options.FloatABIType = FloatABI::Hard;
+#endif
+#if 0
+  Options.LessPreciseFPMADOption = EnableFPMAD;
+  Options.NoFramePointerElim = DisableFPElim;
+  Options.NoFramePointerElimNonLeaf = DisableFPElimNonLeaf;
+  Options.AllowFPOpFusion = FuseFPOps;
+  Options.UnsafeFPMath = EnableUnsafeFPMath;
+  Options.NoInfsFPMath = EnableNoInfsFPMath;
+  Options.NoNaNsFPMath = EnableNoNaNsFPMath;
+  Options.HonorSignDependentRoundingFPMathOption =
+  EnableHonorSignDependentRoundingFPMath;
+  Options.UseSoftFloat = GenerateSoftFloatCalls;
+  if (FloatABIForCalls != FloatABI::Default)
+    Options.FloatABIType = FloatABIForCalls;
+  Options.NoZerosInBSS = DontPlaceZerosInBSS;
+  Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
+  Options.DisableTailCalls = DisableTailCalls;
+  Options.StackAlignmentOverride = OverrideStackAlignment;
+  Options.RealignStack = EnableRealignStack;
+  Options.TrapFuncName = TrapFuncName;
+  Options.EnableSegmentedStacks = SegmentedStacks;
+  Options.UseInitArray = UseInitArray;
+  Options.SSPBufferSize = SSPBufferSize;
+#endif
+  return Options;
+}
+
+void clearTargetMachines() {
+  for (auto i = targetMachines.begin(), e = targetMachines.end(); i != e; ++i) {
+    delete (llvm::TargetMachine *)i->second;
+  }
+  targetMachines.clear();
+}
+
+void clearKernelPasses() {
+  for (auto i = kernelPasses.begin(), e = kernelPasses.end();
+       i != e; ++i) {
+    PassManager *pm = (PassManager *)i->second;
+    delete pm;
+  }
+
+  kernelPasses.clear();
+}
+
+// Returns the TargetMachine instance or zero if no triple is provided.
+static TargetMachine *GetTargetMachine(cl_device_id device) {
+
+  if (targetMachines.find(device) != targetMachines.end())
+    return targetMachines[device];
+
+  std::string Error;
+  Triple TheTriple(device->llvm_target_triplet);
+
+  std::string MCPU = device->llvm_cpu ? device->llvm_cpu : "";
+
+  const Target *TheTarget = TargetRegistry::lookupTarget("", TheTriple, Error);
+
+  // In LLVM 3.4 and earlier, the target registry falls back to
+  // the cpp backend in case a proper match was not found. In
+  // that case simply do not use target info in the compilation
+  // because it can be an off-tree target not registered at
+  // this point (read: TCE).
+  if (!TheTarget || TheTarget->getName() == std::string("cpp")) {
+    return 0;
+  }
+
+  TargetMachine *TM = TheTarget->createTargetMachine(
+      TheTriple.getTriple(), MCPU, StringRef(""), GetTargetOptions(),
+      Reloc::PIC_, CodeModel::Default, CodeGenOpt::Aggressive);
+  assert(TM != NULL && "llvm target has no targetMachine constructor");
+  if (device->ops->init_target_machine)
+    device->ops->init_target_machine(device->data, TM);
+  targetMachines[device] = TM;
+
+  return TM;
+}
+/* helpers copied from LLVM opt END */
+
+static PassManager &
+kernel_compiler_passes(cl_device_id device, llvm::Module *input,
+                       const std::string &module_data_layout) {
+
+  PassManager *Passes = nullptr;
+  PassRegistry *Registry = nullptr;
+
+  if (kernelPasses.find(device) != kernelPasses.end()) {
+    return *kernelPasses[device];
+  }
+
+  bool SPMDDevice = device->spmd;
+
+  Registry = PassRegistry::getPassRegistry();
+
+  Triple triple(device->llvm_target_triplet);
+
+  Passes = new PassManager();
+
+  // Need to setup the target info for target specific passes. */
+  TargetMachine *Machine = GetTargetMachine(device);
+
+#ifdef LLVM_OLDER_THAN_3_7
+  // Add internal analysis passes from the target machine.
+  if (Machine)
+    Machine->addAnalysisPasses(*Passes);
+#else
+  if (Machine)
+    Passes->add(
+        createTargetTransformInfoWrapperPass(Machine->getTargetIRAnalysis()));
+#endif
+
+  if (module_data_layout != "") {
+#if (defined LLVM_OLDER_THAN_3_7)
+    Passes->add(new DataLayoutPass());
+#endif
+  }
+
+  /* Disables automated generation of libcalls from code patterns.
+     TCE doesn't have a runtime linker which could link the libs later on.
+     Also the libcalls might be harmful for WG autovectorization where we
+     want to try to vectorize the code it converts to e.g. a memset or
+     a memcpy */
+#ifdef LLVM_OLDER_THAN_3_7
+  TargetLibraryInfo *TLI = new TargetLibraryInfo(triple);
+  TLI->disableAllFunctions();
+  Passes->add(TLI);
+#else
+  TargetLibraryInfoImpl TLII(triple);
+  TLII.disableAllFunctions();
+  Passes->add(new TargetLibraryInfoWrapperPass(TLII));
+#endif
+
+  /* The kernel compiler passes to run, in order.
+
+     Notes about the kernel compiler phase ordering:
+
+     -mem2reg first because we get unoptimized output from Clang where all
+     variables are allocas. Avoid context saving the allocas and make the
+     more readable by calling -mem2reg at the beginning.
+
+     -implicit-cond-barriers after -implicit-loop-barriers because the latter
+     can inject barriers to loops inside conditional regions after which the
+     peeling should be avoided by injecting the implicit conditional barriers.
+
+     -loop-barriers, -barriertails, and -barriers should be ran after the
+     implicit barrier injection passes so they "normalize" the implicit
+     barriers also.
+
+     -phistoallocas before -workitemloops as otherwise it cannot inject context
+     restore code (PHIs need to be at the beginning of the BB and so one cannot
+     context restore them with non-PHI code if the value is needed in another
+     PHI). */
+
+  std::vector<std::string> passes;
+  passes.push_back("remove-optnone");
+  passes.push_back("optimize-wi-func-calls");
+  passes.push_back("handle-samplers");
+  passes.push_back("workitem-handler-chooser");
+  passes.push_back("mem2reg");
+  passes.push_back("domtree");
+  if (device->autolocals_to_args)
+    passes.push_back("automatic-locals");
+
+  if (SPMDDevice) {
+    passes.push_back("flatten-inline-all");
+    passes.push_back("always-inline");
+  }  else {
+    passes.push_back("flatten-globals");
+    passes.push_back("always-inline");
+#ifndef LLVM_3_9
+    passes.push_back("inline");
+#endif
+  }
+
+#ifndef LLVM_OLDER_THAN_4_0
+  // It should be now safe to run -O3 over the single work-item kernel
+  // as the barrier has the attributes preventing illegal motions and
+  // duplication. Let's do it to clean up the code for later passes.
+  // Especially the WI context structures get needlessly bloated in case there
+  // is dead code lying around.
+  passes.push_back("STANDARD_OPTS");
+#else
+  // Just clean up any unused globals.
+  passes.push_back("globaldce");
+#endif
+
+  if (!SPMDDevice) {
+    passes.push_back("simplifycfg");
+    passes.push_back("loop-simplify");
+    passes.push_back("uniformity");
+    passes.push_back("phistoallocas");
+    passes.push_back("isolate-regions");
+    passes.push_back("implicit-loop-barriers");
+    passes.push_back("implicit-cond-barriers");
+    passes.push_back("loop-barriers");
+    passes.push_back("barriertails");
+    passes.push_back("barriers");
+    passes.push_back("isolate-regions");
+    passes.push_back("wi-aa");
+    passes.push_back("workitemrepl");
+    //passes.push_back("print-module");
+    passes.push_back("workitemloops");
+    // Remove the (pseudo) barriers.   They have no use anymore due to the
+    // work-item loop control taking care of them.
+    passes.push_back("remove-barriers");
+  }
+  // Add the work group launcher functions and privatize the pseudo variable
+  // (local id) accesses.
+  if (device->workgroup_pass)
+    passes.push_back("workgroup");
+
+  // Attempt to move all allocas to the entry block to avoid the need for
+  // dynamic stack which is problematic for some architectures.
+  passes.push_back("allocastoentry");
+
+#ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
+  // Convert the semantical OpenCL address space IDs to the ones of the target.
+  passes.push_back("target-address-spaces");
+#endif
+
+  // Later passes might get confused (and expose possible bugs in them) due to
+  // UNREACHABLE blocks left by repl. So let's clean up the CFG before running
+  // the standard LLVM optimizations.
+  passes.push_back("simplifycfg");
+
+#if 0
+  passes.push_back("print-module");
+  passes.push_back("dot-cfg");
+#endif
+
+  if (currentWgMethod == "loopvec")
+    passes.push_back("scalarizer");
+
+  passes.push_back("instcombine");
+  passes.push_back("STANDARD_OPTS");
+  passes.push_back("instcombine");
+
+  // Now actually add the listed passes to the PassManager.
+  for (unsigned i = 0; i < passes.size(); ++i) {
+    // This is (more or less) -O3.
+    if (passes[i] == "STANDARD_OPTS") {
+      PassManagerBuilder Builder;
+      Builder.OptLevel = 3;
+      Builder.SizeLevel = 0;
+
+      // These need to be setup in addition to invoking the passes
+      // to get the vectorizers initialized properly.
+      if (currentWgMethod == "loopvec") {
+        Builder.LoopVectorize = true;
+        Builder.SLPVectorize = true;
+      }
+      Builder.populateModulePassManager(*Passes);
+      continue;
+    }
+
+    const PassInfo *PIs = Registry->getPassInfo(StringRef(passes[i]));
+    if (PIs) {
+      // std::cout << "-"<<passes[i] << " ";
+      Pass *thispass = PIs->createPass();
+      Passes->add(thispass);
+    } else {
+      std::cerr << "Failed to create kernel compiler pass " << passes[i]
+                << std::endl;
+      POCL_ABORT("FAIL");
+    }
+  }
+
+  kernelPasses[device] = Passes;
+  return *Passes;
+}
+
+void pocl_destroy_llvm_module(void *modp) {
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  llvm::Module *mod = (llvm::Module *)modp;
+  if (mod) {
+    delete mod;
+    --numberOfIRs;
+  }
+}
+
+// Defined in llvmopencl/WorkitemHandler.cc
+namespace pocl {
+extern size_t WGLocalSizeX;
+extern size_t WGLocalSizeY;
+extern size_t WGLocalSizeZ;
+extern bool WGDynamicLocalSize;
+}
+
+int pocl_llvm_generate_workgroup_function_nowrite(cl_device_id device,
+  cl_kernel kernel, size_t local_x, size_t local_y, size_t local_z, void **output) {
+
+  int device_i = pocl_cl_device_to_index(kernel->program, device);
+  assert(device_i >= 0);
+
+  pocl::WGDynamicLocalSize = (local_x == 0 && local_y == 0 && local_z == 0);
+
+  currentPoclDevice = device;
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+#ifdef DEBUG_POCL_LLVM_API
+  printf("### calling the kernel compiler for kernel %s local_x %zu "
+         "local_y %zu local_z %zu parallel_filename: %s\n",
+         kernel->name, local_x, local_y, local_z, parallel_bc_path);
+#endif
+
+  Triple triple(device->llvm_target_triplet);
+
+  SMDiagnostic Err;
+
+  // Link the kernel and runtime library
+  llvm::Module *input = NULL;
+  if (kernel->program->llvm_irs != NULL &&
+      kernel->program->llvm_irs[device_i] != NULL) {
+#ifdef DEBUG_POCL_LLVM_API
+    printf("### cloning the preloaded LLVM IR\n");
+#endif
+    llvm::Module *p = (llvm::Module *)kernel->program->llvm_irs[device_i];
+#ifdef LLVM_OLDER_THAN_3_8
+    input = llvm::CloneModule(p);
+#else
+    input = (llvm::CloneModule(p)).release();
+#endif
+  } else {
+#ifdef DEBUG_POCL_LLVM_API
+    printf("### loading the kernel bitcode from disk\n");
+#endif
+    char program_bc_path[POCL_FILENAME_LENGTH];
+    pocl_cache_program_bc_path(program_bc_path, kernel->program, device_i);
+    input = parseModuleIR(program_bc_path);
+  }
+
+  /* Note this is a hack to get SPIR working. We'll be linking the
+   * host kernel library (plain LLVM IR) to the SPIR program.bc,
+   * so LLVM complains about incompatible DataLayouts. The proper solution
+   * would be to generate a SPIR kernel library
+   */
+  if (triple.getArch() == Triple::x86 || triple.getArch() == Triple::x86_64) {
+    if (input->getTargetTriple().substr(0, 6) == std::string("spir64")) {
+      input->setTargetTriple(triple.getTriple());
+      input->setDataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
+    } else if (input->getTargetTriple().substr(0, 4) == std::string("spir")) {
+      input->setTargetTriple(triple.getTriple());
+      input->setDataLayout("e-m:e-p:32:32-i64:64-f80:32-n8:16:32-S32");
+    }
+  }
+
+  /* Now finally run the set of passes assembled above */
+  // TODO pass these as parameters instead, this is not thread safe!
+  pocl::WGLocalSizeX = local_x;
+  pocl::WGLocalSizeY = local_y;
+  pocl::WGLocalSizeZ = local_z;
+  KernelName = kernel->name;
+
+#ifdef LLVM_OLDER_THAN_3_7
+  kernel_compiler_passes(device, input,
+                         input->getDataLayout()->getStringRepresentation())
+      .run(*input);
+#else
+  kernel_compiler_passes(device, input,
+                         input->getDataLayout().getStringRepresentation())
+      .run(*input);
+#endif
+
+  assert(output != NULL);
+  *output = (void *)input;
+  ++numberOfIRs;
+  return 0;
+}
+
+
+int pocl_llvm_generate_workgroup_function(cl_device_id device, cl_kernel kernel,
+                                          size_t local_x, size_t local_y,
+                                          size_t local_z) {
+
+  void *modp = NULL;
+
+  int device_i = pocl_cl_device_to_index(kernel->program, device);
+  assert(device_i >= 0);
+
+  char parallel_bc_path[POCL_FILENAME_LENGTH];
+  pocl_cache_work_group_function_path(parallel_bc_path, kernel->program,
+                                      device_i, kernel, local_x, local_y,
+                                      local_z);
+
+  if (pocl_exists(parallel_bc_path))
+    return CL_SUCCESS;
+
+  char final_binary_path[POCL_FILENAME_LENGTH];
+  pocl_cache_final_binary_path(final_binary_path, kernel->program, device_i,
+                               kernel, local_x, local_y, local_z);
+
+  if (pocl_exists(final_binary_path))
+    return CL_SUCCESS;
+
+  int error = pocl_llvm_generate_workgroup_function_nowrite(
+      device, kernel, local_x, local_y, local_z, &modp);
+  if (error)
+    return error;
+
+  error = pocl_cache_write_kernel_parallel_bc(
+      modp, kernel->program, device_i, kernel, local_x, local_y, local_z);
+
+  if (error)
+    {
+      POCL_MSG_ERR ("pocl_cache_write_kernel_parallel_bc()"
+                    " failed with %i\n", error);
+      return error;
+    }
+
+  pocl_destroy_llvm_module(modp);
+  return error;
+}
+
+int pocl_update_program_llvm_irs(cl_program program,
+                                 unsigned device_i,
+                                 cl_device_id device) {
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  char program_bc_path[POCL_FILENAME_LENGTH];
+  pocl_cache_program_bc_path(program_bc_path, program, device_i);
+
+  if (!pocl_exists(program_bc_path))
+    {
+      POCL_MSG_ERR ("%s does not exist!\n",
+                     program_bc_path);
+      return -1;
+    }
+
+  assert(program->llvm_irs[device_i] == nullptr);
+  program->llvm_irs[device_i] = parseModuleIR(program_bc_path);
+  ++numberOfIRs;
+  return 0;
+}
+
+void pocl_free_llvm_irs(cl_program program, int device_i) {
+  if (program->llvm_irs[device_i]) {
+    PoclCompilerMutexGuard lockHolder(NULL);
+    InitializeLLVM();
+    llvm::Module *mod = (llvm::Module *)program->llvm_irs[device_i];
+    delete mod;
+    --numberOfIRs;
+    program->llvm_irs[device_i] = NULL;
+  }
+}
+
+void pocl_llvm_update_binaries(cl_program program) {
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  char program_bc_path[POCL_FILENAME_LENGTH];
+  void *cache_lock = NULL;
+  int error;
+
+  // Dump the LLVM IR Modules to memory buffers.
+  assert(program->llvm_irs != NULL);
+#ifdef DEBUG_POCL_LLVM_API
+  printf("### refreshing the binaries of the program %p\n", program);
+#endif
+
+  for (size_t i = 0; i < program->num_devices; ++i) {
+    assert(program->llvm_irs[i] != NULL);
+    if (program->binaries[i])
+      continue;
+
+    cache_lock = pocl_cache_acquire_writer_lock_i(program, i);
+    assert(cache_lock);
+
+    pocl_cache_program_bc_path(program_bc_path, program, i);
+    error = pocl_write_module((llvm::Module *)program->llvm_irs[i],
+                              program_bc_path, 1);
+    assert(error == 0);
+    if (error)
+      {
+        POCL_MSG_ERR ("pocl_write_module(%s) failed!\n",
+                     program_bc_path);
+        continue;
+      }
+
+    std::string content;
+    writeModuleIR((llvm::Module *)program->llvm_irs[i], content);
+
+    size_t n = content.size();
+    if (n < program->binary_sizes[i])
+      POCL_ABORT("binary size doesn't match the expected value");
+    if (program->binaries[i])
+      POCL_MEM_FREE(program->binaries[i]);
+    program->binaries[i] = (unsigned char *)malloc(n);
+    std::memcpy(program->binaries[i], content.c_str(), n);
+
+    pocl_cache_release_lock(cache_lock);
+    cache_lock = NULL;
+
+#ifdef DEBUG_POCL_LLVM_API
+    printf("### binary for device %zi was of size %zu\n", i,
+           program->binary_sizes[i]);
+#endif
+  }
+}
+
+
+/* Run LLVM codegen on input file (parallel-optimized).
+ * modp = llvm::Module* of parallel.bc
+ * Output native object file (<kernel>.so.o). */
+int pocl_llvm_codegen(cl_kernel kernel, cl_device_id device, void *modp,
+                      char **output, size_t *output_size) {
+
+  PoclCompilerMutexGuard lockHolder(NULL);
+  InitializeLLVM();
+
+  llvm::Triple triple(device->llvm_target_triplet);
+  llvm::TargetMachine *target = GetTargetMachine(device);
+
+  llvm::Module *input = (llvm::Module *)modp;
+  assert(input);
+  *output = NULL;
+
+  PassManager PM;
+#ifdef LLVM_OLDER_THAN_3_7
+  llvm::TargetLibraryInfo *TLI = new TargetLibraryInfo(triple);
+  PM.add(TLI);
+#else
+  llvm::TargetLibraryInfoWrapperPass *TLIPass =
+      new TargetLibraryInfoWrapperPass(triple);
+  PM.add(TLIPass);
+#endif
+#ifdef LLVM_OLDER_THAN_3_7
+  if (target != NULL) {
+    target->addAnalysisPasses(PM);
+  }
+#endif
+
+  // TODO: get DataLayout from the 'device'
+  // TODO: better error check
+#ifdef LLVM_OLDER_THAN_3_7
+  std::string data;
+  llvm::raw_string_ostream sos(data);
+  llvm::MCContext *mcc;
+  if (target && target->addPassesToEmitMC(PM, mcc, sos))
+    return 1;
+#else
+  SmallVector<char, 4096> data;
+  llvm::raw_svector_ostream sos(data);
+  if (target &&
+      target->addPassesToEmitFile(PM, sos, TargetMachine::CGFT_ObjectFile))
+    return 1;
+#endif
+
+  PM.run(*input);
+  std::string o = sos.str(); // flush
+  const char *cstr = o.c_str();
+  size_t s = o.size();
+  *output = (char *)malloc(s);
+  *output_size = s;
+  memcpy(*output, cstr, s);
+
+  return 0;
+}
+/* vim: set ts=4 expandtab: */
diff --git a/lib/CL/pocl_mem_management.c b/lib/CL/pocl_mem_management.c
index 051c2c5..c7712f0 100644
--- a/lib/CL/pocl_mem_management.c
+++ b/lib/CL/pocl_mem_management.c
@@ -26,6 +26,18 @@
 #include "utlist.h"
 #include <string.h>
 
+#ifndef USE_POCL_MEMMANAGER
+
+cl_event pocl_mem_manager_new_event ()
+{
+  cl_event ev = (cl_event) calloc (1, sizeof (struct _cl_event));
+  if (ev != NULL)
+    POCL_INIT_OBJECT(ev);
+  return ev;
+}
+
+#else
+
 typedef struct _mem_manager
 {
   pocl_lock_t event_lock;
@@ -69,13 +81,13 @@ cl_event pocl_mem_manager_new_event ()
     {
       LL_DELETE (mm->event_list, ev);
       POCL_UNLOCK (mm->event_lock);
+      POCL_INIT_OBJECT (ev); /* reinit the pocl_lock mutex */
       return ev;
     }
   POCL_UNLOCK (mm->event_lock);
 
   ev = (struct _cl_event*) calloc (1, sizeof (struct _cl_event));
   POCL_INIT_OBJECT(ev);
-  ev->pocl_refcount = 1;
   return ev;
 }
 
@@ -133,3 +145,5 @@ void pocl_mem_manager_free_event_node (event_node *ed)
   LL_PREPEND (mm->event_node_list, ed);
   POCL_UNLOCK (mm->event_node_lock);
 }
+
+#endif
diff --git a/lib/CL/pocl_mem_management.h b/lib/CL/pocl_mem_management.h
index c69e65b..dd13bb0 100644
--- a/lib/CL/pocl_mem_management.h
+++ b/lib/CL/pocl_mem_management.h
@@ -27,6 +27,8 @@
 #pragma GCC visibility push(hidden)
 #endif
 
+#ifdef USE_POCL_MEMMANAGER
+
 void pocl_init_mem_manager (void);
 
 cl_event pocl_mem_manager_new_event (void);
@@ -41,6 +43,27 @@ event_node* pocl_mem_manager_new_event_node ();
 
 void pocl_mem_manager_free_event_node (event_node *ed);
 
+#else
+
+#define pocl_init_mem_manager() NULL
+
+cl_event pocl_mem_manager_new_event ();
+
+#define pocl_mem_manager_free_event(event) POCL_MEM_FREE(event)
+
+#define pocl_mem_manager_new_command() \
+  (_cl_command_node*) calloc (1, sizeof (_cl_command_node))
+
+#define pocl_mem_manager_free_command(cmd) POCL_MEM_FREE(cmd)
+
+#define pocl_mem_manager_new_event_node() \
+  (event_node*) calloc (1, sizeof (event_node))
+
+#define pocl_mem_manager_free_event_node(en) POCL_MEM_FREE(en)
+
+
+#endif
+
 #ifdef __GNUC__
 #pragma GCC visibility pop
 #endif
diff --git a/lib/CL/pocl_opengl.c b/lib/CL/pocl_opengl.c
new file mode 100644
index 0000000..ec4394d
--- /dev/null
+++ b/lib/CL/pocl_opengl.c
@@ -0,0 +1,121 @@
+
+#include "pocl_cl.h"
+
+CL_API_ENTRY cl_mem CL_API_CALL
+POname(clCreateFromGLTexture)(cl_context      context,
+                        cl_mem_flags    flags,
+                        cl_GLenum       texture_target,
+                        cl_GLint        miplevel,
+                        cl_GLuint       texture,
+                        cl_int *        errcode_ret)
+CL_API_SUFFIX__VERSION_1_2
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clCreateFromGLTexture call");
+  return NULL;
+}
+POsym(clCreateFromGLTexture)
+
+
+
+CL_API_ENTRY cl_mem CL_API_CALL
+POname(clCreateFromGLBuffer)(cl_context      context,
+                        cl_mem_flags    flags,
+                        cl_GLuint       bufobj,
+                        cl_int *        errcode_ret)
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clCreateFromGLBuffer call");
+  return NULL;
+}
+POsym(clCreateFromGLBuffer)
+
+
+
+CL_API_ENTRY cl_mem CL_API_CALL
+POname(clCreateFromGLRenderbuffer)(cl_context      context,
+                        cl_mem_flags    flags,
+                        cl_GLuint       renderbuffer,
+                        cl_int *        errcode_ret)
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clCreateFromGLRenderbuffer call");
+  return NULL;
+}
+POsym(clCreateFromGLRenderbuffer)
+
+
+
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clGetGLObjectInfo)(cl_mem        memobj,
+                          cl_gl_object_type *gl_object_type,
+                          cl_GLuint       *gl_object_name)
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clGetGLObjectInfo call");
+  return CL_OUT_OF_RESOURCES;
+}
+POsym(clGetGLObjectInfo)
+
+
+
+
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clGetGLTextureInfo) (cl_mem        memobj,
+                            cl_gl_texture_info param_name,
+                            size_t  param_value_size,
+                            void  *param_value,
+                            size_t  *param_value_size_ret)
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clGetGLTextureInfo call");
+  return CL_OUT_OF_RESOURCES;
+}
+POsym(clGetGLTextureInfo)
+
+
+
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueAcquireGLObjects) ( cl_command_queue command_queue,
+                                    cl_uint num_objects,
+                                    const cl_mem *mem_objects,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event)
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clEnqueueAcquireGLObjects call");
+  return CL_OUT_OF_RESOURCES;
+}
+POsym(clEnqueueAcquireGLObjects)
+
+
+
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clEnqueueReleaseGLObjects) ( cl_command_queue command_queue,
+                                    cl_uint num_objects,
+                                    const cl_mem *mem_objects,
+                                    cl_uint num_events_in_wait_list,
+                                    const cl_event *event_wait_list,
+                                    cl_event *event)
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clEnqueueReleaseGLObjects call");
+  return CL_OUT_OF_RESOURCES;
+}
+POsym(clEnqueueReleaseGLObjects)
+
+
+
+CL_API_ENTRY cl_int CL_API_CALL
+POname(clGetGLContextInfoKHR) ( const cl_context_properties  *properties ,
+  cl_gl_context_info  param_name ,
+  size_t  param_value_size ,
+  void  *param_value ,
+  size_t  *param_value_size_ret )
+
+CL_API_SUFFIX__VERSION_1_0
+{
+  POCL_ABORT_UNIMPLEMENTED("The entire clGetGLContextInfoKHR call");
+  return CL_OUT_OF_RESOURCES;
+}
+POsym(clGetGLContextInfoKHR)
diff --git a/lib/CL/pocl_queue_util.c b/lib/CL/pocl_queue_util.c
deleted file mode 100644
index f6c0b63..0000000
--- a/lib/CL/pocl_queue_util.c
+++ /dev/null
@@ -1,140 +0,0 @@
-/* Command queue management functions
-
-   Copyright (c) 2015 Giuseppe Bilotta
-   
-   Permission is hereby granted, free of charge, to any person obtaining a copy
-   of this software and associated documentation files (the "Software"), to deal
-   in the Software without restriction, including without limitation the rights
-   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-   copies of the Software, and to permit persons to whom the Software is
-   furnished to do so, subject to the following conditions:
-   
-   The above copyright notice and this permission notice shall be included in
-   all copies or substantial portions of the Software.
-   
-   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-   THE SOFTWARE.
-*/
-
-/* We keep a global list of all 'live' command queues in order to be able
- * to force a clFinish on all of them before this is triggered by the destructors
- * at program end, which happen in unspecified order and might cause all sorts
- * of issues. This header defines the signatures of the available functions
- */
-
-#include <stdlib.h>
-#include <string.h>
-#include "pocl_debug.h"
-#include "pocl_queue_util.h"
-#include "common.h"
-
-static pocl_lock_t queue_lock = POCL_LOCK_INITIALIZER;
-static size_t queue_size = 0;
-static size_t queue_alloc = 0;
-static cl_command_queue *queue_list = NULL;
-
-#define QUEUE_ALLOC_SIZE 256
-
-int pocl_aborting;
-/*
-static void
-pocl_finish_all_queues()
-{
-  size_t i;
-  if (pocl_aborting)
-    return;
-  for (i = 0; i < queue_size; ++i) {
-    if (queue_list[i])
-      POname(clFinish)(queue_list[i]);
-  }
-  pocl_print_system_memory_stats();
-}
-*/
-
-void pocl_init_queue_list()
-{
-  POCL_INIT_LOCK(queue_lock);
-
-  POCL_LOCK(queue_lock);
-  // will probably never need a realloc, but still
-  queue_alloc = QUEUE_ALLOC_SIZE;
-
-  queue_list = calloc(queue_alloc, sizeof(cl_command_queue));
-
-  if (!queue_list)
-    POCL_ABORT("unable to allocate queue list!");
-
-  //atexit(pocl_finish_all_queues);
-
-  POCL_UNLOCK(queue_lock);
-
-}
-
-// walk the queue list, 
-static void
-pocl_compact_queue_list() {
-  size_t i; // walking index
-  size_t compact = 0; // number of non-NULL elements
-  for (i = 0; i < queue_size; ++i) {
-    if (queue_list[i])
-      compact++;
-    else {
-      // look for the first next non-NULL
-      while (i < queue_size && queue_list[i] == NULL)
-        ++i;
-      if (i == queue_size)
-        break; // no more entries
-      // move stuff over
-      memmove(queue_list + compact + 1, queue_list + i,
-        (queue_size - i + 1)*sizeof(*queue_list));
-      queue_size -= i - compact; // number of NULLs compacted
-      i = compact + 1;
-    }
-  }
-  queue_size = compact + 1;
-}
-
-void pocl_queue_list_insert(cl_command_queue q)
-{
-  POCL_LOCK(queue_lock);
-  if (queue_size == queue_alloc) {
-    // queue is full, try and compact it by removing the deleted queues
-    pocl_compact_queue_list();
-  }
-
-  if (queue_size == queue_alloc) {
-    // compaction failed to give us room
-    cl_command_queue *resized = realloc(queue_list, queue_alloc + 256);
-    if (!resized)
-      POCL_ABORT("failed to enlarge queue list!");
-    queue_list = resized;
-    queue_alloc += 256;
-  }
-
-  queue_list[queue_size++] = q;
-  POCL_UNLOCK(queue_lock);
-}
-
-void pocl_queue_list_delete(cl_command_queue q)
-{
-  POCL_LOCK(queue_lock);
-  size_t i;
-  for (i = 0; i < queue_size; ++i) {
-    if (queue_list[i] == q) {
-      queue_list[i] = NULL;
-      goto unlock;
-    }
-  }
-  // not found (?)
-  POCL_MSG_WARN("command queue %p not found during deletion\n", q);
-
-unlock:
-  POCL_UNLOCK(queue_lock);
-  return;
-}
-
diff --git a/lib/CL/pocl_runtime_config.h b/lib/CL/pocl_runtime_config.h
index 2fd5b48..3a7b2ca 100644
--- a/lib/CL/pocl_runtime_config.h
+++ b/lib/CL/pocl_runtime_config.h
@@ -37,4 +37,5 @@ const char* pocl_get_string_option(const char *key, const char *default_value);
 }
 #endif
 
+
 #endif
diff --git a/lib/CL/pocl_shared.h b/lib/CL/pocl_shared.h
index 95fefd4..a58280e 100644
--- a/lib/CL/pocl_shared.h
+++ b/lib/CL/pocl_shared.h
@@ -35,13 +35,6 @@ extern "C" {
 #endif
 
 
-/* Function for performing the actual mapping, used both from the
-   clFinish() and the blocking call. */
-void*
-pocl_map_mem_cmd(cl_device_id device,
-                 cl_mem buffer,
-                 mem_mapping_t *mapping_info);
-
 cl_int pocl_rect_copy(cl_command_queue command_queue,
                       cl_command_type command_type,
                       cl_mem src,
@@ -61,6 +54,28 @@ cl_int pocl_rect_copy(cl_command_queue command_queue,
 
 cl_int program_compile_dynamic_wg_binaries(cl_program program);
 
+cl_program create_program_skeleton (cl_context context, cl_uint num_devices,
+                                    const cl_device_id *device_list,
+                                    const size_t *lengths,
+                                    const unsigned char **binaries,
+                                    cl_int *binary_status, cl_int *errcode_ret,
+                                    int allow_empty_binaries);
+
+cl_int
+compile_and_link_program(int compile_program,
+                         int link_program,
+                         cl_program program,
+                         cl_uint num_devices,
+                         const cl_device_id *device_list,
+                         const char *options,
+                         cl_uint num_input_headers,
+                         const cl_program *input_headers,
+                         const char **header_include_names,
+                         cl_uint num_input_programs,
+                         const cl_program *input_programs,
+                         void (CL_CALLBACK *pfn_notify) (cl_program program,
+                                                         void *user_data),
+                         void *user_data);
 
 int context_set_properties(cl_context                    context,
                            const cl_context_properties * properties,
diff --git a/lib/CL/pocl_tracing.c b/lib/CL/pocl_tracing.c
index 13650db..128fb6f 100644
--- a/lib/CL/pocl_tracing.c
+++ b/lib/CL/pocl_tracing.c
@@ -29,13 +29,13 @@
 
 #ifdef LTTNG_UST_AVAILABLE
 #include "pocl_lttng.h"
+static const struct pocl_event_tracer lttng_tracer;
 #endif
 
 static int tracing_initialized = 0;
 static uint8_t event_trace_filter = 0xF;
 
 static const struct pocl_event_tracer text_logger;
-static const struct pocl_event_tracer lttng_tracer;
 
 /* List of tracers
  */
diff --git a/lib/CL/pocl_util.c b/lib/CL/pocl_util.c
index d45e89e..1c34292 100644
--- a/lib/CL/pocl_util.c
+++ b/lib/CL/pocl_util.c
@@ -26,14 +26,18 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include <sys/types.h>
-#include <sys/stat.h>
 #include <time.h>
 
 #ifndef _MSC_VER
-#  include <dirent.h>
-#  include <unistd.h>
-#  include <utime.h>
+#include <dirent.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
+#include <unistd.h>
+#include <utime.h>
 #else
 #  include "vccompat.hpp"
 #endif
@@ -46,6 +50,11 @@
 #include "devices.h"
 #include "pocl_runtime_config.h"
 
+/* required for setting SSE/AVX flush denorms to zero flag */
+#if defined(__x86_64__) && defined(__GNUC__)
+#include <x86intrin.h>
+#endif
+
 struct list_item;
 
 typedef struct list_item
@@ -54,6 +63,110 @@ typedef struct list_item
   struct list_item *next;
 } list_item;
 
+
+
+void
+pocl_restore_ftz (unsigned ftz)
+{
+#if defined(__x86_64__) && defined(__GNUC__)
+
+#ifdef _MM_FLUSH_ZERO_ON
+  if (ftz & _MM_FLUSH_ZERO_ON)
+    _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
+  else
+    _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_OFF);
+#endif
+#ifdef _MM_DENORMALS_ZERO_ON
+  if (ftz & _MM_DENORMALS_ZERO_ON)
+    _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON);
+  else
+    _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_OFF);
+#endif
+
+#endif
+}
+
+unsigned
+pocl_save_ftz ()
+{
+#if defined(__x86_64__) && defined(__GNUC__)
+
+  unsigned s = 0;
+#ifdef _MM_FLUSH_ZERO_ON
+  if (_MM_GET_FLUSH_ZERO_MODE ())
+    s |= _MM_FLUSH_ZERO_ON;
+  else
+    s &= (~_MM_FLUSH_ZERO_ON);
+#endif
+#ifdef _MM_DENORMALS_ZERO_ON
+  if (_MM_GET_DENORMALS_ZERO_MODE ())
+    s |= _MM_DENORMALS_ZERO_ON;
+  else
+    s &= (~_MM_DENORMALS_ZERO_ON);
+#endif
+  return s;
+
+#else
+  return 0;
+#endif
+}
+
+void
+pocl_set_ftz (unsigned ftz)
+{
+#if defined(__x86_64__) && defined(__GNUC__)
+  if (ftz)
+    {
+#ifdef _MM_FLUSH_ZERO_ON
+      _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_ON);
+#endif
+
+#ifdef _MM_DENORMALS_ZERO_ON
+      _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_ON);
+#endif
+    }
+  else
+    {
+#ifdef _MM_FLUSH_ZERO_OFF
+      _MM_SET_FLUSH_ZERO_MODE (_MM_FLUSH_ZERO_OFF);
+#endif
+
+#ifdef _MM_DENORMALS_ZERO_OFF
+      _MM_SET_DENORMALS_ZERO_MODE (_MM_DENORMALS_ZERO_OFF);
+#endif
+    }
+#endif
+}
+
+
+void
+pocl_set_default_rm ()
+{
+#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
+  unsigned rm = _MM_GET_ROUNDING_MODE ();
+  if (rm != _MM_ROUND_NEAREST)
+    _MM_SET_ROUNDING_MODE (_MM_ROUND_NEAREST);
+#endif
+}
+
+unsigned
+pocl_save_rm ()
+{
+#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
+  return _MM_GET_ROUNDING_MODE ();
+#else
+  return 0;
+#endif
+}
+
+void
+pocl_restore_rm (unsigned rm)
+{
+#if defined(__x86_64__) && defined(__GNUC__) && defined(_MM_ROUND_NEAREST)
+  _MM_SET_ROUNDING_MODE (rm);
+#endif
+}
+
 uint32_t
 byteswap_uint32_t (uint32_t word, char should_swap)
 {
@@ -179,6 +292,35 @@ pocl_aligned_free (void *ptr)
 }
 #endif
 
+void
+pocl_lock_events_inorder (cl_event ev1, cl_event ev2)
+{
+  if (ev1->id < ev2->id)
+    {
+      POCL_LOCK_OBJ (ev1);
+      POCL_LOCK_OBJ (ev2);
+    }
+  else
+    {
+      POCL_LOCK_OBJ (ev2);
+      POCL_LOCK_OBJ (ev1);
+    }
+}
+
+void
+pocl_unlock_events_inorder (cl_event ev1, cl_event ev2)
+{
+  if (ev1->id < ev2->id)
+    {
+      POCL_UNLOCK_OBJ (ev1);
+      POCL_UNLOCK_OBJ (ev2);
+    }
+  else
+    {
+      POCL_UNLOCK_OBJ (ev2);
+      POCL_UNLOCK_OBJ (ev1);
+    }
+}
 
 cl_int pocl_create_event (cl_event *event, cl_command_queue command_queue, 
                           cl_command_type command_type, int num_buffers,
@@ -186,12 +328,14 @@ cl_int pocl_create_event (cl_event *event, cl_command_queue command_queue,
 {
   static unsigned int event_id_counter = 0;
 
+  POCL_MSG_PRINT_EVENTS ("creating event\n");
+
   if (context == NULL || !(context->valid))
     return CL_INVALID_CONTEXT;
   if (event != NULL)
     {
       *event = pocl_mem_manager_new_event ();
-      if (event == NULL)
+      if (*event == NULL)
         return CL_OUT_OF_HOST_MEMORY;
 
       (*event)->context = context;
@@ -203,23 +347,14 @@ cl_int pocl_create_event (cl_event *event, cl_command_queue command_queue,
         POname(clRetainCommandQueue) (command_queue);
 
       (*event)->command_type = command_type;
-      (*event)->command =  NULL;
-      (*event)->callback_list = NULL;
       (*event)->id = event_id_counter++;
-      (*event)->notify_list = NULL;
-      (*event)->data = NULL;
       (*event)->num_buffers = num_buffers;
       if (num_buffers > 0)
         {
           (*event)->mem_objs = malloc (num_buffers * sizeof(cl_mem));
           memcpy ((*event)->mem_objs, buffers, num_buffers * sizeof(cl_mem));
         }
-      else
-        (*event)->mem_objs = NULL;
       (*event)->status = CL_QUEUED;
-      (*event)->implicit_event = 0;
-      (*event)->next = NULL;
-      (*event)->prev = NULL;
 
       /* user events do not have cq */
       if (!command_queue)
@@ -235,6 +370,10 @@ pocl_create_event_sync(cl_event waiting_event,
 {
   event_node * volatile notify_target = NULL;
   event_node * volatile wait_list_item = NULL;
+
+  if (notifier_event == NULL)
+    return CL_SUCCESS;
+
   assert(notifier_event->pocl_refcount != 0);
   POCL_MSG_PRINT_INFO("create event sync: waiting %d, notifier %d\n", waiting_event->id, notifier_event->id);
   if (waiting_event == notifier_event)
@@ -243,19 +382,17 @@ pocl_create_event_sync(cl_event waiting_event,
              notifier_event->id);
       assert(waiting_event != notifier_event);
     }
+
+  pocl_lock_events_inorder (waiting_event, notifier_event);
+
   LL_FOREACH (waiting_event->wait_list, wait_list_item)
     {
       if (wait_list_item->event == notifier_event)
-        return CL_SUCCESS;
-    }
-  POCL_LOCK_OBJ (waiting_event);
-
-  if (notifier_event == NULL || notifier_event->status == CL_COMPLETE)
-    {
-      POCL_UNLOCK_OBJ (waiting_event);
-      return CL_SUCCESS;
+        goto FINISH;
     }
 
+  if (notifier_event->status == CL_COMPLETE)
+    goto FINISH;
   notify_target = pocl_mem_manager_new_event_node();
   wait_list_item = pocl_mem_manager_new_event_node();
   if (!notify_target || !wait_list_item)
@@ -265,8 +402,9 @@ pocl_create_event_sync(cl_event waiting_event,
   wait_list_item->event = notifier_event;
   LL_PREPEND (notifier_event->notify_list, notify_target);
   LL_PREPEND (waiting_event->wait_list, wait_list_item);
-  POCL_UNLOCK_OBJ (waiting_event);
 
+FINISH:
+  pocl_unlock_events_inorder (waiting_event, notifier_event);
   return CL_SUCCESS;
 }
 
@@ -315,6 +453,7 @@ cl_int pocl_create_command (_cl_command_node **cmd,
      one reference for the host and one for the runtime/driver */
   if (event_p)
     {
+      POCL_MSG_PRINT_EVENTS ("event pointer provided\n");
       *event_p = *event;
       (*event)->implicit_event = 0;
       (*event)->pocl_refcount = 2;
@@ -325,11 +464,8 @@ cl_int pocl_create_command (_cl_command_node **cmd,
       (*event)->pocl_refcount = 1;
     }
 
-  (*cmd)->next = NULL;
-  (*cmd)->prev = NULL;
   (*cmd)->device = command_queue->device;
   (*cmd)->event->command = (*cmd);
-  (*cmd)->ready = 0;
 
   /* in case of in-order queue, synchronize to previously enqueued command
      if available */
@@ -338,11 +474,9 @@ cl_int pocl_create_command (_cl_command_node **cmd,
       POCL_LOCK_OBJ (command_queue);
       if (command_queue->last_event.event)
         {
-          POCL_LOCK_OBJ (command_queue->last_event.event);
           pocl_create_event_sync ((*cmd)->event,
                                   command_queue->last_event.event,
                                   NULL);
-          POCL_UNLOCK_OBJ (command_queue->last_event.event);
         }
       POCL_UNLOCK_OBJ (command_queue);
     }
@@ -350,12 +484,10 @@ cl_int pocl_create_command (_cl_command_node **cmd,
   for (i = 0; i < num_events; ++i)
     {
       cl_event wle = wait_list[i];
-      POCL_LOCK_OBJ (wle);
       pocl_create_event_sync ((*cmd)->event, wle, NULL);
-      POCL_UNLOCK_OBJ (wle);
     }
-  POCL_MSG_PRINT_INFO("Created command struct (event %d, type %X)\n", 
-                      (*cmd)->event->id, command_type);
+  POCL_MSG_PRINT_EVENTS ("Created command struct (event %d, type %X)\n",
+                         (*cmd)->event->id, command_type);
   return CL_SUCCESS;
 }
 
@@ -377,9 +509,7 @@ void pocl_command_enqueue (cl_command_queue command_queue,
     {
       DL_FOREACH (command_queue->events, event)
         {
-          POCL_LOCK_OBJ(event);
           pocl_create_event_sync (node->event, event, NULL);
-          POCL_UNLOCK_OBJ (event);
         }
     }
   if (node->type == CL_COMMAND_BARRIER)
@@ -388,9 +518,7 @@ void pocl_command_enqueue (cl_command_queue command_queue,
     {
       if (command_queue->barrier)
         {
-          POCL_LOCK_OBJ(command_queue->barrier);
           pocl_create_event_sync (node->event, command_queue->barrier, NULL);
-          POCL_UNLOCK_OBJ (command_queue->barrier);
         }
     }
   DL_APPEND (command_queue->events, node->event);
@@ -398,7 +526,9 @@ void pocl_command_enqueue (cl_command_queue command_queue,
   command_queue->last_event.event_id = node->event->id;
   POCL_UNLOCK_OBJ (command_queue);
 
-  POCL_UPDATE_EVENT_QUEUED (&node->event);
+  POCL_LOCK_OBJ (node->event);
+  POCL_UPDATE_EVENT_QUEUED (node->event);
+  POCL_UNLOCK_OBJ (node->event);
 
   command_queue->device->ops->submit(node, command_queue);
 #ifdef POCL_DEBUG_BUILD
@@ -423,16 +553,15 @@ pocl_command_push (_cl_command_node *node,
       CDL_PREPEND ((*pending_list), node);
       return;
     }
-  POCL_LOCK_OBJ (node->event);
   if (pocl_command_is_ready(node->event))
     {
+      POCL_UPDATE_EVENT_SUBMITTED (node->event);
       CDL_PREPEND ((*ready_list), node);
     }
   else
     {
       CDL_PREPEND ((*pending_list), node);
     }
-  POCL_UNLOCK_OBJ (node->event);
 }
 
 int pocl_update_command_queue (cl_event event)
@@ -455,6 +584,42 @@ int pocl_update_command_queue (cl_event event)
   return cq_ready;
 }
 
+void
+pocl_cl_mem_inherit_flags (cl_mem mem, cl_mem from_buffer, cl_mem_flags flags)
+{
+  if ((flags & CL_MEM_READ_WRITE) | (flags & CL_MEM_READ_ONLY)
+      | (flags & CL_MEM_WRITE_ONLY))
+    {
+      mem->flags = (flags & CL_MEM_READ_WRITE) | (flags & CL_MEM_READ_ONLY)
+                   | (flags & CL_MEM_WRITE_ONLY);
+    }
+  else
+    {
+      mem->flags = (from_buffer->flags & CL_MEM_READ_WRITE)
+                   | (from_buffer->flags & CL_MEM_READ_ONLY)
+                   | (from_buffer->flags & CL_MEM_WRITE_ONLY);
+    }
+
+  if ((flags & CL_MEM_HOST_NO_ACCESS) | (flags & CL_MEM_HOST_READ_ONLY)
+      | (flags & CL_MEM_HOST_WRITE_ONLY))
+    {
+      mem->flags = mem->flags | ((flags & CL_MEM_HOST_NO_ACCESS)
+                                 | (flags & CL_MEM_HOST_READ_ONLY)
+                                 | (flags & CL_MEM_HOST_WRITE_ONLY));
+    }
+  else
+    {
+      mem->flags
+          = mem->flags | ((from_buffer->flags & CL_MEM_HOST_NO_ACCESS)
+                          | (from_buffer->flags & CL_MEM_HOST_READ_ONLY)
+                          | (from_buffer->flags & CL_MEM_HOST_WRITE_ONLY));
+    }
+
+  mem->flags = mem->flags | (from_buffer->flags & CL_MEM_USE_HOST_PTR)
+               | (from_buffer->flags & CL_MEM_ALLOC_HOST_PTR)
+               | (from_buffer->flags & CL_MEM_COPY_HOST_PTR);
+}
+
 cl_int pocl_update_mem_obj_sync (cl_command_queue cq, _cl_command_node *cmd, 
                                  cl_mem mem, char operation)
 {
@@ -476,12 +641,14 @@ cl_int pocl_update_mem_obj_sync (cl_command_queue cq, _cl_command_node *cmd,
 }
 
 int pocl_buffer_boundcheck(cl_mem buffer, size_t offset, size_t size) {
-  POCL_RETURN_ERROR_ON((offset > buffer->size), CL_INVALID_VALUE,
-            "offset(%zu) > buffer->size(%zu)", offset, buffer->size);
-  POCL_RETURN_ERROR_ON((size > buffer->size), CL_INVALID_VALUE,
-            "size(%zu) > buffer->size(%zu)", size, buffer->size);
-  POCL_RETURN_ERROR_ON((offset + size > buffer->size), CL_INVALID_VALUE,
-            "offset + size (%zu) > buffer->size(%zu)", (offset+size), buffer->size);
+  POCL_RETURN_ERROR_ON ((offset > buffer->size), CL_INVALID_VALUE,
+                        "offset(%zu) > buffer->size(%zu)\n", offset,
+                        buffer->size);
+  POCL_RETURN_ERROR_ON ((size > buffer->size), CL_INVALID_VALUE,
+                        "size(%zu) > buffer->size(%zu)\n", size, buffer->size);
+  POCL_RETURN_ERROR_ON ((offset + size > buffer->size), CL_INVALID_VALUE,
+                        "offset + size (%zu) > buffer->size(%zu)\n",
+                        (offset + size), buffer->size);
   return CL_SUCCESS;
 }
 
@@ -575,16 +742,16 @@ int pocl_buffers_overlap(cl_mem src_buffer,
   /* sub buffers overlap check  */
   if (src_buffer->parent && dst_buffer->parent &&
         (src_buffer->parent == dst_buffer->parent)) {
-      src_offset = (char*)src_buffer->mem_host_ptr - (char*)src_buffer->parent->mem_host_ptr +
-        src_offset;
-      dst_offset = (char*)dst_buffer->mem_host_ptr - (char*)dst_buffer->parent->mem_host_ptr +
-        dst_offset;
+      src_offset = src_buffer->origin + src_offset;
+      dst_offset = dst_buffer->origin + dst_offset;
 
-    POCL_RETURN_ERROR_ON(((src_offset <= dst_offset) && (dst_offset <=
-      (src_offset + size - 1))), CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
+      POCL_RETURN_ERROR_ON (((src_offset <= dst_offset)
+                             && (dst_offset <= (src_offset + size - 1))),
+                            CL_MEM_COPY_OVERLAP, "dst_offset lies inside \
       the src region and src_buffer + dst_buffer are subbuffers of the same buffer");
-    POCL_RETURN_ERROR_ON(((dst_offset <= src_offset) && (src_offset <=
-      (dst_offset + size - 1))), CL_MEM_COPY_OVERLAP, "src_offset lies inside \
+      POCL_RETURN_ERROR_ON (((dst_offset <= src_offset)
+                             && (src_offset <= (dst_offset + size - 1))),
+                            CL_MEM_COPY_OVERLAP, "src_offset lies inside \
       the dst region and src_buffer + dst_buffer are subbuffers of the same buffer");
 
   }
@@ -682,6 +849,15 @@ check_copy_overlap(const size_t src_offset[3],
   return overlap;
 }
 
+/* For a subdevice parameter, return the actual device it belongs to. */
+cl_device_id
+pocl_real_dev (const cl_device_id dev)
+{
+  cl_device_id ret = dev;
+  while (ret->parent_device)
+    ret = ret->parent_device;
+  return ret;
+}
 
 /* Make a list of unique devices. If any device is a subdevice,
  * replace with parent, then remove duplicate parents. */
@@ -694,7 +870,7 @@ cl_device_id * pocl_unique_device_list(const cl_device_id * in, cl_uint num, cl_
 
   unsigned i;
   for (i=0; i < num; ++i)
-    out[i] = (in[i] ? POCL_REAL_DEV(in[i]) : NULL);
+    out[i] = (in[i] ? pocl_real_dev (in[i]) : NULL);
 
   i=1;
   unsigned device_i=0;
@@ -741,6 +917,35 @@ void pocl_setup_context(cl_context context)
         }
 }
 
+int
+pocl_check_event_wait_list (cl_command_queue command_queue,
+                            cl_uint num_events_in_wait_list,
+                            const cl_event *event_wait_list)
+{
+  POCL_RETURN_ERROR_COND (
+      (event_wait_list == NULL && num_events_in_wait_list > 0),
+      CL_INVALID_EVENT_WAIT_LIST);
+
+  POCL_RETURN_ERROR_COND (
+      (event_wait_list != NULL && num_events_in_wait_list == 0),
+      CL_INVALID_EVENT_WAIT_LIST);
+
+  if (event_wait_list)
+    {
+      unsigned i;
+      for (i = 0; i < num_events_in_wait_list; i++)
+        {
+          POCL_RETURN_ERROR_COND ((event_wait_list[i] == NULL),
+                                  CL_INVALID_EVENT_WAIT_LIST);
+          POCL_RETURN_ERROR_COND (
+              (event_wait_list[i]->context != command_queue->context),
+              CL_INVALID_CONTEXT);
+        }
+    }
+
+  return CL_SUCCESS;
+}
+
 const char*
 pocl_status_to_str (int status)
 {
@@ -824,3 +1029,130 @@ pocl_command_to_str (cl_command_type cmd)
 
   return "unknown";
 }
+
+/*
+ * This replaces a simple system(), because:
+ *
+ * 1) system() was causing issues (gpu lockups) with HSA when
+ * compiling code (via compile_parallel_bc_to_brig)
+ * with OpenCL 2.0 atomics (like CalcPie from AMD SDK).
+ * The reason of lockups is unknown (yet).
+ *
+ * 2) system() uses fork() which copies page table maps, and runs
+ * out of AS when pocl has already allocated huge buffers in memory.
+ * this happened in llvm_codegen()
+ *
+ * vfork() does not copy pagetables.
+ */
+int
+pocl_run_command (char *const *args)
+{
+  POCL_MSG_PRINT_INFO ("Launching: %s\n", args[0]);
+#ifdef HAVE_VFORK
+  pid_t p = vfork ();
+#elif defined(HAVE_FORK)
+  pid_t p = fork ();
+#else
+#error Must have fork() or vfork() system calls for HSA
+#endif
+  if (p == 0)
+    {
+      return execv (args[0], args);
+    }
+  else
+    {
+      if (p < 0)
+        return EXIT_FAILURE;
+      int status;
+      if (waitpid (p, &status, 0) < 0)
+        POCL_ABORT ("pocl: waitpid() failed.\n");
+      if (WIFEXITED (status))
+        return WEXITSTATUS (status);
+      else if (WIFSIGNALED (status))
+        return WTERMSIG (status);
+      else
+        return EXIT_FAILURE;
+    }
+}
+
+/*
+ * float 2 half / half 2 float
+ */
+
+static int const shift = 13;
+static int const shiftSign = 16;
+
+static int32_t const infN = 0x7F800000;  /* flt32 infinity */
+static int32_t const maxN = 0x477FE000;  /* max flt16 normal as a flt32 */
+static int32_t const minN = 0x38800000;  /* min flt16 normal as a flt32 */
+static int32_t const signN = 0x80000000; /* flt32 sign bit */
+
+/* static int32_t const infC = infN >> shift;
+ * static int32_t const infC = 0x3FC00;
+ * static int32_t const nanN = (infC + 1) << shift; // minimum flt16 nan as a flt32
+ */
+static int32_t const nanN = 0x7f802000;
+/* static int32_t const maxC = maxN >> shift; */
+static int32_t const maxC = 0x23bff;
+/* static int32_t const minC = minN >> shift;
+ * static int32_t const minC = 0x1c400;
+ * static int32_t const signC = signN >> shiftSign; // flt16 sign bit
+ */
+static int32_t const signC = 0x40000; /* flt16 sign bit */
+
+static int32_t const mulN = 0x52000000; /* (1 << 23) / minN */
+static int32_t const mulC = 0x33800000; /* minN / (1 << (23 - shift)) */
+
+static int32_t const subC = 0x003FF; /* max flt32 subnormal down shifted */
+static int32_t const norC = 0x00400; /* min flt32 normal down shifted */
+
+/* static int32_t const maxD = infC - maxC - 1; */
+static int32_t const maxD = 0x1c000;
+/* static int32_t const minD = minC - subC - 1; */
+static int32_t const minD = 0x1c000;
+
+typedef union
+{
+  float f;
+  int32_t si;
+  uint32_t ui;
+} H2F_Bits;
+
+float
+half_to_float (uint16_t value)
+{
+  H2F_Bits v;
+  v.ui = value;
+  int32_t sign = v.si & signC;
+  v.si ^= sign;
+  sign <<= shiftSign;
+  v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+  v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+  H2F_Bits s;
+  s.si = mulC;
+  s.f *= v.si;
+  int32_t mask = -(norC > v.si);
+  v.si <<= shift;
+  v.si ^= (s.si ^ v.si) & mask;
+  v.si |= sign;
+  return v.f;
+}
+
+uint16_t
+float_to_half (float value)
+{
+  H2F_Bits v, s;
+  v.f = value;
+  uint32_t sign = v.si & signN;
+  v.si ^= sign;
+  sign >>= shiftSign;
+  s.si = mulN;
+  s.si = s.f * v.f;
+  v.si ^= (s.si ^ v.si) & -(minN > v.si);
+  v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+  v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+  v.ui >>= shift;
+  v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+  v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+  return v.ui | sign;
+}
diff --git a/lib/CL/pocl_util.h b/lib/CL/pocl_util.h
index 50477fe..9e7cd36 100644
--- a/lib/CL/pocl_util.h
+++ b/lib/CL/pocl_util.h
@@ -40,6 +40,20 @@ extern "C" {
 uint32_t byteswap_uint32_t (uint32_t word, char should_swap);
 float byteswap_float (float word, char should_swap);
 
+/* set rounding mode */
+void pocl_restore_rm (unsigned rm);
+/* get current rounding mode */
+unsigned pocl_save_rm ();
+/* set OpenCL's default (round to nearest) rounding mode */
+void pocl_set_default_rm ();
+
+/* sets the flush-denorms-to-zero flag on the CPU, if supported */
+void pocl_set_ftz (unsigned ftz);
+
+/* saves / restores cpu flags*/
+unsigned pocl_save_ftz ();
+void pocl_restore_ftz (unsigned ftz);
+
 /* Finds the next highest power of two of the given value. */
 size_t pocl_size_ceil2(size_t x);
 
@@ -59,6 +73,12 @@ void *pocl_aligned_malloc(size_t alignment, size_t size);
 void pocl_aligned_free(void* ptr);
 #endif
 
+/* locks / unlocks two events in order of their event-id.
+ * This avoids any potential deadlocks of threads should
+ * they try to lock events in opposite order. */
+void pocl_lock_events_inorder (cl_event ev1, cl_event ev2);
+void pocl_unlock_events_inorder (cl_event ev1, cl_event ev2);
+
 /* Function for creating events */
 cl_int pocl_create_event (cl_event *event, cl_command_queue command_queue,
                           cl_command_type command_type, int num_buffers,
@@ -116,6 +136,9 @@ pocl_command_is_ready(cl_event event)
 int
 pocl_update_command_queue (cl_event event);
 
+void pocl_cl_mem_inherit_flags (cl_mem mem, cl_mem from_buffer,
+                                cl_mem_flags flags);
+
 cl_int 
 pocl_update_mem_obj_sync (cl_command_queue cq, _cl_command_node *cmd, 
                           cl_mem mem, char operation);
@@ -124,29 +147,39 @@ void pocl_setup_context(cl_context context);
 
 /* Helpers for dealing with devices / subdevices */
 
-#define POCL_REAL_DEV(dev) (dev->parent_device ? dev->parent_device : dev)
-
+cl_device_id pocl_real_dev (const cl_device_id);
 cl_device_id * pocl_unique_device_list(const cl_device_id * in, cl_uint num, cl_uint *real);
 
-#define POCL_CHECK_DEV_IN_CMDQ                                               \
-  do                                                                         \
-    {                                                                        \
-      device = command_queue->device;                                        \
-      for (i = 0; i < command_queue->context->num_devices; ++i)              \
-        {                                                                    \
-          if (command_queue->context->devices[i] == POCL_REAL_DEV(device))   \
-            break;                                                           \
-        }                                                                    \
-      assert(i < command_queue->context->num_devices);                       \
-    }                                                                        \
+#define POCL_CHECK_DEV_IN_CMDQ                                                \
+  do                                                                          \
+    {                                                                         \
+      device = pocl_real_dev (command_queue->device);                         \
+      for (i = 0; i < command_queue->context->num_devices; ++i)               \
+        {                                                                     \
+          if (command_queue->context->devices[i] == device)                   \
+            break;                                                            \
+        }                                                                     \
+      assert (i < command_queue->context->num_devices);                       \
+    }                                                                         \
   while (0)
 
+int pocl_check_event_wait_list(cl_command_queue     command_queue,
+                               cl_uint              num_events_in_wait_list,
+                               const cl_event *     event_wait_list);
+
 const char*
 pocl_status_to_str (int status);
 
 const char *
 pocl_command_to_str (cl_command_type cmd);
 
+int
+pocl_run_command(char * const *args);
+
+uint16_t float_to_half (float value);
+
+float half_to_float (uint16_t value);
+
 #ifdef __cplusplus
 }
 #endif
@@ -188,6 +221,26 @@ pocl_command_to_str (cl_command_type cmd);
     }                                                                   \
   while (0)
 
+#define POCL_RETURN_GETINFO_STR_FREE(__STR__)                                 \
+  do                                                                          \
+    {                                                                         \
+      size_t const value_size = strlen (__STR__) + 1;                         \
+      if (param_value)                                                        \
+        {                                                                     \
+          if (param_value_size >= value_size)                                 \
+            memcpy (param_value, __STR__, value_size);                        \
+          POCL_MEM_FREE (__STR__);                                            \
+          if (param_value_size < value_size)                                  \
+            return CL_INVALID_VALUE;                                          \
+        }                                                                     \
+      else                                                                    \
+        POCL_MEM_FREE (__STR__);                                              \
+      if (param_value_size_ret)                                               \
+        *param_value_size_ret = value_size;                                   \
+      return CL_SUCCESS;                                                      \
+    }                                                                         \
+  while (0)
+
 #define POCL_RETURN_GETINFO(__TYPE__, __VALUE__)                        \
   do                                                                    \
     {                                                                   \
@@ -206,4 +259,9 @@ pocl_command_to_str (cl_command_type cmd);
     }                                                                   \
   while (0)
 
+#define HANDLE_IMAGE1D_BUFFER(mem)                                            \
+  mem = ((mem->is_image && (mem->type == CL_MEM_OBJECT_IMAGE1D_BUFFER))       \
+             ? mem->buffer                                                    \
+             : mem);
+
 #endif
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 5a90ad5..5769009 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -31,6 +31,33 @@ if (OCS_AVAILABLE)
   add_subdirectory("llvmopencl")
 endif()
 
+set(SANITIZER_OPTIONS "")
+
+if(ENABLE_ASAN)
+  list(APPEND SANITIZER_OPTIONS "-fsanitize=address")
+  list(APPEND SANITIZER_LIBS "asan")
+endif()
+
+if(ENABLE_LSAN)
+  list(APPEND SANITIZER_OPTIONS "-fsanitize=leak")
+  list(APPEND SANITIZER_LIBS "lsan")
+endif()
+
+if(ENABLE_TSAN)
+  list(APPEND SANITIZER_OPTIONS "-fsanitize=thread")
+  list(APPEND SANITIZER_LIBS "tsan")
+endif()
+
+if(ENABLE_UBSAN)
+  list(APPEND SANITIZER_OPTIONS "-fsanitize=undefined")
+  list(APPEND SANITIZER_LIBS "ubsan")
+endif()
+
+if(SANITIZER_OPTIONS)
+  list(APPEND SANITIZER_OPTIONS "-fno-omit-frame-pointer")
+  string(REPLACE ";" " " SANITIZER_OPTIONS_STR "${SANITIZER_OPTIONS}")
+endif()
+
 add_subdirectory("CL")
 
 #############################################################
@@ -41,7 +68,7 @@ add_subdirectory("CL")
 if(ENABLE_ICD)
 
   # -pthread makes gdb happier when debugging tests
-  set(OPENCL_LIBS "${PTHREAD_LDFLAGS};${OPENCL_LIBRARIES}")
+  set(OPENCL_LIBS "${PTHREAD_LIBRARY};${OPENCL_LIBRARIES}")
 
 else()
 
@@ -50,14 +77,28 @@ else()
   # executables (e.g. poclcc -> libOpenCL -> libLLVM)
   # and this must be specified on new systems,
   # otherwise linking fails.
-  set(OPENCL_LIBS "${PTHREAD_LDFLAGS};${POCL_LIBRARY_NAME};${POCL_TRANSITIVE_LIBS}")
+  set(OPENCL_LIBS "${PTHREAD_LIBRARY};${POCL_LIBRARY_NAME};${POCL_TRANSITIVE_LIBS}")
+
+endif()
+
+set(OPENCL_CFLAGS "")
 
+if(SANITIZER_OPTIONS)
+  list(INSERT OPENCL_LIBS 0 ${SANITIZER_LIBS})
+  set(OPENCL_CFLAGS "${OPENCL_CFLAGS} ${SANITIZER_OPTIONS_STR}")
 endif()
 
-set(OPENCL_CFLAGS "${PTHREAD_CFLAGS}")
+set(OPENCL_CFLAGS_STR "${OPENCL_CFLAGS}")
+separate_arguments(OPENCL_CFLAGS)
+
 
 set(OPENCL_LIBS "${OPENCL_LIBS}" PARENT_SCOPE)
 set(OPENCL_CFLAGS "${OPENCL_CFLAGS}" PARENT_SCOPE)
+set(OPENCL_CFLAGS_STR "${OPENCL_CFLAGS_STR}" PARENT_SCOPE)
+
+set(SANITIZER_LIBS "${SANITIZER_LIBS}" PARENT_SCOPE)
+set(SANITIZER_OPTIONS "${SANITIZER_OPTIONS}" PARENT_SCOPE)
+set(SANITIZER_OPTIONS_STR "${SANITIZER_OPTIONS_STR}" PARENT_SCOPE)
 
 add_subdirectory("poclu")
 
diff --git a/lib/kernel/CMakeLists.txt b/lib/kernel/CMakeLists.txt
index 86810c6..5492797 100644
--- a/lib/kernel/CMakeLists.txt
+++ b/lib/kernel/CMakeLists.txt
@@ -23,7 +23,6 @@
 #
 #=============================================================================
 
-# sources.mk
 set(SOURCES_WITHOUT_VML abs.cl
 abs_diff.cl
 acos.cl
@@ -37,6 +36,7 @@ asin.cl
 asinh.cl
 asinpi.cl
 async_work_group_copy.cl
+async_work_group_strided_copy.cl
 atan.cl
 atan2.cl
 atan2pi.cl
@@ -81,6 +81,9 @@ get_global_id.c
 get_global_offset.c
 get_global_size.c
 get_group_id.c
+get_image_array_size.cl
+get_image_channel_data_type.cl
+get_image_channel_order.cl
 get_image_depth.cl
 get_image_height.cl
 get_image_width.cl
@@ -181,10 +184,10 @@ vload.cl
 vload_half.cl
 vstore.cl
 vstore_half.cl
+vload_store_half_f16c.c
 wait_group_events.cl
 write_image.cl)
 
-# from sources-vml.mk
 set(SOURCES_WITH_VML abs.cl
 abs_diff.cl
 add_sat.cl
@@ -192,6 +195,7 @@ all.cl
 any.cl
 as_type.cl
 async_work_group_copy.cl
+async_work_group_strided_copy.cl
 atomics.cl
 barrier.ll
 bitselect.cl
@@ -204,6 +208,9 @@ get_global_id.c
 get_global_offset.c
 get_global_size.c
 get_group_id.c
+get_image_array_size.cl
+get_image_channel_data_type.cl
+get_image_channel_order.cl
 get_image_depth.cl
 get_image_height.cl
 get_image_width.cl
@@ -225,6 +232,7 @@ mul24.cl
 mul_hi.cl
 nextafter.cl
 popcount.cl
+prefetch.cl
 printf.c
 read_image.cl
 rhadd.cl
@@ -239,6 +247,7 @@ vload.cl
 vload_half.cl
 vstore.cl
 vstore_half.cl
+vload_store_half_f16c.c
 wait_group_events.cl
 write_image.cl
 vecmathlib-pocl/acos.cc
@@ -362,8 +371,243 @@ vecmathlib-pocl/tanh.cc
 vecmathlib-pocl/tanpi.cl
 vecmathlib-pocl/trunc.cc)
 
-#LKERNEL_HDRS_EXTRA - headers that should be dependencies
-set(KERNEL_DEPEND_HEADERS
+
+set(SOURCES_WITH_SLEEF abs.cl
+abs_diff.cl
+add_sat.cl
+all.cl
+any.cl
+as_type.cl
+async_work_group_copy.cl
+async_work_group_strided_copy.cl
+atomics.cl
+barrier.ll
+bitselect.cl
+clamp.cl
+clamp_int.cl
+clz.cl
+convert_type.cl
+cross.cl
+distance.cl
+dot.cl
+fast_distance.cl
+fast_length.cl
+fast_normalize.cl
+fract.cl
+get_global_id.c
+get_global_offset.c
+get_global_size.c
+get_group_id.c
+get_image_array_size.cl
+get_image_channel_data_type.cl
+get_image_channel_order.cl
+get_image_depth.cl
+get_image_dim.cl
+get_image_height.cl
+get_image_width.cl
+get_local_id.c
+get_local_size.c
+get_num_groups.c
+get_work_dim.c
+hadd.cl
+half_cos.cl
+half_divide.cl
+half_exp10.cl
+half_exp2.cl
+half_exp.cl
+half_log10.cl
+half_log2.cl
+half_log.cl
+half_powr.cl
+half_recip.cl
+half_rsqrt.cl
+half_sin.cl
+half_sqrt.cl
+half_tan.cl
+isequal.cl
+isgreater.cl
+isgreaterequal.cl
+isless.cl
+islessequal.cl
+islessgreater.cl
+isnotequal.cl
+isordered.cl
+isunordered.cl
+mad24.cl
+mad.cl
+mad_hi.cl
+mad_sat.cl
+max.cl
+max_i.cl
+maxmag.cl
+min.cl
+min_i.cl
+minmag.cl
+mix.cl
+mul24.cl
+mul_hi.cl
+nan.cl
+native_divide.cl
+native_exp10.cl
+native_exp2.cl
+native_exp.cl
+native_log10.cl
+native_log2.cl
+native_log.cl
+native_powr.cl
+native_recip.cl
+native_rsqrt.cl
+native_sqrt.cl
+popcount.cl
+prefetch.cl
+printf.c
+read_image.cl
+rhadd.cl
+rotate.cl
+rsqrt.cl
+select.cl
+shuffle.cl
+signbit.cl
+sign.cl
+smoothstep.cl
+step.cl
+sub_sat.cl
+upsample.cl
+vload.cl
+vload_half.cl
+vload_store_half_f16c.c
+vstore.cl
+vstore_half.cl
+wait_group_events.cl
+write_image.cl
+
+###################################################################
+
+# from libclc
+
+libclc-pocl/pocl_fma.cl
+libclc-pocl/acospi.cl
+libclc-pocl/asinpi.cl
+libclc-pocl/atan2pi.cl
+libclc-pocl/atanpi.cl
+libclc-pocl/sinpi.cl
+libclc-pocl/cospi.cl
+libclc-pocl/tanpi.cl
+libclc-pocl/cos.cl
+libclc-pocl/cosh.cl
+libclc-pocl/sin.cl
+libclc-pocl/sinh.cl
+libclc-pocl/tan.cl
+libclc-pocl/tanh.cl
+libclc-pocl/sincos.cl
+libclc-pocl/sincos_helpers.cl
+libclc-pocl/acosh.cl
+libclc-pocl/asinh.cl
+libclc-pocl/atanh.cl
+libclc-pocl/ep_log.cl
+libclc-pocl/radians.cl
+libclc-pocl/degrees.cl
+libclc-pocl/log2.cl
+libclc-pocl/logb.cl
+# currently unused
+#libclc/log1p.cl
+#libclc-pocl/frexp.cl
+#libclc-pocl/expfrexp.cl
+#libclc-pocl/frfrexp.cl
+libclc-pocl/pown.cl
+libclc-pocl/powr.cl
+libclc-pocl/pow.cl
+libclc-pocl/rootn.cl
+libclc-pocl/pow_helpers.cl
+libclc-pocl/fmod.cl
+libclc-pocl/remainder.cl
+libclc-pocl/remquo.cl
+libclc-pocl/ocml_helpers.cl
+
+libclc-pocl/isinf.cl
+libclc-pocl/isnan.cl
+libclc-pocl/isfinite.cl
+libclc-pocl/isnormal.cl
+
+
+libclc/vtables_fp32.cl
+libclc/vtables_fp64.cl
+libclc/normalize.cl
+libclc/length.cl
+
+###################################################################
+
+sleef/libm/sleef_glue.cl
+
+sleef-pocl/scalars.cl
+sleef-pocl/acos.cl
+sleef-pocl/asin.cl
+sleef-pocl/atan2.cl
+sleef-pocl/atan.cl
+sleef-pocl/cbrt.cl
+sleef-pocl/ceil.cl
+sleef-pocl/copysign.cl
+sleef-pocl/erfc.cl
+sleef-pocl/erf.cl
+sleef-pocl/exp10.cl
+sleef-pocl/exp2.cl
+sleef-pocl/exp.cl
+sleef-pocl/expm1.cl
+sleef-pocl/fabs.cl
+sleef-pocl/fdim.cl
+sleef-pocl/floor.cl
+sleef-pocl/fma.cl
+sleef-pocl/fmax.cl
+sleef-pocl/fmin.cl
+sleef-pocl/expfrexp.cl
+sleef-pocl/frfrexp.cl
+sleef-pocl/frexp.cl
+sleef-pocl/hypot.cl
+sleef-pocl/ilogb.cl
+sleef-pocl/ldexp.cl
+sleef-pocl/lgamma.cl
+sleef-pocl/lgamma_r.cl
+sleef-pocl/log10.cl
+sleef-pocl/log1p.cl
+sleef-pocl/log.cl
+sleef-pocl/modf.cl
+sleef-pocl/native_cos.cl
+sleef-pocl/native_sin.cl
+sleef-pocl/native_tan.cl
+sleef-pocl/nextafter.cl
+sleef-pocl/rint.cl
+sleef-pocl/round.cl
+sleef-pocl/sqrt.cl
+sleef-pocl/tgamma.cl
+sleef-pocl/trunc.cl
+)
+
+set(SLEEF_CL_KERNEL_DEPEND_HEADERS "")
+foreach(HEADER helperadvsimd.h   helperavx2.h     helperavx.h     helperpurec.h  helpersse2.h    misc.h helperavx2_128.h  helperavx512f.h  helperneon32.h  helpers.h      helpervecext.h)
+  list(APPEND SLEEF_CL_KERNEL_DEPEND_HEADERS "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/arch/${HEADER}")
+endforeach()
+list(APPEND SLEEF_CL_KERNEL_DEPEND_HEADERS "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/include/sleef_cl.h")
+
+set(SLEEF_C_KERNEL_DEPEND_HEADERS ${SLEEF_CL_KERNEL_DEPEND_HEADERS})
+
+# only CL files depend on these
+foreach(SOURCE sleef_builtin.c  sleef_glue_auto.c  sleef_glue.cl)
+list(APPEND SLEEF_CL_KERNEL_DEPEND_HEADERS "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/libm/${SOURCE}")
+endforeach()
+
+# only C files depend on these
+foreach(SOURCE dd.h df.h rename.h rename_vec128.h rename_vec256.h rename_vec512.h )
+list(APPEND SLEEF_C_KERNEL_DEPEND_HEADERS "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/libm/${SOURCE}")
+endforeach()
+list(APPEND SLEEF_C_KERNEL_DEPEND_HEADERS "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/include/sleef.h")
+
+set(LIBCLC_KERNEL_DEPEND_HEADERS "")
+foreach(HEADER ep_log.h misc.h singlevec.h ocml_helpers.h sincos_helpers_fp32.h sincos_helpers_fp64.h vtables.h vtables_macros.h)
+  list(APPEND LIBCLC_KERNEL_DEPEND_HEADERS "${CMAKE_SOURCE_DIR}/lib/kernel/libclc/${HEADER}")
+endforeach()
+
+# vecmathlib headers that should be dependencies
+set(VML_KERNEL_DEPEND_HEADERS
 "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/vec_test.h"
 "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/mathfuncs.h"
 "${CMAKE_SOURCE_DIR}/lib/kernel/vecmathlib/mathfuncs_fabs.h"
@@ -430,6 +674,12 @@ endif()
 
 #*********************************************************************
 
+if(OCL_TARGETS MATCHES "cuda")
+  add_subdirectory("cuda")
+endif()
+
+#*********************************************************************
+
 # "Escape" a list before passing to an external command
 string(REPLACE ";" "****" KERNEL_BC_LIST_ESCAPED "${KERNEL_BC_LIST}")
 
diff --git a/lib/kernel/as_type.cl b/lib/kernel/as_type.cl
index 6fa95bb..137b18a 100644
--- a/lib/kernel/as_type.cl
+++ b/lib/kernel/as_type.cl
@@ -27,8 +27,12 @@
  * These map down to the corresponding SPIR/LLVM IR bitcast instruction.
  */
 
+#if (__clang_major__ < 5)
+
+/* Clang starting from 5 defines these as builtins in opencl-c.h */
+
 #define DEFINE_AS_TYPE(SRC, DST)                                        \
-  _CL_ALWAYSINLINE _CL_OVERLOADABLE                                     \
+  _CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE                                   \
   DST as_##DST(SRC a)                                                   \
   {                                                                     \
     union { SRC src; DST dst; } cvt;                                    \
@@ -237,3 +241,5 @@ DEFINE_AS_TYPE_128(long16)
 DEFINE_AS_TYPE_128(ulong16))
 __IF_FP64(
 DEFINE_AS_TYPE_128(double16))
+
+#endif
diff --git a/lib/kernel/async_work_group_copy.cl b/lib/kernel/async_work_group_copy.cl
index af3c6af..7c16d8e 100644
--- a/lib/kernel/async_work_group_copy.cl
+++ b/lib/kernel/async_work_group_copy.cl
@@ -75,3 +75,4 @@ __IF_INT64(IMPLEMENT_ASYNC_COPY_FUNCS(ulong));
 
 IMPLEMENT_ASYNC_COPY_FUNCS(float);
 __IF_FP64(IMPLEMENT_ASYNC_COPY_FUNCS(double));
+__IF_FP16 (IMPLEMENT_ASYNC_COPY_FUNCS (half));
diff --git a/lib/kernel/async_work_group_strided_copy.cl b/lib/kernel/async_work_group_strided_copy.cl
new file mode 100644
index 0000000..6e45834
--- /dev/null
+++ b/lib/kernel/async_work_group_strided_copy.cl
@@ -0,0 +1,77 @@
+/* OpenCL built-in library: async_work_group_strided_copy()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "templates.h"
+
+/* The default implementation for "async copies" is a
+   blocking one which doesn't actually need events for
+   anything.
+
+   The devices (actually, platforms) should override these to
+   implement proper block copies or similar. */
+
+#define IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE(GENTYPE)                    \
+  __attribute__ ((overloadable)) event_t async_work_group_strided_copy (      \
+      __local GENTYPE *dst, const __global GENTYPE *src, size_t num_gentypes, \
+      size_t src_stride, event_t event)                                       \
+  {                                                                           \
+    __SINGLE_WI                                                               \
+    {                                                                         \
+      for (size_t i = 0; i < num_gentypes; ++i)                               \
+        dst[i] = src[i * src_stride];                                         \
+    }                                                                         \
+    return event;                                                             \
+  }                                                                           \
+                                                                              \
+  __attribute__ ((overloadable)) event_t async_work_group_strided_copy (      \
+      __global GENTYPE *dst, const __local GENTYPE *src, size_t num_gentypes, \
+      size_t dst_stride, event_t event)                                       \
+  {                                                                           \
+    __SINGLE_WI                                                               \
+    {                                                                         \
+      for (size_t i = 0; i < num_gentypes; ++i)                               \
+        dst[i * dst_stride] = src[i];                                         \
+    }                                                                         \
+    return event;                                                             \
+  }
+
+#define IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS(GENTYPE)                           \
+  IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE)                         \
+  IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##2)                      \
+  IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##3)                      \
+  IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##4)                      \
+  IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##8)                      \
+  IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS_SINGLE (GENTYPE##16)
+
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (char);
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (uchar);
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (short);
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (ushort);
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (int);
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (uint);
+__IF_INT64 (IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (long));
+__IF_INT64 (IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (ulong));
+
+__IF_FP16 (IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (half));
+IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (float);
+__IF_FP64 (IMPLEMENT_ASYNC_STRIDED_COPY_FUNCS (double));
diff --git a/lib/kernel/atomics.cl b/lib/kernel/atomics.cl
index 201a9ce..09472b5 100644
--- a/lib/kernel/atomics.cl
+++ b/lib/kernel/atomics.cl
@@ -55,20 +55,20 @@
 
 #ifdef cl_khr_int64
 #  define T long
-#  define MIN(a,b) (__builtin_trap(), 0L)
-#  define MAX(a,b) (__builtin_trap(), 0L)
-#  include "atomics.cl"
-#  undef T
 #  undef MIN
 #  undef MAX
-
-#  define T ulong
-#  define MIN(a,b) (__builtin_trap(), 0UL)
-#  define MAX(a,b) (__builtin_trap(), 0UL)
+#  define IS_UINT64
 #  include "atomics.cl"
+#  undef IS_UINT64
 #  undef T
+
+#  define T ulong
 #  undef MIN
 #  undef MAX
+#  define IS_UINT64
+#  include "atomics.cl"
+#  undef IS_UINT64
+#  undef T
 #endif
 
 
@@ -93,14 +93,12 @@
 __attribute__((overloadable))
 float atomic_xchg(volatile Q float *p, float val)
 {
-  // NOTE: We compare the float as int here...
-  union { volatile Q float *p; intptr_t i; } u1;
-  union { intptr_t i; volatile int *p; } u2;
-  u1.p = p;
-  u2.i = u1.i;
-  return __atomic_exchange_n(u2.p, val, __ATOMIC_RELAXED);
+  int retval = atomic_xchg ((volatile Q int *)p, as_int(val));
+  return as_float(retval);
 }
 
+
+
 #else
 
 
@@ -161,6 +159,7 @@ T atomic_cmpxchg(volatile Q T *p, T cmp, T val)
 
 // extended
 
+#ifdef MIN
 __attribute__((overloadable))
 T atomic_min(volatile Q T *p, T val)
 {
@@ -170,7 +169,24 @@ T atomic_min(volatile Q T *p, T val)
   u2.i = u1.i;
   return MIN(u2.p, val);
 }
+#endif
+
+#ifdef IS_UINT64
+__attribute__((overloadable))
+T atomic_min (volatile Q T *p, T val)
+{
+  T min,old;
+  do {
+    old = min = *p;
+    if (val < min)
+      old = atomic_cmpxchg(p, min, val);
+  } while (old != min);
+  return old;
+}
+#endif
+
 
+#ifdef MAX
 __attribute__((overloadable))
 T atomic_max(volatile Q T *p, T val)
 {
@@ -180,6 +196,21 @@ T atomic_max(volatile Q T *p, T val)
   u2.i = u1.i;
   return MAX(u2.p, val);
 }
+#endif
+
+#ifdef IS_UINT64
+__attribute__((overloadable))
+T atomic_max (volatile Q T *p, T val)
+{
+  T max,old;
+  do {
+    old = max = *p;
+    if (val > max)
+      old = atomic_cmpxchg(p, max, val);
+  } while (old != max);
+  return old;
+}
+#endif
 
 __attribute__((overloadable))
 T atomic_and(volatile Q T *p, T val)
diff --git a/lib/kernel/barrier.ll b/lib/kernel/barrier.ll
index c3fca6b..17ca81d 100644
--- a/lib/kernel/barrier.ll
+++ b/lib/kernel/barrier.ll
@@ -1,8 +1,8 @@
 ; This is an "illegal" C function name on purpose. It's a magic
 ; handle based on which we know it's the special WG barrier function.
-declare void @pocl.barrier()
+declare void @pocl.barrier() noduplicate
 
-define void @_Z7barrierj(i32 %flags) {
+define void @_Z7barrierj(i32 %flags) noduplicate {
 entry:
   call void @pocl.barrier()
   ret void
diff --git a/lib/kernel/convert_type.cl b/lib/kernel/convert_type.cl
index c2c818b..8da102a 100644
--- a/lib/kernel/convert_type.cl
+++ b/lib/kernel/convert_type.cl
@@ -27,254 +27,254 @@
    THE SOFTWARE.
 */
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(char x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(char2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(char4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(char8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(char16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(char3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(char x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(char2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(char4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(char8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(char16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(char3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(char x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(char2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(char4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(char8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(char16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(char3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(char x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(char2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(char4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(char8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(char16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(char3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(char x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(char2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(char4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(char8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(char16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(char3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(char x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(char2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(char4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(char8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(char16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(char3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(char x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(char2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(char4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(char8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(char16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(char3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -282,37 +282,37 @@ long3 convert_long3(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(char x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(char2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(char4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(char8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(char16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(char3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -320,365 +320,365 @@ ulong3 convert_ulong3(char3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(char x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(char2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(char4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(char8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(char16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(char3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(char x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(char2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(char4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(char8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(char16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(char3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(char x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(char2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(char4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(char8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(char16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(char3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(uchar x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(uchar2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(uchar4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(uchar8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(uchar16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(uchar3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(uchar x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(uchar2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(uchar4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(uchar8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(uchar16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(uchar3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(uchar x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(uchar2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(uchar4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(uchar8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(uchar16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(uchar3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(uchar x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(uchar2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(uchar4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(uchar8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(uchar16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(uchar3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(uchar x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(uchar2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(uchar4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(uchar8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(uchar16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(uchar3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(uchar x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(uchar2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(uchar4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(uchar8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(uchar16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(uchar3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(uchar x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(uchar2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(uchar4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(uchar8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(uchar16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(uchar3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -686,37 +686,37 @@ long3 convert_long3(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(uchar x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(uchar2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(uchar4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(uchar8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(uchar16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(uchar3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -724,365 +724,365 @@ ulong3 convert_ulong3(uchar3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(uchar x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(uchar2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(uchar4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(uchar8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(uchar16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(uchar3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(uchar x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(uchar2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(uchar4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(uchar8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(uchar16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(uchar3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(uchar x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(uchar2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(uchar4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(uchar8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(uchar16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(uchar3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(short x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(short2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(short4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(short8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(short16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(short3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(short x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(short2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(short4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(short8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(short16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(short3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(short x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(short2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(short4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(short8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(short16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(short3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(short x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(short2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(short4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(short8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(short16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(short3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(short x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(short2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(short4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(short8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(short16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(short3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(short x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(short2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(short4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(short8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(short16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(short3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(short x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(short2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(short4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(short8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(short16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(short3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -1090,37 +1090,37 @@ long3 convert_long3(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(short x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(short2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(short4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(short8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(short16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(short3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -1128,365 +1128,365 @@ ulong3 convert_ulong3(short3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(short x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(short2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(short4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(short8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(short16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(short3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(short x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(short2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(short4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(short8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(short16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(short3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(short x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(short2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(short4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(short8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(short16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(short3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(ushort x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(ushort2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(ushort4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(ushort8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(ushort16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(ushort3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(ushort x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(ushort2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(ushort4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(ushort8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(ushort16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(ushort3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(ushort x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(ushort2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(ushort4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(ushort8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(ushort16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(ushort3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(ushort x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(ushort2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(ushort4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(ushort8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(ushort16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(ushort3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(ushort x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(ushort2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(ushort4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(ushort8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(ushort16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(ushort3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(ushort x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(ushort2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(ushort4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(ushort8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(ushort16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(ushort3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(ushort x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(ushort2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(ushort4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(ushort8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(ushort16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(ushort3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -1494,37 +1494,37 @@ long3 convert_long3(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(ushort x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(ushort2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(ushort4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(ushort8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(ushort16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(ushort3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -1532,365 +1532,365 @@ ulong3 convert_ulong3(ushort3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(ushort x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(ushort2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(ushort4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(ushort8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(ushort16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(ushort3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(ushort x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(ushort2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(ushort4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(ushort8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(ushort16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(ushort3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(ushort x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(ushort2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(ushort4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(ushort8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(ushort16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(ushort3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(int x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(int2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(int4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(int8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(int16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(int3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(int x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(int2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(int4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(int8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(int16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(int3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(int x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(int2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(int4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(int8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(int16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(int3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(int x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(int2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(int4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(int8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(int16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(int3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(int x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(int2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(int4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(int8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(int16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(int3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(int x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(int2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(int4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(int8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(int16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(int3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(int x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(int2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(int4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(int8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(int16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(int3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -1898,37 +1898,37 @@ long3 convert_long3(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(int x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(int2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(int4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(int8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(int16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(int3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -1936,365 +1936,365 @@ ulong3 convert_ulong3(int3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(int x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(int2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(int4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(int8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(int16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(int3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(int x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(int2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(int4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(int8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(int16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(int3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(int x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(int2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(int4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(int8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(int16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(int3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(uint x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(uint2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(uint4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(uint8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(uint16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(uint3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(uint x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(uint2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(uint4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(uint8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(uint16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(uint3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(uint x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(uint2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(uint4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(uint8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(uint16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(uint3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(uint x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(uint2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(uint4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(uint8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(uint16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(uint3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(uint x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(uint2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(uint4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(uint8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(uint16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(uint3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(uint x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(uint2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(uint4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(uint8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(uint16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(uint3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(uint x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(uint2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(uint4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(uint8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(uint16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(uint3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -2302,37 +2302,37 @@ long3 convert_long3(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(uint x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(uint2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(uint4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(uint8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(uint16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(uint3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -2340,111 +2340,111 @@ ulong3 convert_ulong3(uint3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(uint x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(uint2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(uint4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(uint8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(uint16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(uint3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(uint x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(uint2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(uint4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(uint8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(uint16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(uint3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(uint x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(uint2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(uint4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(uint8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(uint16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(uint3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
@@ -2452,37 +2452,37 @@ double3 convert_double3(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(long x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(long2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(long4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(long8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(long16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(long3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -2490,37 +2490,37 @@ char3 convert_char3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(long x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(long2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(long4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(long8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(long16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(long3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -2528,37 +2528,37 @@ uchar3 convert_uchar3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(long x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(long2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(long4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(long8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(long16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(long3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -2566,37 +2566,37 @@ short3 convert_short3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(long x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(long2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(long4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(long8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(long16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(long3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -2604,37 +2604,37 @@ ushort3 convert_ushort3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(long x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(long2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(long4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(long8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(long16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(long3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -2642,37 +2642,37 @@ int3 convert_int3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(long x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(long2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(long4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(long8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(long16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(long3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -2680,37 +2680,37 @@ uint3 convert_uint3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(long x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(long2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(long4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(long8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(long16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(long3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -2718,37 +2718,37 @@ long3 convert_long3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(long x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(long2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(long4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(long8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(long16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(long3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -2756,37 +2756,37 @@ ulong3 convert_ulong3(long3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(long x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(long2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(long4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(long8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(long16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(long3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
@@ -2794,37 +2794,37 @@ half3 convert_half3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(long x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(long2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(long4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(long8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(long16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(long3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
@@ -2832,37 +2832,37 @@ float3 convert_float3(long3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(long x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(long2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(long4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(long8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(long16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(long3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
@@ -2870,37 +2870,37 @@ double3 convert_double3(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(ulong x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(ulong2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(ulong4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(ulong8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(ulong16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(ulong3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -2908,37 +2908,37 @@ char3 convert_char3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(ulong x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(ulong2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(ulong4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(ulong8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(ulong16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(ulong3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -2946,37 +2946,37 @@ uchar3 convert_uchar3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(ulong x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(ulong2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(ulong4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(ulong8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(ulong16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(ulong3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -2984,37 +2984,37 @@ short3 convert_short3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(ulong x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(ulong2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(ulong4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(ulong8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(ulong16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(ulong3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -3022,37 +3022,37 @@ ushort3 convert_ushort3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(ulong x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(ulong2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(ulong4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(ulong8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(ulong16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(ulong3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -3060,37 +3060,37 @@ int3 convert_int3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(ulong x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(ulong2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(ulong4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(ulong8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(ulong16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(ulong3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -3098,37 +3098,37 @@ uint3 convert_uint3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(ulong x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(ulong2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(ulong4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(ulong8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(ulong16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(ulong3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -3136,37 +3136,37 @@ long3 convert_long3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(ulong x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(ulong2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(ulong4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(ulong8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(ulong16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(ulong3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -3174,37 +3174,37 @@ ulong3 convert_ulong3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(ulong x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(ulong2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(ulong4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(ulong8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(ulong16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(ulong3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
@@ -3212,37 +3212,37 @@ half3 convert_half3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(ulong x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(ulong2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(ulong4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(ulong8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(ulong16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(ulong3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
@@ -3250,37 +3250,37 @@ float3 convert_float3(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(ulong x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(ulong2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(ulong4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(ulong8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(ulong16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(ulong3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
@@ -3288,37 +3288,37 @@ double3 convert_double3(ulong3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(half x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(half2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(half4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(half8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(half16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(half3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -3326,37 +3326,37 @@ char3 convert_char3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(half x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(half2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(half4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(half8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(half16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(half3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -3364,37 +3364,37 @@ uchar3 convert_uchar3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(half x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(half2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(half4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(half8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(half16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(half3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -3402,37 +3402,37 @@ short3 convert_short3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(half x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(half2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(half4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(half8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(half16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(half3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -3440,37 +3440,37 @@ ushort3 convert_ushort3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(half x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(half2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(half4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(half8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(half16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(half3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -3478,37 +3478,37 @@ int3 convert_int3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(half x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(half2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(half4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(half8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(half16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(half3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -3516,37 +3516,37 @@ uint3 convert_uint3(half3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(half x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(half2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(half4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(half8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(half16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(half3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -3554,37 +3554,37 @@ long3 convert_long3(half3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(half x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(half2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(half4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(half8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(half16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(half3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -3592,37 +3592,37 @@ ulong3 convert_ulong3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(half x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(half2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(half4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(half8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(half16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(half3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
@@ -3630,37 +3630,37 @@ half3 convert_half3(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(half x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(half2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(half4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(half8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(half16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(half3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
@@ -3668,291 +3668,291 @@ float3 convert_float3(half3 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(half x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(half2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(half4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(half8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(half16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(half3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(float x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(float2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(float4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(float8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(float16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(float3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(float x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(float2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(float4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(float8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(float16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(float3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(float x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(float2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(float4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(float8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(float16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(float3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(float x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(float2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(float4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(float8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(float16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(float3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(float x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(float2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(float4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(float8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(float16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(float3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(float x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(float2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(float4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(float8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(float16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(float3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(float x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(float2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(float4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(float8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(float16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(float3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -3960,37 +3960,37 @@ long3 convert_long3(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(float x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(float2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(float4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(float8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(float16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(float3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -3998,111 +3998,111 @@ ulong3 convert_ulong3(float3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(float x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(float2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(float4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(float8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(float16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(float3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(float x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(float2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(float4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(float8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(float16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(float3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(float x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(float2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(float4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(float8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(float16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(float3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
@@ -4110,37 +4110,37 @@ double3 convert_double3(float3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char(double x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2(double2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4(double4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8(double8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16(double16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3(double3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -4148,37 +4148,37 @@ char3 convert_char3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar(double x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2(double2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4(double4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8(double8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16(double16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3(double3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -4186,37 +4186,37 @@ uchar3 convert_uchar3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short(double x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2(double2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4(double4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8(double8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16(double16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3(double3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -4224,37 +4224,37 @@ short3 convert_short3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort(double x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2(double2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4(double4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8(double8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16(double16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3(double3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -4262,37 +4262,37 @@ ushort3 convert_ushort3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int(double x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2(double2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4(double4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8(double8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16(double16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3(double3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -4300,37 +4300,37 @@ int3 convert_int3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint(double x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2(double2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4(double4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8(double8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16(double16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3(double3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -4338,37 +4338,37 @@ uint3 convert_uint3(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long(double x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2(double2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4(double4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8(double8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16(double16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3(double3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -4376,37 +4376,37 @@ long3 convert_long3(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong(double x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2(double2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4(double4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8(double8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16(double16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3(double3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -4414,37 +4414,37 @@ ulong3 convert_ulong3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half convert_half(double x)
 {
   return (half)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half2 convert_half2(double2 x)
 {
   return (half2)(convert_half(x.lo), convert_half(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half4 convert_half4(double4 x)
 {
   return (half4)(convert_half2(x.lo), convert_half2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half8 convert_half8(double8 x)
 {
   return (half8)(convert_half4(x.lo), convert_half4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half16 convert_half16(double16 x)
 {
   return (half16)(convert_half8(x.lo), convert_half8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 half3 convert_half3(double3 x)
 {
   return (half3)(convert_half2(x.s01), convert_half(x.s2));
@@ -4452,37 +4452,37 @@ half3 convert_half3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float(double x)
 {
   return (float)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2(double2 x)
 {
   return (float2)(convert_float(x.lo), convert_float(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4(double4 x)
 {
   return (float4)(convert_float2(x.lo), convert_float2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8(double8 x)
 {
   return (float8)(convert_float4(x.lo), convert_float4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16(double16 x)
 {
   return (float16)(convert_float8(x.lo), convert_float8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3(double3 x)
 {
   return (float3)(convert_float2(x.s01), convert_float(x.s2));
@@ -4490,939 +4490,939 @@ float3 convert_float3(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double(double x)
 {
   return (double)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2(double2 x)
 {
   return (double2)(convert_double(x.lo), convert_double(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4(double4 x)
 {
   return (double4)(convert_double2(x.lo), convert_double2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8(double8 x)
 {
   return (double8)(convert_double4(x.lo), convert_double4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16(double16 x)
 {
   return (double16)(convert_double8(x.lo), convert_double8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3(double3 x)
 {
   return (double3)(convert_double2(x.s01), convert_double(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(char x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(char2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(char4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(char8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(char16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(char3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(char x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(char2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(char4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(char8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(char16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(char3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(char x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(char2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(char4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(char8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(char16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(char3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(char x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(char2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(char4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(char8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(char16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(char3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(char x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(char2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(char4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(char8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(char16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(char3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(char x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(char2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(char4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(char8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(char16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(char3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(char x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(char2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(char4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(char8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(char16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(char3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(char x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(char2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(char4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(char8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(char16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(char3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(char x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(char2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(char4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(char8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(char16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(char3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(char x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(char2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(char4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(char8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(char16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(char3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(char x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(char2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(char4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(char8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(char16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(char3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(char x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(char2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(char4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(char8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(char16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(char3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(char x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(char2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(char4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(char8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(char16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(char3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(char x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(char2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(char4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(char8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(char16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(char3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(char x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(char2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(char4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(char8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(char16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(char3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(char x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(char2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(char4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(char8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(char16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(char3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(char x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(char2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(char4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(char8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(char16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(char3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(char x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(char2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(char4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(char8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(char16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(char3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(char x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(char2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(char4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(char8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(char16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(char3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(char x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(char2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(char4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(char8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(char16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(char3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(char x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(char2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(char4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(char8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(char16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(char3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(char x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(char2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(char4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(char8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(char16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(char3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(char x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(char2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(char4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(char8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(char16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(char3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(char x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(char2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(char4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(char8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(char16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(char3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(char x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(char2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(char4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(char8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(char16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(char3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -5430,37 +5430,37 @@ long3 convert_long3_rtz(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(char x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(char2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(char4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(char8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(char16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(char3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -5468,37 +5468,37 @@ long3 convert_long3_rte(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(char x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(char2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(char4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(char8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(char16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(char3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -5506,37 +5506,37 @@ long3 convert_long3_rtp(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(char x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(char2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(char4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(char8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(char16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(char3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -5544,37 +5544,37 @@ long3 convert_long3_rtn(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(char x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(char2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(char4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(char8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(char16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(char3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -5582,37 +5582,37 @@ ulong3 convert_ulong3_rtz(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(char x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(char2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(char4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(char8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(char16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(char3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -5620,37 +5620,37 @@ ulong3 convert_ulong3_rte(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(char x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(char2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(char4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(char8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(char16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(char3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -5658,939 +5658,939 @@ ulong3 convert_ulong3_rtp(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(char x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(char2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(char4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(char8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(char16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(char3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(uchar x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(uchar2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(uchar4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(uchar8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(uchar16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(uchar3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(uchar x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(uchar2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(uchar4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(uchar8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(uchar16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(uchar3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(uchar x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(uchar2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(uchar4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(uchar8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(uchar16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(uchar3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(uchar x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(uchar2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(uchar4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(uchar8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(uchar16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(uchar3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(uchar x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(uchar2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(uchar4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(uchar8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(uchar16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(uchar3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(uchar x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(uchar2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(uchar4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(uchar8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(uchar16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(uchar3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(uchar x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(uchar2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(uchar4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(uchar8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(uchar16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(uchar3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(uchar x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(uchar2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(uchar4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(uchar8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(uchar16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(uchar3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(uchar x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(uchar2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(uchar4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(uchar8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(uchar16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(uchar3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(uchar x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(uchar2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(uchar4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(uchar8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(uchar16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(uchar3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(uchar x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(uchar2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(uchar4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(uchar8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(uchar16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(uchar3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(uchar x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(uchar2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(uchar4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(uchar8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(uchar16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(uchar3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(uchar x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(uchar2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(uchar4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(uchar8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(uchar16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(uchar3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(uchar x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(uchar2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(uchar4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(uchar8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(uchar16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(uchar3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(uchar x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(uchar2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(uchar4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(uchar8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(uchar16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(uchar3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(uchar x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(uchar2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(uchar4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(uchar8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(uchar16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(uchar3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(uchar x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(uchar2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(uchar4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(uchar8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(uchar16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(uchar3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(uchar x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(uchar2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(uchar4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(uchar8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(uchar16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(uchar3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(uchar x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(uchar2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(uchar4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(uchar8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(uchar16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(uchar3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(uchar x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(uchar2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(uchar4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(uchar8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(uchar16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(uchar3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(uchar x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(uchar2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(uchar4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(uchar8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(uchar16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(uchar3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(uchar x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(uchar2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(uchar4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(uchar8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(uchar16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(uchar3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(uchar x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(uchar2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(uchar4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(uchar8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(uchar16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(uchar3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(uchar x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(uchar2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(uchar4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(uchar8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(uchar16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(uchar3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(uchar x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(uchar2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(uchar4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(uchar8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(uchar16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(uchar3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -6598,37 +6598,37 @@ long3 convert_long3_rtz(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(uchar x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(uchar2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(uchar4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(uchar8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(uchar16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(uchar3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -6636,37 +6636,37 @@ long3 convert_long3_rte(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(uchar x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(uchar2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(uchar4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(uchar8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(uchar16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(uchar3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -6674,37 +6674,37 @@ long3 convert_long3_rtp(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(uchar x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(uchar2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(uchar4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(uchar8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(uchar16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(uchar3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -6712,37 +6712,37 @@ long3 convert_long3_rtn(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(uchar x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(uchar2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(uchar4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(uchar8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(uchar16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(uchar3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -6750,37 +6750,37 @@ ulong3 convert_ulong3_rtz(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(uchar x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(uchar2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(uchar4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(uchar8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(uchar16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(uchar3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -6788,37 +6788,37 @@ ulong3 convert_ulong3_rte(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(uchar x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(uchar2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(uchar4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(uchar8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(uchar16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(uchar3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -6826,939 +6826,939 @@ ulong3 convert_ulong3_rtp(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(uchar x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(uchar2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(uchar4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(uchar8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(uchar16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(uchar3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(short x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(short2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(short4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(short8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(short16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(short3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(short x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(short2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(short4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(short8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(short16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(short3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(short x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(short2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(short4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(short8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(short16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(short3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(short x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(short2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(short4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(short8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(short16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(short3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(short x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(short2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(short4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(short8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(short16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(short3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(short x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(short2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(short4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(short8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(short16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(short3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(short x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(short2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(short4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(short8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(short16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(short3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(short x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(short2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(short4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(short8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(short16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(short3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(short x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(short2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(short4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(short8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(short16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(short3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(short x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(short2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(short4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(short8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(short16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(short3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(short x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(short2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(short4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(short8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(short16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(short3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(short x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(short2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(short4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(short8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(short16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(short3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(short x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(short2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(short4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(short8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(short16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(short3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(short x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(short2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(short4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(short8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(short16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(short3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(short x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(short2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(short4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(short8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(short16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(short3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(short x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(short2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(short4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(short8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(short16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(short3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(short x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(short2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(short4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(short8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(short16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(short3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(short x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(short2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(short4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(short8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(short16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(short3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(short x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(short2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(short4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(short8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(short16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(short3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(short x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(short2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(short4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(short8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(short16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(short3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(short x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(short2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(short4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(short8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(short16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(short3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(short x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(short2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(short4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(short8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(short16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(short3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(short x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(short2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(short4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(short8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(short16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(short3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(short x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(short2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(short4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(short8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(short16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(short3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(short x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(short2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(short4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(short8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(short16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(short3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -7766,37 +7766,37 @@ long3 convert_long3_rtz(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(short x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(short2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(short4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(short8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(short16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(short3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -7804,37 +7804,37 @@ long3 convert_long3_rte(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(short x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(short2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(short4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(short8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(short16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(short3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -7842,37 +7842,37 @@ long3 convert_long3_rtp(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(short x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(short2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(short4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(short8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(short16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(short3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -7880,37 +7880,37 @@ long3 convert_long3_rtn(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(short x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(short2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(short4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(short8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(short16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(short3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -7918,37 +7918,37 @@ ulong3 convert_ulong3_rtz(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(short x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(short2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(short4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(short8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(short16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(short3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -7956,37 +7956,37 @@ ulong3 convert_ulong3_rte(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(short x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(short2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(short4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(short8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(short16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(short3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -7994,939 +7994,939 @@ ulong3 convert_ulong3_rtp(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(short x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(short2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(short4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(short8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(short16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(short3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(ushort x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(ushort2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(ushort4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(ushort8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(ushort16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(ushort3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(ushort x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(ushort2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(ushort4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(ushort8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(ushort16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(ushort3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(ushort x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(ushort2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(ushort4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(ushort8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(ushort16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(ushort3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(ushort x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(ushort2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(ushort4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(ushort8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(ushort16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(ushort3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(ushort x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(ushort2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(ushort4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(ushort8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(ushort16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(ushort3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(ushort x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(ushort2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(ushort4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(ushort8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(ushort16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(ushort3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(ushort x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(ushort2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(ushort4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(ushort8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(ushort16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(ushort3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(ushort x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(ushort2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(ushort4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(ushort8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(ushort16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(ushort3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(ushort x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(ushort2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(ushort4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(ushort8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(ushort16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(ushort3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(ushort x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(ushort2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(ushort4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(ushort8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(ushort16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(ushort3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(ushort x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(ushort2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(ushort4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(ushort8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(ushort16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(ushort3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(ushort x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(ushort2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(ushort4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(ushort8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(ushort16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(ushort3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(ushort x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(ushort2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(ushort4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(ushort8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(ushort16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(ushort3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(ushort x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(ushort2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(ushort4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(ushort8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(ushort16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(ushort3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(ushort x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(ushort2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(ushort4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(ushort8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(ushort16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(ushort3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(ushort x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(ushort2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(ushort4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(ushort8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(ushort16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(ushort3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(ushort x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(ushort2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(ushort4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(ushort8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(ushort16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(ushort3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(ushort x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(ushort2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(ushort4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(ushort8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(ushort16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(ushort3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(ushort x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(ushort2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(ushort4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(ushort8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(ushort16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(ushort3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(ushort x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(ushort2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(ushort4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(ushort8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(ushort16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(ushort3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(ushort x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(ushort2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(ushort4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(ushort8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(ushort16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(ushort3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(ushort x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(ushort2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(ushort4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(ushort8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(ushort16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(ushort3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(ushort x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(ushort2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(ushort4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(ushort8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(ushort16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(ushort3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(ushort x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(ushort2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(ushort4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(ushort8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(ushort16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(ushort3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(ushort x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(ushort2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(ushort4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(ushort8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(ushort16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(ushort3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -8934,37 +8934,37 @@ long3 convert_long3_rtz(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(ushort x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(ushort2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(ushort4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(ushort8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(ushort16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(ushort3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -8972,37 +8972,37 @@ long3 convert_long3_rte(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(ushort x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(ushort2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(ushort4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(ushort8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(ushort16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(ushort3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -9010,37 +9010,37 @@ long3 convert_long3_rtp(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(ushort x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(ushort2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(ushort4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(ushort8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(ushort16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(ushort3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -9048,37 +9048,37 @@ long3 convert_long3_rtn(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(ushort x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(ushort2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(ushort4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(ushort8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(ushort16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(ushort3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -9086,37 +9086,37 @@ ulong3 convert_ulong3_rtz(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(ushort x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(ushort2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(ushort4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(ushort8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(ushort16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(ushort3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -9124,37 +9124,37 @@ ulong3 convert_ulong3_rte(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(ushort x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(ushort2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(ushort4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(ushort8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(ushort16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(ushort3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -9162,939 +9162,939 @@ ulong3 convert_ulong3_rtp(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(ushort x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(ushort2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(ushort4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(ushort8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(ushort16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(ushort3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(int x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(int2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(int4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(int8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(int16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(int3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(int x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(int2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(int4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(int8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(int16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(int3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(int x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(int2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(int4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(int8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(int16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(int3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(int x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(int2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(int4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(int8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(int16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(int3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(int x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(int2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(int4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(int8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(int16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(int3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(int x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(int2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(int4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(int8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(int16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(int3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(int x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(int2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(int4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(int8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(int16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(int3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(int x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(int2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(int4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(int8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(int16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(int3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(int x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(int2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(int4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(int8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(int16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(int3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(int x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(int2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(int4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(int8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(int16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(int3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(int x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(int2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(int4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(int8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(int16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(int3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(int x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(int2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(int4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(int8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(int16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(int3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(int x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(int2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(int4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(int8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(int16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(int3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(int x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(int2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(int4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(int8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(int16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(int3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(int x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(int2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(int4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(int8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(int16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(int3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(int x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(int2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(int4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(int8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(int16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(int3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(int x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(int2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(int4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(int8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(int16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(int3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(int x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(int2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(int4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(int8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(int16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(int3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(int x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(int2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(int4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(int8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(int16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(int3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(int x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(int2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(int4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(int8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(int16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(int3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(int x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(int2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(int4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(int8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(int16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(int3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(int x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(int2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(int4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(int8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(int16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(int3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(int x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(int2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(int4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(int8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(int16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(int3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(int x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(int2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(int4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(int8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(int16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(int3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(int x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(int2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(int4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(int8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(int16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(int3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -10102,37 +10102,37 @@ long3 convert_long3_rtz(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(int x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(int2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(int4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(int8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(int16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(int3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -10140,37 +10140,37 @@ long3 convert_long3_rte(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(int x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(int2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(int4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(int8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(int16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(int3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -10178,37 +10178,37 @@ long3 convert_long3_rtp(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(int x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(int2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(int4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(int8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(int16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(int3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -10216,37 +10216,37 @@ long3 convert_long3_rtn(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(int x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(int2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(int4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(int8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(int16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(int3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -10254,37 +10254,37 @@ ulong3 convert_ulong3_rtz(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(int x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(int2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(int4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(int8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(int16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(int3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -10292,37 +10292,37 @@ ulong3 convert_ulong3_rte(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(int x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(int2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(int4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(int8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(int16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(int3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -10330,939 +10330,939 @@ ulong3 convert_ulong3_rtp(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(int x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(int2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(int4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(int8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(int16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(int3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(uint x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(uint2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(uint4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(uint8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(uint16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(uint3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(uint x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(uint2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(uint4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(uint8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(uint16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(uint3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(uint x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(uint2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(uint4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(uint8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(uint16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(uint3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(uint x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(uint2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(uint4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(uint8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(uint16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(uint3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(uint x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(uint2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(uint4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(uint8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(uint16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(uint3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(uint x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(uint2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(uint4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(uint8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(uint16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(uint3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(uint x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(uint2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(uint4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(uint8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(uint16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(uint3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(uint x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(uint2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(uint4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(uint8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(uint16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(uint3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(uint x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(uint2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(uint4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(uint8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(uint16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(uint3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(uint x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(uint2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(uint4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(uint8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(uint16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(uint3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(uint x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(uint2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(uint4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(uint8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(uint16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(uint3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(uint x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(uint2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(uint4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(uint8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(uint16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(uint3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(uint x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(uint2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(uint4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(uint8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(uint16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(uint3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(uint x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(uint2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(uint4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(uint8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(uint16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(uint3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(uint x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(uint2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(uint4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(uint8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(uint16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(uint3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(uint x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(uint2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(uint4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(uint8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(uint16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(uint3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(uint x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(uint2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(uint4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(uint8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(uint16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(uint3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(uint x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(uint2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(uint4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(uint8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(uint16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(uint3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(uint x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(uint2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(uint4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(uint8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(uint16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(uint3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(uint x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(uint2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(uint4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(uint8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(uint16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(uint3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(uint x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(uint2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(uint4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(uint8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(uint16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(uint3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(uint x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(uint2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(uint4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(uint8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(uint16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(uint3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(uint x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(uint2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(uint4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(uint8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(uint16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(uint3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(uint x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(uint2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(uint4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(uint8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(uint16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(uint3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(uint x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(uint2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(uint4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(uint8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(uint16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(uint3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -11270,37 +11270,37 @@ long3 convert_long3_rtz(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(uint x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(uint2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(uint4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(uint8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(uint16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(uint3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -11308,37 +11308,37 @@ long3 convert_long3_rte(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(uint x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(uint2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(uint4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(uint8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(uint16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(uint3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -11346,37 +11346,37 @@ long3 convert_long3_rtp(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(uint x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(uint2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(uint4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(uint8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(uint16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(uint3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -11384,37 +11384,37 @@ long3 convert_long3_rtn(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(uint x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(uint2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(uint4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(uint8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(uint16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(uint3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -11422,37 +11422,37 @@ ulong3 convert_ulong3_rtz(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(uint x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(uint2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(uint4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(uint8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(uint16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(uint3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -11460,37 +11460,37 @@ ulong3 convert_ulong3_rte(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(uint x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(uint2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(uint4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(uint8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(uint16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(uint3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -11498,37 +11498,37 @@ ulong3 convert_ulong3_rtp(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(uint x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(uint2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(uint4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(uint8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(uint16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(uint3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -11536,37 +11536,37 @@ ulong3 convert_ulong3_rtn(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(long x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(long2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(long4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(long8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(long16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(long3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -11574,37 +11574,37 @@ char3 convert_char3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(long x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(long2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(long4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(long8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(long16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(long3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -11612,37 +11612,37 @@ char3 convert_char3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(long x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(long2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(long4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(long8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(long16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(long3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -11650,37 +11650,37 @@ char3 convert_char3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(long x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(long2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(long4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(long8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(long16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(long3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -11688,37 +11688,37 @@ char3 convert_char3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(long x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(long2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(long4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(long8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(long16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(long3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -11726,37 +11726,37 @@ uchar3 convert_uchar3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(long x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(long2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(long4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(long8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(long16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(long3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -11764,37 +11764,37 @@ uchar3 convert_uchar3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(long x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(long2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(long4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(long8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(long16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(long3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -11802,37 +11802,37 @@ uchar3 convert_uchar3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(long x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(long2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(long4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(long8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(long16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(long3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -11840,37 +11840,37 @@ uchar3 convert_uchar3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(long x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(long2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(long4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(long8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(long16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(long3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -11878,37 +11878,37 @@ short3 convert_short3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(long x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(long2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(long4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(long8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(long16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(long3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -11916,37 +11916,37 @@ short3 convert_short3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(long x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(long2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(long4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(long8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(long16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(long3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -11954,37 +11954,37 @@ short3 convert_short3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(long x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(long2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(long4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(long8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(long16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(long3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -11992,37 +11992,37 @@ short3 convert_short3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(long x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(long2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(long4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(long8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(long16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(long3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -12030,37 +12030,37 @@ ushort3 convert_ushort3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(long x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(long2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(long4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(long8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(long16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(long3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -12068,37 +12068,37 @@ ushort3 convert_ushort3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(long x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(long2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(long4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(long8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(long16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(long3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -12106,37 +12106,37 @@ ushort3 convert_ushort3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(long x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(long2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(long4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(long8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(long16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(long3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -12144,37 +12144,37 @@ ushort3 convert_ushort3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(long x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(long2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(long4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(long8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(long16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(long3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -12182,37 +12182,37 @@ int3 convert_int3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(long x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(long2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(long4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(long8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(long16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(long3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -12220,37 +12220,37 @@ int3 convert_int3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(long x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(long2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(long4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(long8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(long16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(long3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -12258,37 +12258,37 @@ int3 convert_int3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(long x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(long2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(long4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(long8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(long16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(long3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -12296,37 +12296,37 @@ int3 convert_int3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(long x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(long2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(long4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(long8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(long16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(long3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -12334,37 +12334,37 @@ uint3 convert_uint3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(long x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(long2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(long4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(long8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(long16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(long3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -12372,37 +12372,37 @@ uint3 convert_uint3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(long x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(long2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(long4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(long8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(long16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(long3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -12410,37 +12410,37 @@ uint3 convert_uint3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(long x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(long2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(long4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(long8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(long16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(long3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -12448,37 +12448,37 @@ uint3 convert_uint3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(long x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(long2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(long4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(long8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(long16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(long3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -12486,37 +12486,37 @@ long3 convert_long3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(long x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(long2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(long4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(long8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(long16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(long3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -12524,37 +12524,37 @@ long3 convert_long3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(long x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(long2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(long4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(long8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(long16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(long3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -12562,37 +12562,37 @@ long3 convert_long3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(long x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(long2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(long4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(long8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(long16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(long3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -12600,37 +12600,37 @@ long3 convert_long3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(long x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(long2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(long4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(long8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(long16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(long3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -12638,37 +12638,37 @@ ulong3 convert_ulong3_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(long x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(long2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(long4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(long8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(long16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(long3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -12676,37 +12676,37 @@ ulong3 convert_ulong3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(long x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(long2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(long4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(long8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(long16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(long3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -12714,37 +12714,37 @@ ulong3 convert_ulong3_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(long x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(long2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(long4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(long8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(long16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(long3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -12752,37 +12752,37 @@ ulong3 convert_ulong3_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(ulong x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(ulong2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(ulong4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(ulong8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(ulong16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(ulong3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -12790,37 +12790,37 @@ char3 convert_char3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(ulong x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(ulong2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(ulong4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(ulong8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(ulong16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(ulong3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -12828,37 +12828,37 @@ char3 convert_char3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(ulong x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(ulong2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(ulong4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(ulong8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(ulong16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(ulong3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -12866,37 +12866,37 @@ char3 convert_char3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(ulong x)
 {
   return (char)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(ulong2 x)
 {
   return (char2)(convert_char(x.lo), convert_char(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(ulong4 x)
 {
   return (char4)(convert_char2(x.lo), convert_char2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(ulong8 x)
 {
   return (char8)(convert_char4(x.lo), convert_char4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(ulong16 x)
 {
   return (char16)(convert_char8(x.lo), convert_char8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(ulong3 x)
 {
   return (char3)(convert_char2(x.s01), convert_char(x.s2));
@@ -12904,37 +12904,37 @@ char3 convert_char3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(ulong x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(ulong2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(ulong4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(ulong8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(ulong16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(ulong3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -12942,37 +12942,37 @@ uchar3 convert_uchar3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(ulong x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(ulong2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(ulong4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(ulong8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(ulong16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(ulong3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -12980,37 +12980,37 @@ uchar3 convert_uchar3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(ulong x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(ulong2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(ulong4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(ulong8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(ulong16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(ulong3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -13018,37 +13018,37 @@ uchar3 convert_uchar3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(ulong x)
 {
   return (uchar)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(ulong2 x)
 {
   return (uchar2)(convert_uchar(x.lo), convert_uchar(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(ulong4 x)
 {
   return (uchar4)(convert_uchar2(x.lo), convert_uchar2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(ulong8 x)
 {
   return (uchar8)(convert_uchar4(x.lo), convert_uchar4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(ulong16 x)
 {
   return (uchar16)(convert_uchar8(x.lo), convert_uchar8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(ulong3 x)
 {
   return (uchar3)(convert_uchar2(x.s01), convert_uchar(x.s2));
@@ -13056,37 +13056,37 @@ uchar3 convert_uchar3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(ulong x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(ulong2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(ulong4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(ulong8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(ulong16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(ulong3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -13094,37 +13094,37 @@ short3 convert_short3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(ulong x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(ulong2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(ulong4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(ulong8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(ulong16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(ulong3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -13132,37 +13132,37 @@ short3 convert_short3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(ulong x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(ulong2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(ulong4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(ulong8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(ulong16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(ulong3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -13170,37 +13170,37 @@ short3 convert_short3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(ulong x)
 {
   return (short)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(ulong2 x)
 {
   return (short2)(convert_short(x.lo), convert_short(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(ulong4 x)
 {
   return (short4)(convert_short2(x.lo), convert_short2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(ulong8 x)
 {
   return (short8)(convert_short4(x.lo), convert_short4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(ulong16 x)
 {
   return (short16)(convert_short8(x.lo), convert_short8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(ulong3 x)
 {
   return (short3)(convert_short2(x.s01), convert_short(x.s2));
@@ -13208,37 +13208,37 @@ short3 convert_short3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(ulong x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(ulong2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(ulong4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(ulong8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(ulong16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(ulong3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -13246,37 +13246,37 @@ ushort3 convert_ushort3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(ulong x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(ulong2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(ulong4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(ulong8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(ulong16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(ulong3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -13284,37 +13284,37 @@ ushort3 convert_ushort3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(ulong x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(ulong2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(ulong4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(ulong8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(ulong16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(ulong3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -13322,37 +13322,37 @@ ushort3 convert_ushort3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(ulong x)
 {
   return (ushort)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(ulong2 x)
 {
   return (ushort2)(convert_ushort(x.lo), convert_ushort(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(ulong4 x)
 {
   return (ushort4)(convert_ushort2(x.lo), convert_ushort2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(ulong8 x)
 {
   return (ushort8)(convert_ushort4(x.lo), convert_ushort4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(ulong16 x)
 {
   return (ushort16)(convert_ushort8(x.lo), convert_ushort8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(ulong3 x)
 {
   return (ushort3)(convert_ushort2(x.s01), convert_ushort(x.s2));
@@ -13360,37 +13360,37 @@ ushort3 convert_ushort3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(ulong x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(ulong2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(ulong4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(ulong8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(ulong16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(ulong3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -13398,37 +13398,37 @@ int3 convert_int3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(ulong x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(ulong2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(ulong4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(ulong8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(ulong16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(ulong3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -13436,37 +13436,37 @@ int3 convert_int3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(ulong x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(ulong2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(ulong4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(ulong8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(ulong16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(ulong3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -13474,37 +13474,37 @@ int3 convert_int3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(ulong x)
 {
   return (int)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(ulong2 x)
 {
   return (int2)(convert_int(x.lo), convert_int(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(ulong4 x)
 {
   return (int4)(convert_int2(x.lo), convert_int2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(ulong8 x)
 {
   return (int8)(convert_int4(x.lo), convert_int4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(ulong16 x)
 {
   return (int16)(convert_int8(x.lo), convert_int8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(ulong3 x)
 {
   return (int3)(convert_int2(x.s01), convert_int(x.s2));
@@ -13512,37 +13512,37 @@ int3 convert_int3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(ulong x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(ulong2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(ulong4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(ulong8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(ulong16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(ulong3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -13550,37 +13550,37 @@ uint3 convert_uint3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(ulong x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(ulong2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(ulong4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(ulong8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(ulong16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(ulong3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -13588,37 +13588,37 @@ uint3 convert_uint3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(ulong x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(ulong2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(ulong4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(ulong8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(ulong16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(ulong3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -13626,37 +13626,37 @@ uint3 convert_uint3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(ulong x)
 {
   return (uint)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(ulong2 x)
 {
   return (uint2)(convert_uint(x.lo), convert_uint(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(ulong4 x)
 {
   return (uint4)(convert_uint2(x.lo), convert_uint2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(ulong8 x)
 {
   return (uint8)(convert_uint4(x.lo), convert_uint4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(ulong16 x)
 {
   return (uint16)(convert_uint8(x.lo), convert_uint8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(ulong3 x)
 {
   return (uint3)(convert_uint2(x.s01), convert_uint(x.s2));
@@ -13664,37 +13664,37 @@ uint3 convert_uint3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(ulong x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(ulong2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(ulong4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(ulong8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(ulong16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(ulong3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -13702,37 +13702,37 @@ long3 convert_long3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(ulong x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(ulong2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(ulong4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(ulong8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(ulong16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(ulong3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -13740,37 +13740,37 @@ long3 convert_long3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(ulong x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(ulong2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(ulong4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(ulong8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(ulong16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(ulong3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -13778,37 +13778,37 @@ long3 convert_long3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(ulong x)
 {
   return (long)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(ulong2 x)
 {
   return (long2)(convert_long(x.lo), convert_long(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(ulong4 x)
 {
   return (long4)(convert_long2(x.lo), convert_long2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(ulong8 x)
 {
   return (long8)(convert_long4(x.lo), convert_long4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(ulong16 x)
 {
   return (long16)(convert_long8(x.lo), convert_long8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(ulong3 x)
 {
   return (long3)(convert_long2(x.s01), convert_long(x.s2));
@@ -13816,37 +13816,37 @@ long3 convert_long3_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(ulong x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(ulong2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(ulong4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(ulong8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(ulong16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(ulong3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -13854,37 +13854,37 @@ ulong3 convert_ulong3_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(ulong x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(ulong2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(ulong4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(ulong8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(ulong16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(ulong3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -13892,37 +13892,37 @@ ulong3 convert_ulong3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(ulong x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(ulong2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(ulong4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(ulong8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(ulong16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(ulong3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -13930,37 +13930,37 @@ ulong3 convert_ulong3_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(ulong x)
 {
   return (ulong)x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(ulong2 x)
 {
   return (ulong2)(convert_ulong(x.lo), convert_ulong(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(ulong4 x)
 {
   return (ulong4)(convert_ulong2(x.lo), convert_ulong2(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(ulong8 x)
 {
   return (ulong8)(convert_ulong4(x.lo), convert_ulong4(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(ulong16 x)
 {
   return (ulong16)(convert_ulong8(x.lo), convert_ulong8(x.hi));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(ulong3 x)
 {
   return (ulong3)(convert_ulong2(x.s01), convert_ulong(x.s2));
@@ -13968,234 +13968,234 @@ ulong3 convert_ulong3_rtn(ulong3 x)
 #endif
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(char x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(char2 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(char3 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(char4 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(char8 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(char16 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(char x)
 {
   x = max(x, (char)0);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(char2 x)
 {
   x = max(x, (char)0);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(char3 x)
 {
   x = max(x, (char)0);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(char4 x)
 {
   x = max(x, (char)0);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(char8 x)
 {
   x = max(x, (char)0);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(char16 x)
 {
   x = max(x, (char)0);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(char x)
 {
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(char2 x)
 {
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(char3 x)
 {
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(char4 x)
 {
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(char8 x)
 {
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(char16 x)
 {
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(char x)
 {
   x = max(x, (char)0);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(char2 x)
 {
   x = max(x, (char)0);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(char3 x)
 {
   x = max(x, (char)0);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(char4 x)
 {
   x = max(x, (char)0);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(char8 x)
 {
   x = max(x, (char)0);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(char16 x)
 {
   x = max(x, (char)0);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(char x)
 {
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(char2 x)
 {
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(char3 x)
 {
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(char4 x)
 {
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(char8 x)
 {
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(char16 x)
 {
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(char x)
 {
   x = max(x, (char)0);
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(char2 x)
 {
   x = max(x, (char)0);
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(char3 x)
 {
   x = max(x, (char)0);
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(char4 x)
 {
   x = max(x, (char)0);
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(char8 x)
 {
   x = max(x, (char)0);
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(char16 x)
 {
   x = max(x, (char)0);
@@ -14203,7 +14203,7 @@ uint16 convert_uint16_sat(char16 x)
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(char x)
 {
   return convert_long(x);
@@ -14211,7 +14211,7 @@ long convert_long_sat(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(char2 x)
 {
   return convert_long2(x);
@@ -14219,7 +14219,7 @@ long2 convert_long2_sat(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(char3 x)
 {
   return convert_long3(x);
@@ -14227,7 +14227,7 @@ long3 convert_long3_sat(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(char4 x)
 {
   return convert_long4(x);
@@ -14235,7 +14235,7 @@ long4 convert_long4_sat(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(char8 x)
 {
   return convert_long8(x);
@@ -14243,7 +14243,7 @@ long8 convert_long8_sat(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(char16 x)
 {
   return convert_long16(x);
@@ -14251,7 +14251,7 @@ long16 convert_long16_sat(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(char x)
 {
   x = max(x, (char)0);
@@ -14260,7 +14260,7 @@ ulong convert_ulong_sat(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(char2 x)
 {
   x = max(x, (char)0);
@@ -14269,7 +14269,7 @@ ulong2 convert_ulong2_sat(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(char3 x)
 {
   x = max(x, (char)0);
@@ -14278,7 +14278,7 @@ ulong3 convert_ulong3_sat(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(char4 x)
 {
   x = max(x, (char)0);
@@ -14287,7 +14287,7 @@ ulong4 convert_ulong4_sat(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(char8 x)
 {
   x = max(x, (char)0);
@@ -14296,7 +14296,7 @@ ulong8 convert_ulong8_sat(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(char16 x)
 {
   x = max(x, (char)0);
@@ -14304,230 +14304,230 @@ ulong16 convert_ulong16_sat(char16 x)
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(uchar x)
 {
   x = min(x, (uchar)CHAR_MAX);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(uchar2 x)
 {
   x = min(x, (uchar)CHAR_MAX);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(uchar3 x)
 {
   x = min(x, (uchar)CHAR_MAX);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(uchar4 x)
 {
   x = min(x, (uchar)CHAR_MAX);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(uchar8 x)
 {
   x = min(x, (uchar)CHAR_MAX);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(uchar16 x)
 {
   x = min(x, (uchar)CHAR_MAX);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(uchar x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(uchar2 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(uchar3 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(uchar4 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(uchar8 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(uchar16 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(uchar x)
 {
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(uchar2 x)
 {
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(uchar3 x)
 {
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(uchar4 x)
 {
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(uchar8 x)
 {
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(uchar16 x)
 {
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(uchar x)
 {
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(uchar2 x)
 {
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(uchar3 x)
 {
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(uchar4 x)
 {
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(uchar8 x)
 {
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(uchar16 x)
 {
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(uchar x)
 {
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(uchar2 x)
 {
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(uchar3 x)
 {
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(uchar4 x)
 {
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(uchar8 x)
 {
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(uchar16 x)
 {
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(uchar x)
 {
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(uchar2 x)
 {
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(uchar3 x)
 {
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(uchar4 x)
 {
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(uchar8 x)
 {
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(uchar16 x)
 {
   return convert_uint16(x);
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(uchar x)
 {
   return convert_long(x);
@@ -14535,7 +14535,7 @@ long convert_long_sat(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(uchar2 x)
 {
   return convert_long2(x);
@@ -14543,7 +14543,7 @@ long2 convert_long2_sat(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(uchar3 x)
 {
   return convert_long3(x);
@@ -14551,7 +14551,7 @@ long3 convert_long3_sat(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(uchar4 x)
 {
   return convert_long4(x);
@@ -14559,7 +14559,7 @@ long4 convert_long4_sat(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(uchar8 x)
 {
   return convert_long8(x);
@@ -14567,7 +14567,7 @@ long8 convert_long8_sat(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(uchar16 x)
 {
   return convert_long16(x);
@@ -14575,7 +14575,7 @@ long16 convert_long16_sat(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(uchar x)
 {
   return convert_ulong(x);
@@ -14583,7 +14583,7 @@ ulong convert_ulong_sat(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(uchar2 x)
 {
   return convert_ulong2(x);
@@ -14591,7 +14591,7 @@ ulong2 convert_ulong2_sat(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(uchar3 x)
 {
   return convert_ulong3(x);
@@ -14599,7 +14599,7 @@ ulong3 convert_ulong3_sat(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(uchar4 x)
 {
   return convert_ulong4(x);
@@ -14607,7 +14607,7 @@ ulong4 convert_ulong4_sat(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(uchar8 x)
 {
   return convert_ulong8(x);
@@ -14615,247 +14615,247 @@ ulong8 convert_ulong8_sat(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(uchar16 x)
 {
   return convert_ulong16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(short x)
 {
   x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(short2 x)
 {
   x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(short3 x)
 {
   x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(short4 x)
 {
   x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(short8 x)
 {
   x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(short16 x)
 {
   x = clamp(x, (short)CHAR_MIN, (short)CHAR_MAX);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(short x)
 {
   x = clamp(x, (short)0, (short)UCHAR_MAX);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(short2 x)
 {
   x = clamp(x, (short)0, (short)UCHAR_MAX);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(short3 x)
 {
   x = clamp(x, (short)0, (short)UCHAR_MAX);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(short4 x)
 {
   x = clamp(x, (short)0, (short)UCHAR_MAX);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(short8 x)
 {
   x = clamp(x, (short)0, (short)UCHAR_MAX);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(short16 x)
 {
   x = clamp(x, (short)0, (short)UCHAR_MAX);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(short x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(short2 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(short3 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(short4 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(short8 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(short16 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(short x)
 {
   x = max(x, (short)0);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(short2 x)
 {
   x = max(x, (short)0);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(short3 x)
 {
   x = max(x, (short)0);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(short4 x)
 {
   x = max(x, (short)0);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(short8 x)
 {
   x = max(x, (short)0);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(short16 x)
 {
   x = max(x, (short)0);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(short x)
 {
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(short2 x)
 {
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(short3 x)
 {
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(short4 x)
 {
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(short8 x)
 {
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(short16 x)
 {
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(short x)
 {
   x = max(x, (short)0);
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(short2 x)
 {
   x = max(x, (short)0);
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(short3 x)
 {
   x = max(x, (short)0);
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(short4 x)
 {
   x = max(x, (short)0);
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(short8 x)
 {
   x = max(x, (short)0);
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(short16 x)
 {
   x = max(x, (short)0);
@@ -14863,7 +14863,7 @@ uint16 convert_uint16_sat(short16 x)
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(short x)
 {
   return convert_long(x);
@@ -14871,7 +14871,7 @@ long convert_long_sat(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(short2 x)
 {
   return convert_long2(x);
@@ -14879,7 +14879,7 @@ long2 convert_long2_sat(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(short3 x)
 {
   return convert_long3(x);
@@ -14887,7 +14887,7 @@ long3 convert_long3_sat(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(short4 x)
 {
   return convert_long4(x);
@@ -14895,7 +14895,7 @@ long4 convert_long4_sat(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(short8 x)
 {
   return convert_long8(x);
@@ -14903,7 +14903,7 @@ long8 convert_long8_sat(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(short16 x)
 {
   return convert_long16(x);
@@ -14911,7 +14911,7 @@ long16 convert_long16_sat(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(short x)
 {
   x = max(x, (short)0);
@@ -14920,7 +14920,7 @@ ulong convert_ulong_sat(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(short2 x)
 {
   x = max(x, (short)0);
@@ -14929,7 +14929,7 @@ ulong2 convert_ulong2_sat(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(short3 x)
 {
   x = max(x, (short)0);
@@ -14938,7 +14938,7 @@ ulong3 convert_ulong3_sat(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(short4 x)
 {
   x = max(x, (short)0);
@@ -14947,7 +14947,7 @@ ulong4 convert_ulong4_sat(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(short8 x)
 {
   x = max(x, (short)0);
@@ -14956,7 +14956,7 @@ ulong8 convert_ulong8_sat(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(short16 x)
 {
   x = max(x, (short)0);
@@ -14964,242 +14964,242 @@ ulong16 convert_ulong16_sat(short16 x)
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(ushort x)
 {
   x = min(x, (ushort)CHAR_MAX);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(ushort2 x)
 {
   x = min(x, (ushort)CHAR_MAX);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(ushort3 x)
 {
   x = min(x, (ushort)CHAR_MAX);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(ushort4 x)
 {
   x = min(x, (ushort)CHAR_MAX);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(ushort8 x)
 {
   x = min(x, (ushort)CHAR_MAX);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(ushort16 x)
 {
   x = min(x, (ushort)CHAR_MAX);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(ushort x)
 {
   x = min(x, (ushort)UCHAR_MAX);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(ushort2 x)
 {
   x = min(x, (ushort)UCHAR_MAX);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(ushort3 x)
 {
   x = min(x, (ushort)UCHAR_MAX);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(ushort4 x)
 {
   x = min(x, (ushort)UCHAR_MAX);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(ushort8 x)
 {
   x = min(x, (ushort)UCHAR_MAX);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(ushort16 x)
 {
   x = min(x, (ushort)UCHAR_MAX);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(ushort x)
 {
   x = min(x, (ushort)SHRT_MAX);
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(ushort2 x)
 {
   x = min(x, (ushort)SHRT_MAX);
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(ushort3 x)
 {
   x = min(x, (ushort)SHRT_MAX);
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(ushort4 x)
 {
   x = min(x, (ushort)SHRT_MAX);
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(ushort8 x)
 {
   x = min(x, (ushort)SHRT_MAX);
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(ushort16 x)
 {
   x = min(x, (ushort)SHRT_MAX);
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(ushort x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(ushort2 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(ushort3 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(ushort4 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(ushort8 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(ushort16 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(ushort x)
 {
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(ushort2 x)
 {
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(ushort3 x)
 {
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(ushort4 x)
 {
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(ushort8 x)
 {
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(ushort16 x)
 {
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(ushort x)
 {
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(ushort2 x)
 {
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(ushort3 x)
 {
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(ushort4 x)
 {
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(ushort8 x)
 {
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(ushort16 x)
 {
   return convert_uint16(x);
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(ushort x)
 {
   return convert_long(x);
@@ -15207,7 +15207,7 @@ long convert_long_sat(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(ushort2 x)
 {
   return convert_long2(x);
@@ -15215,7 +15215,7 @@ long2 convert_long2_sat(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(ushort3 x)
 {
   return convert_long3(x);
@@ -15223,7 +15223,7 @@ long3 convert_long3_sat(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(ushort4 x)
 {
   return convert_long4(x);
@@ -15231,7 +15231,7 @@ long4 convert_long4_sat(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(ushort8 x)
 {
   return convert_long8(x);
@@ -15239,7 +15239,7 @@ long8 convert_long8_sat(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(ushort16 x)
 {
   return convert_long16(x);
@@ -15247,7 +15247,7 @@ long16 convert_long16_sat(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(ushort x)
 {
   return convert_ulong(x);
@@ -15255,7 +15255,7 @@ ulong convert_ulong_sat(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(ushort2 x)
 {
   return convert_ulong2(x);
@@ -15263,7 +15263,7 @@ ulong2 convert_ulong2_sat(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(ushort3 x)
 {
   return convert_ulong3(x);
@@ -15271,7 +15271,7 @@ ulong3 convert_ulong3_sat(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(ushort4 x)
 {
   return convert_ulong4(x);
@@ -15279,7 +15279,7 @@ ulong4 convert_ulong4_sat(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(ushort8 x)
 {
   return convert_ulong8(x);
@@ -15287,253 +15287,253 @@ ulong8 convert_ulong8_sat(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(ushort16 x)
 {
   return convert_ulong16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(int x)
 {
   x = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(int2 x)
 {
   x = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(int3 x)
 {
   x = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(int4 x)
 {
   x = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(int8 x)
 {
   x = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(int16 x)
 {
   x = clamp(x, (int)CHAR_MIN, (int)CHAR_MAX);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(int x)
 {
   x = clamp(x, (int)0, (int)UCHAR_MAX);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(int2 x)
 {
   x = clamp(x, (int)0, (int)UCHAR_MAX);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(int3 x)
 {
   x = clamp(x, (int)0, (int)UCHAR_MAX);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(int4 x)
 {
   x = clamp(x, (int)0, (int)UCHAR_MAX);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(int8 x)
 {
   x = clamp(x, (int)0, (int)UCHAR_MAX);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(int16 x)
 {
   x = clamp(x, (int)0, (int)UCHAR_MAX);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(int x)
 {
   x = clamp(x, (int)SHRT_MIN, (int)SHRT_MAX);
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(int2 x)
 {
   x = clamp(x, (int)SHRT_MIN, (int)SHRT_MAX);
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(int3 x)
 {
   x = clamp(x, (int)SHRT_MIN, (int)SHRT_MAX);
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(int4 x)
 {
   x = clamp(x, (int)SHRT_MIN, (int)SHRT_MAX);
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(int8 x)
 {
   x = clamp(x, (int)SHRT_MIN, (int)SHRT_MAX);
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(int16 x)
 {
   x = clamp(x, (int)SHRT_MIN, (int)SHRT_MAX);
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(int x)
 {
   x = clamp(x, (int)0, (int)USHRT_MAX);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(int2 x)
 {
   x = clamp(x, (int)0, (int)USHRT_MAX);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(int3 x)
 {
   x = clamp(x, (int)0, (int)USHRT_MAX);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(int4 x)
 {
   x = clamp(x, (int)0, (int)USHRT_MAX);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(int8 x)
 {
   x = clamp(x, (int)0, (int)USHRT_MAX);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(int16 x)
 {
   x = clamp(x, (int)0, (int)USHRT_MAX);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(int x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(int2 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(int3 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(int4 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(int8 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(int16 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(int x)
 {
   x = max(x, (int)0);
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(int2 x)
 {
   x = max(x, (int)0);
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(int3 x)
 {
   x = max(x, (int)0);
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(int4 x)
 {
   x = max(x, (int)0);
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(int8 x)
 {
   x = max(x, (int)0);
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(int16 x)
 {
   x = max(x, (int)0);
@@ -15541,7 +15541,7 @@ uint16 convert_uint16_sat(int16 x)
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(int x)
 {
   return convert_long(x);
@@ -15549,7 +15549,7 @@ long convert_long_sat(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(int2 x)
 {
   return convert_long2(x);
@@ -15557,7 +15557,7 @@ long2 convert_long2_sat(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(int3 x)
 {
   return convert_long3(x);
@@ -15565,7 +15565,7 @@ long3 convert_long3_sat(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(int4 x)
 {
   return convert_long4(x);
@@ -15573,7 +15573,7 @@ long4 convert_long4_sat(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(int8 x)
 {
   return convert_long8(x);
@@ -15581,7 +15581,7 @@ long8 convert_long8_sat(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(int16 x)
 {
   return convert_long16(x);
@@ -15589,7 +15589,7 @@ long16 convert_long16_sat(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(int x)
 {
   x = max(x, (int)0);
@@ -15598,7 +15598,7 @@ ulong convert_ulong_sat(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(int2 x)
 {
   x = max(x, (int)0);
@@ -15607,7 +15607,7 @@ ulong2 convert_ulong2_sat(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(int3 x)
 {
   x = max(x, (int)0);
@@ -15616,7 +15616,7 @@ ulong3 convert_ulong3_sat(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(int4 x)
 {
   x = max(x, (int)0);
@@ -15625,7 +15625,7 @@ ulong4 convert_ulong4_sat(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(int8 x)
 {
   x = max(x, (int)0);
@@ -15634,7 +15634,7 @@ ulong8 convert_ulong8_sat(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(int16 x)
 {
   x = max(x, (int)0);
@@ -15642,254 +15642,254 @@ ulong16 convert_ulong16_sat(int16 x)
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(uint x)
 {
   x = min(x, (uint)CHAR_MAX);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(uint2 x)
 {
   x = min(x, (uint)CHAR_MAX);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(uint3 x)
 {
   x = min(x, (uint)CHAR_MAX);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(uint4 x)
 {
   x = min(x, (uint)CHAR_MAX);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(uint8 x)
 {
   x = min(x, (uint)CHAR_MAX);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(uint16 x)
 {
   x = min(x, (uint)CHAR_MAX);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(uint x)
 {
   x = min(x, (uint)UCHAR_MAX);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(uint2 x)
 {
   x = min(x, (uint)UCHAR_MAX);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(uint3 x)
 {
   x = min(x, (uint)UCHAR_MAX);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(uint4 x)
 {
   x = min(x, (uint)UCHAR_MAX);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(uint8 x)
 {
   x = min(x, (uint)UCHAR_MAX);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(uint16 x)
 {
   x = min(x, (uint)UCHAR_MAX);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(uint x)
 {
   x = min(x, (uint)SHRT_MAX);
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(uint2 x)
 {
   x = min(x, (uint)SHRT_MAX);
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(uint3 x)
 {
   x = min(x, (uint)SHRT_MAX);
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(uint4 x)
 {
   x = min(x, (uint)SHRT_MAX);
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(uint8 x)
 {
   x = min(x, (uint)SHRT_MAX);
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(uint16 x)
 {
   x = min(x, (uint)SHRT_MAX);
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(uint x)
 {
   x = min(x, (uint)USHRT_MAX);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(uint2 x)
 {
   x = min(x, (uint)USHRT_MAX);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(uint3 x)
 {
   x = min(x, (uint)USHRT_MAX);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(uint4 x)
 {
   x = min(x, (uint)USHRT_MAX);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(uint8 x)
 {
   x = min(x, (uint)USHRT_MAX);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(uint16 x)
 {
   x = min(x, (uint)USHRT_MAX);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(uint x)
 {
   x = min(x, (uint)INT_MAX);
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(uint2 x)
 {
   x = min(x, (uint)INT_MAX);
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(uint3 x)
 {
   x = min(x, (uint)INT_MAX);
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(uint4 x)
 {
   x = min(x, (uint)INT_MAX);
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(uint8 x)
 {
   x = min(x, (uint)INT_MAX);
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(uint16 x)
 {
   x = min(x, (uint)INT_MAX);
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(uint x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(uint2 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(uint3 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(uint4 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(uint8 x)
 {
   return x;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(uint16 x)
 {
   return x;
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(uint x)
 {
   return convert_long(x);
@@ -15897,7 +15897,7 @@ long convert_long_sat(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(uint2 x)
 {
   return convert_long2(x);
@@ -15905,7 +15905,7 @@ long2 convert_long2_sat(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(uint3 x)
 {
   return convert_long3(x);
@@ -15913,7 +15913,7 @@ long3 convert_long3_sat(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(uint4 x)
 {
   return convert_long4(x);
@@ -15921,7 +15921,7 @@ long4 convert_long4_sat(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(uint8 x)
 {
   return convert_long8(x);
@@ -15929,7 +15929,7 @@ long8 convert_long8_sat(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(uint16 x)
 {
   return convert_long16(x);
@@ -15937,7 +15937,7 @@ long16 convert_long16_sat(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(uint x)
 {
   return convert_ulong(x);
@@ -15945,7 +15945,7 @@ ulong convert_ulong_sat(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(uint2 x)
 {
   return convert_ulong2(x);
@@ -15953,7 +15953,7 @@ ulong2 convert_ulong2_sat(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(uint3 x)
 {
   return convert_ulong3(x);
@@ -15961,7 +15961,7 @@ ulong3 convert_ulong3_sat(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(uint4 x)
 {
   return convert_ulong4(x);
@@ -15969,7 +15969,7 @@ ulong4 convert_ulong4_sat(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(uint8 x)
 {
   return convert_ulong8(x);
@@ -15977,7 +15977,7 @@ ulong8 convert_ulong8_sat(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(uint16 x)
 {
   return convert_ulong16(x);
@@ -15985,7 +15985,7 @@ ulong16 convert_ulong16_sat(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(long x)
 {
   x = clamp(x, (long)CHAR_MIN, (long)CHAR_MAX);
@@ -15994,7 +15994,7 @@ char convert_char_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(long2 x)
 {
   x = clamp(x, (long)CHAR_MIN, (long)CHAR_MAX);
@@ -16003,7 +16003,7 @@ char2 convert_char2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(long3 x)
 {
   x = clamp(x, (long)CHAR_MIN, (long)CHAR_MAX);
@@ -16012,7 +16012,7 @@ char3 convert_char3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(long4 x)
 {
   x = clamp(x, (long)CHAR_MIN, (long)CHAR_MAX);
@@ -16021,7 +16021,7 @@ char4 convert_char4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(long8 x)
 {
   x = clamp(x, (long)CHAR_MIN, (long)CHAR_MAX);
@@ -16030,7 +16030,7 @@ char8 convert_char8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(long16 x)
 {
   x = clamp(x, (long)CHAR_MIN, (long)CHAR_MAX);
@@ -16039,7 +16039,7 @@ char16 convert_char16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(long x)
 {
   x = clamp(x, (long)0, (long)UCHAR_MAX);
@@ -16048,7 +16048,7 @@ uchar convert_uchar_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(long2 x)
 {
   x = clamp(x, (long)0, (long)UCHAR_MAX);
@@ -16057,7 +16057,7 @@ uchar2 convert_uchar2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(long3 x)
 {
   x = clamp(x, (long)0, (long)UCHAR_MAX);
@@ -16066,7 +16066,7 @@ uchar3 convert_uchar3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(long4 x)
 {
   x = clamp(x, (long)0, (long)UCHAR_MAX);
@@ -16075,7 +16075,7 @@ uchar4 convert_uchar4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(long8 x)
 {
   x = clamp(x, (long)0, (long)UCHAR_MAX);
@@ -16084,7 +16084,7 @@ uchar8 convert_uchar8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(long16 x)
 {
   x = clamp(x, (long)0, (long)UCHAR_MAX);
@@ -16093,7 +16093,7 @@ uchar16 convert_uchar16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(long x)
 {
   x = clamp(x, (long)SHRT_MIN, (long)SHRT_MAX);
@@ -16102,7 +16102,7 @@ short convert_short_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(long2 x)
 {
   x = clamp(x, (long)SHRT_MIN, (long)SHRT_MAX);
@@ -16111,7 +16111,7 @@ short2 convert_short2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(long3 x)
 {
   x = clamp(x, (long)SHRT_MIN, (long)SHRT_MAX);
@@ -16120,7 +16120,7 @@ short3 convert_short3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(long4 x)
 {
   x = clamp(x, (long)SHRT_MIN, (long)SHRT_MAX);
@@ -16129,7 +16129,7 @@ short4 convert_short4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(long8 x)
 {
   x = clamp(x, (long)SHRT_MIN, (long)SHRT_MAX);
@@ -16138,7 +16138,7 @@ short8 convert_short8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(long16 x)
 {
   x = clamp(x, (long)SHRT_MIN, (long)SHRT_MAX);
@@ -16147,7 +16147,7 @@ short16 convert_short16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(long x)
 {
   x = clamp(x, (long)0, (long)USHRT_MAX);
@@ -16156,7 +16156,7 @@ ushort convert_ushort_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(long2 x)
 {
   x = clamp(x, (long)0, (long)USHRT_MAX);
@@ -16165,7 +16165,7 @@ ushort2 convert_ushort2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(long3 x)
 {
   x = clamp(x, (long)0, (long)USHRT_MAX);
@@ -16174,7 +16174,7 @@ ushort3 convert_ushort3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(long4 x)
 {
   x = clamp(x, (long)0, (long)USHRT_MAX);
@@ -16183,7 +16183,7 @@ ushort4 convert_ushort4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(long8 x)
 {
   x = clamp(x, (long)0, (long)USHRT_MAX);
@@ -16192,7 +16192,7 @@ ushort8 convert_ushort8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(long16 x)
 {
   x = clamp(x, (long)0, (long)USHRT_MAX);
@@ -16201,7 +16201,7 @@ ushort16 convert_ushort16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(long x)
 {
   x = clamp(x, (long)INT_MIN, (long)INT_MAX);
@@ -16210,7 +16210,7 @@ int convert_int_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(long2 x)
 {
   x = clamp(x, (long)INT_MIN, (long)INT_MAX);
@@ -16219,7 +16219,7 @@ int2 convert_int2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(long3 x)
 {
   x = clamp(x, (long)INT_MIN, (long)INT_MAX);
@@ -16228,7 +16228,7 @@ int3 convert_int3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(long4 x)
 {
   x = clamp(x, (long)INT_MIN, (long)INT_MAX);
@@ -16237,7 +16237,7 @@ int4 convert_int4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(long8 x)
 {
   x = clamp(x, (long)INT_MIN, (long)INT_MAX);
@@ -16246,7 +16246,7 @@ int8 convert_int8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(long16 x)
 {
   x = clamp(x, (long)INT_MIN, (long)INT_MAX);
@@ -16255,7 +16255,7 @@ int16 convert_int16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(long x)
 {
   x = clamp(x, (long)0, (long)UINT_MAX);
@@ -16264,7 +16264,7 @@ uint convert_uint_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(long2 x)
 {
   x = clamp(x, (long)0, (long)UINT_MAX);
@@ -16273,7 +16273,7 @@ uint2 convert_uint2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(long3 x)
 {
   x = clamp(x, (long)0, (long)UINT_MAX);
@@ -16282,7 +16282,7 @@ uint3 convert_uint3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(long4 x)
 {
   x = clamp(x, (long)0, (long)UINT_MAX);
@@ -16291,7 +16291,7 @@ uint4 convert_uint4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(long8 x)
 {
   x = clamp(x, (long)0, (long)UINT_MAX);
@@ -16300,7 +16300,7 @@ uint8 convert_uint8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(long16 x)
 {
   x = clamp(x, (long)0, (long)UINT_MAX);
@@ -16309,7 +16309,7 @@ uint16 convert_uint16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(long x)
 {
   return x;
@@ -16317,7 +16317,7 @@ long convert_long_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(long2 x)
 {
   return x;
@@ -16325,7 +16325,7 @@ long2 convert_long2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(long3 x)
 {
   return x;
@@ -16333,7 +16333,7 @@ long3 convert_long3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(long4 x)
 {
   return x;
@@ -16341,7 +16341,7 @@ long4 convert_long4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(long8 x)
 {
   return x;
@@ -16349,7 +16349,7 @@ long8 convert_long8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(long16 x)
 {
   return x;
@@ -16357,7 +16357,7 @@ long16 convert_long16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(long x)
 {
   x = max(x, (long)0);
@@ -16366,7 +16366,7 @@ ulong convert_ulong_sat(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(long2 x)
 {
   x = max(x, (long)0);
@@ -16375,7 +16375,7 @@ ulong2 convert_ulong2_sat(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(long3 x)
 {
   x = max(x, (long)0);
@@ -16384,7 +16384,7 @@ ulong3 convert_ulong3_sat(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(long4 x)
 {
   x = max(x, (long)0);
@@ -16393,7 +16393,7 @@ ulong4 convert_ulong4_sat(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(long8 x)
 {
   x = max(x, (long)0);
@@ -16402,7 +16402,7 @@ ulong8 convert_ulong8_sat(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(long16 x)
 {
   x = max(x, (long)0);
@@ -16411,7 +16411,7 @@ ulong16 convert_ulong16_sat(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(ulong x)
 {
   x = min(x, (ulong)CHAR_MAX);
@@ -16420,7 +16420,7 @@ char convert_char_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(ulong2 x)
 {
   x = min(x, (ulong)CHAR_MAX);
@@ -16429,7 +16429,7 @@ char2 convert_char2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(ulong3 x)
 {
   x = min(x, (ulong)CHAR_MAX);
@@ -16438,7 +16438,7 @@ char3 convert_char3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(ulong4 x)
 {
   x = min(x, (ulong)CHAR_MAX);
@@ -16447,7 +16447,7 @@ char4 convert_char4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(ulong8 x)
 {
   x = min(x, (ulong)CHAR_MAX);
@@ -16456,7 +16456,7 @@ char8 convert_char8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(ulong16 x)
 {
   x = min(x, (ulong)CHAR_MAX);
@@ -16465,7 +16465,7 @@ char16 convert_char16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(ulong x)
 {
   x = min(x, (ulong)UCHAR_MAX);
@@ -16474,7 +16474,7 @@ uchar convert_uchar_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(ulong2 x)
 {
   x = min(x, (ulong)UCHAR_MAX);
@@ -16483,7 +16483,7 @@ uchar2 convert_uchar2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(ulong3 x)
 {
   x = min(x, (ulong)UCHAR_MAX);
@@ -16492,7 +16492,7 @@ uchar3 convert_uchar3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(ulong4 x)
 {
   x = min(x, (ulong)UCHAR_MAX);
@@ -16501,7 +16501,7 @@ uchar4 convert_uchar4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(ulong8 x)
 {
   x = min(x, (ulong)UCHAR_MAX);
@@ -16510,7 +16510,7 @@ uchar8 convert_uchar8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(ulong16 x)
 {
   x = min(x, (ulong)UCHAR_MAX);
@@ -16519,7 +16519,7 @@ uchar16 convert_uchar16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(ulong x)
 {
   x = min(x, (ulong)SHRT_MAX);
@@ -16528,7 +16528,7 @@ short convert_short_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(ulong2 x)
 {
   x = min(x, (ulong)SHRT_MAX);
@@ -16537,7 +16537,7 @@ short2 convert_short2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(ulong3 x)
 {
   x = min(x, (ulong)SHRT_MAX);
@@ -16546,7 +16546,7 @@ short3 convert_short3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(ulong4 x)
 {
   x = min(x, (ulong)SHRT_MAX);
@@ -16555,7 +16555,7 @@ short4 convert_short4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(ulong8 x)
 {
   x = min(x, (ulong)SHRT_MAX);
@@ -16564,7 +16564,7 @@ short8 convert_short8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(ulong16 x)
 {
   x = min(x, (ulong)SHRT_MAX);
@@ -16573,7 +16573,7 @@ short16 convert_short16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(ulong x)
 {
   x = min(x, (ulong)USHRT_MAX);
@@ -16582,7 +16582,7 @@ ushort convert_ushort_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(ulong2 x)
 {
   x = min(x, (ulong)USHRT_MAX);
@@ -16591,7 +16591,7 @@ ushort2 convert_ushort2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(ulong3 x)
 {
   x = min(x, (ulong)USHRT_MAX);
@@ -16600,7 +16600,7 @@ ushort3 convert_ushort3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(ulong4 x)
 {
   x = min(x, (ulong)USHRT_MAX);
@@ -16609,7 +16609,7 @@ ushort4 convert_ushort4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(ulong8 x)
 {
   x = min(x, (ulong)USHRT_MAX);
@@ -16618,7 +16618,7 @@ ushort8 convert_ushort8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(ulong16 x)
 {
   x = min(x, (ulong)USHRT_MAX);
@@ -16627,7 +16627,7 @@ ushort16 convert_ushort16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(ulong x)
 {
   x = min(x, (ulong)INT_MAX);
@@ -16636,7 +16636,7 @@ int convert_int_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(ulong2 x)
 {
   x = min(x, (ulong)INT_MAX);
@@ -16645,7 +16645,7 @@ int2 convert_int2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(ulong3 x)
 {
   x = min(x, (ulong)INT_MAX);
@@ -16654,7 +16654,7 @@ int3 convert_int3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(ulong4 x)
 {
   x = min(x, (ulong)INT_MAX);
@@ -16663,7 +16663,7 @@ int4 convert_int4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(ulong8 x)
 {
   x = min(x, (ulong)INT_MAX);
@@ -16672,7 +16672,7 @@ int8 convert_int8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(ulong16 x)
 {
   x = min(x, (ulong)INT_MAX);
@@ -16681,7 +16681,7 @@ int16 convert_int16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(ulong x)
 {
   x = min(x, (ulong)UINT_MAX);
@@ -16690,7 +16690,7 @@ uint convert_uint_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(ulong2 x)
 {
   x = min(x, (ulong)UINT_MAX);
@@ -16699,7 +16699,7 @@ uint2 convert_uint2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(ulong3 x)
 {
   x = min(x, (ulong)UINT_MAX);
@@ -16708,7 +16708,7 @@ uint3 convert_uint3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(ulong4 x)
 {
   x = min(x, (ulong)UINT_MAX);
@@ -16717,7 +16717,7 @@ uint4 convert_uint4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(ulong8 x)
 {
   x = min(x, (ulong)UINT_MAX);
@@ -16726,7 +16726,7 @@ uint8 convert_uint8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(ulong16 x)
 {
   x = min(x, (ulong)UINT_MAX);
@@ -16735,7 +16735,7 @@ uint16 convert_uint16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(ulong x)
 {
   x = min(x, (ulong)LONG_MAX);
@@ -16744,7 +16744,7 @@ long convert_long_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(ulong2 x)
 {
   x = min(x, (ulong)LONG_MAX);
@@ -16753,7 +16753,7 @@ long2 convert_long2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(ulong3 x)
 {
   x = min(x, (ulong)LONG_MAX);
@@ -16762,7 +16762,7 @@ long3 convert_long3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(ulong4 x)
 {
   x = min(x, (ulong)LONG_MAX);
@@ -16771,7 +16771,7 @@ long4 convert_long4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(ulong8 x)
 {
   x = min(x, (ulong)LONG_MAX);
@@ -16780,7 +16780,7 @@ long8 convert_long8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(ulong16 x)
 {
   x = min(x, (ulong)LONG_MAX);
@@ -16789,7 +16789,7 @@ long16 convert_long16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(ulong x)
 {
   return x;
@@ -16797,7 +16797,7 @@ ulong convert_ulong_sat(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(ulong2 x)
 {
   return x;
@@ -16805,7 +16805,7 @@ ulong2 convert_ulong2_sat(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(ulong3 x)
 {
   return x;
@@ -16813,7 +16813,7 @@ ulong3 convert_ulong3_sat(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(ulong4 x)
 {
   return x;
@@ -16821,7 +16821,7 @@ ulong4 convert_ulong4_sat(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(ulong8 x)
 {
   return x;
@@ -16829,7 +16829,7 @@ ulong8 convert_ulong8_sat(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(ulong16 x)
 {
   return x;
@@ -16837,7 +16837,7 @@ ulong16 convert_ulong16_sat(ulong16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(half2 x)
 {
   x = clamp(x, (half)CHAR_MIN, (half)CHAR_MAX);
@@ -16846,7 +16846,7 @@ char2 convert_char2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(half3 x)
 {
   x = clamp(x, (half)CHAR_MIN, (half)CHAR_MAX);
@@ -16855,7 +16855,7 @@ char3 convert_char3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(half4 x)
 {
   x = clamp(x, (half)CHAR_MIN, (half)CHAR_MAX);
@@ -16864,7 +16864,7 @@ char4 convert_char4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(half8 x)
 {
   x = clamp(x, (half)CHAR_MIN, (half)CHAR_MAX);
@@ -16873,7 +16873,7 @@ char8 convert_char8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(half16 x)
 {
   x = clamp(x, (half)CHAR_MIN, (half)CHAR_MAX);
@@ -16882,7 +16882,7 @@ char16 convert_char16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(half2 x)
 {
   x = clamp(x, (half)0, (half)UCHAR_MAX);
@@ -16891,7 +16891,7 @@ uchar2 convert_uchar2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(half3 x)
 {
   x = clamp(x, (half)0, (half)UCHAR_MAX);
@@ -16900,7 +16900,7 @@ uchar3 convert_uchar3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(half4 x)
 {
   x = clamp(x, (half)0, (half)UCHAR_MAX);
@@ -16909,7 +16909,7 @@ uchar4 convert_uchar4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(half8 x)
 {
   x = clamp(x, (half)0, (half)UCHAR_MAX);
@@ -16918,7 +16918,7 @@ uchar8 convert_uchar8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(half16 x)
 {
   x = clamp(x, (half)0, (half)UCHAR_MAX);
@@ -16927,7 +16927,7 @@ uchar16 convert_uchar16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(half2 x)
 {
   x = max(x, (half)0);
@@ -16936,7 +16936,7 @@ short2 convert_short2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(half3 x)
 {
   x = max(x, (half)0);
@@ -16945,7 +16945,7 @@ short3 convert_short3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(half4 x)
 {
   x = max(x, (half)0);
@@ -16954,7 +16954,7 @@ short4 convert_short4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(half8 x)
 {
   x = max(x, (half)0);
@@ -16963,7 +16963,7 @@ short8 convert_short8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(half16 x)
 {
   x = max(x, (half)0);
@@ -16972,7 +16972,7 @@ short16 convert_short16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(half2 x)
 {
   x = max(x, (half)0);
@@ -16981,7 +16981,7 @@ ushort2 convert_ushort2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(half3 x)
 {
   x = max(x, (half)0);
@@ -16990,7 +16990,7 @@ ushort3 convert_ushort3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(half4 x)
 {
   x = max(x, (half)0);
@@ -16999,7 +16999,7 @@ ushort4 convert_ushort4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(half8 x)
 {
   x = max(x, (half)0);
@@ -17008,7 +17008,7 @@ ushort8 convert_ushort8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(half16 x)
 {
   x = max(x, (half)0);
@@ -17017,7 +17017,7 @@ ushort16 convert_ushort16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(half2 x)
 {
   return convert_int2(x);
@@ -17025,7 +17025,7 @@ int2 convert_int2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(half3 x)
 {
   return convert_int3(x);
@@ -17033,7 +17033,7 @@ int3 convert_int3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(half4 x)
 {
   return convert_int4(x);
@@ -17041,7 +17041,7 @@ int4 convert_int4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(half8 x)
 {
   return convert_int8(x);
@@ -17049,7 +17049,7 @@ int8 convert_int8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(half16 x)
 {
   return convert_int16(x);
@@ -17057,7 +17057,7 @@ int16 convert_int16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(half2 x)
 {
   x = max(x, (half)0);
@@ -17066,7 +17066,7 @@ uint2 convert_uint2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(half3 x)
 {
   x = max(x, (half)0);
@@ -17075,7 +17075,7 @@ uint3 convert_uint3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(half4 x)
 {
   x = max(x, (half)0);
@@ -17084,7 +17084,7 @@ uint4 convert_uint4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(half8 x)
 {
   x = max(x, (half)0);
@@ -17093,7 +17093,7 @@ uint8 convert_uint8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(half16 x)
 {
   x = max(x, (half)0);
@@ -17102,7 +17102,7 @@ uint16 convert_uint16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(half2 x)
 {
   return convert_long2(x);
@@ -17110,7 +17110,7 @@ long2 convert_long2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(half3 x)
 {
   return convert_long3(x);
@@ -17118,7 +17118,7 @@ long3 convert_long3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(half4 x)
 {
   return convert_long4(x);
@@ -17126,7 +17126,7 @@ long4 convert_long4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(half8 x)
 {
   return convert_long8(x);
@@ -17134,7 +17134,7 @@ long8 convert_long8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(half16 x)
 {
   return convert_long16(x);
@@ -17142,7 +17142,7 @@ long16 convert_long16_sat(half16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(half2 x)
 {
   x = max(x, (half)0);
@@ -17151,7 +17151,7 @@ ulong2 convert_ulong2_sat(half2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(half3 x)
 {
   x = max(x, (half)0);
@@ -17160,7 +17160,7 @@ ulong3 convert_ulong3_sat(half3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(half4 x)
 {
   x = max(x, (half)0);
@@ -17169,7 +17169,7 @@ ulong4 convert_ulong4_sat(half4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(half8 x)
 {
   x = max(x, (half)0);
@@ -17178,7 +17178,7 @@ ulong8 convert_ulong8_sat(half8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(half16 x)
 {
   x = max(x, (half)0);
@@ -17186,1992 +17186,1992 @@ ulong16 convert_ulong16_sat(half16 x)
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(float x)
 {
   char y = convert_char(x);
-  y = select(y, (char)CHAR_MIN, convert_char(x < (float)CHAR_MIN));
-  y = select(y, (char)CHAR_MAX, convert_char(x > (float)CHAR_MAX));
+  y = select(y, (char)CHAR_MIN, convert_char(x < (float)(-0x1p+7f)));
+  y = select(y, (char)CHAR_MAX, convert_char(x >= (float)(0x1p+7f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(float2 x)
 {
   char2 y = convert_char2(x);
-  y = select(y, (char2)CHAR_MIN, convert_char2(x < (float2)CHAR_MIN));
-  y = select(y, (char2)CHAR_MAX, convert_char2(x > (float2)CHAR_MAX));
+  y = select(y, (char2)CHAR_MIN, convert_char2(x < (float2)(-0x1p+7f)));
+  y = select(y, (char2)CHAR_MAX, convert_char2(x >= (float2)(0x1p+7f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(float3 x)
 {
   char3 y = convert_char3(x);
-  y = select(y, (char3)CHAR_MIN, convert_char3(x < (float3)CHAR_MIN));
-  y = select(y, (char3)CHAR_MAX, convert_char3(x > (float3)CHAR_MAX));
+  y = select(y, (char3)CHAR_MIN, convert_char3(x < (float3)(-0x1p+7f)));
+  y = select(y, (char3)CHAR_MAX, convert_char3(x >= (float3)(0x1p+7f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(float4 x)
 {
   char4 y = convert_char4(x);
-  y = select(y, (char4)CHAR_MIN, convert_char4(x < (float4)CHAR_MIN));
-  y = select(y, (char4)CHAR_MAX, convert_char4(x > (float4)CHAR_MAX));
+  y = select(y, (char4)CHAR_MIN, convert_char4(x < (float4)(-0x1p+7f)));
+  y = select(y, (char4)CHAR_MAX, convert_char4(x >= (float4)(0x1p+7f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(float8 x)
 {
   char8 y = convert_char8(x);
-  y = select(y, (char8)CHAR_MIN, convert_char8(x < (float8)CHAR_MIN));
-  y = select(y, (char8)CHAR_MAX, convert_char8(x > (float8)CHAR_MAX));
+  y = select(y, (char8)CHAR_MIN, convert_char8(x < (float8)(-0x1p+7f)));
+  y = select(y, (char8)CHAR_MAX, convert_char8(x >= (float8)(0x1p+7f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(float16 x)
 {
   char16 y = convert_char16(x);
-  y = select(y, (char16)CHAR_MIN, convert_char16(x < (float16)CHAR_MIN));
-  y = select(y, (char16)CHAR_MAX, convert_char16(x > (float16)CHAR_MAX));
+  y = select(y, (char16)CHAR_MIN, convert_char16(x < (float16)(-0x1p+7f)));
+  y = select(y, (char16)CHAR_MAX, convert_char16(x >= (float16)(0x1p+7f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(float x)
 {
   uchar y = convert_uchar(x);
-  y = select(y, (uchar)0, as_uchar(convert_char(x < (float)0)));
-  y = select(y, (uchar)UCHAR_MAX, as_uchar(convert_char(x > (float)UCHAR_MAX)));
+  y = select(y, (uchar)0, as_uchar(convert_char(x < (float)0.0f)));
+  y = select(y, (uchar)UCHAR_MAX, as_uchar(convert_char(x >= (float)(0x1p+8f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(float2 x)
 {
   uchar2 y = convert_uchar2(x);
-  y = select(y, (uchar2)0, as_uchar2(convert_char2(x < (float2)0)));
-  y = select(y, (uchar2)UCHAR_MAX, as_uchar2(convert_char2(x > (float2)UCHAR_MAX)));
+  y = select(y, (uchar2)0, as_uchar2(convert_char2(x < (float2)0.0f)));
+  y = select(y, (uchar2)UCHAR_MAX, as_uchar2(convert_char2(x >= (float2)(0x1p+8f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(float3 x)
 {
   uchar3 y = convert_uchar3(x);
-  y = select(y, (uchar3)0, as_uchar3(convert_char3(x < (float3)0)));
-  y = select(y, (uchar3)UCHAR_MAX, as_uchar3(convert_char3(x > (float3)UCHAR_MAX)));
+  y = select(y, (uchar3)0, as_uchar3(convert_char3(x < (float3)0.0f)));
+  y = select(y, (uchar3)UCHAR_MAX, as_uchar3(convert_char3(x >= (float3)(0x1p+8f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(float4 x)
 {
   uchar4 y = convert_uchar4(x);
-  y = select(y, (uchar4)0, as_uchar4(convert_char4(x < (float4)0)));
-  y = select(y, (uchar4)UCHAR_MAX, as_uchar4(convert_char4(x > (float4)UCHAR_MAX)));
+  y = select(y, (uchar4)0, as_uchar4(convert_char4(x < (float4)0.0f)));
+  y = select(y, (uchar4)UCHAR_MAX, as_uchar4(convert_char4(x >= (float4)(0x1p+8f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(float8 x)
 {
   uchar8 y = convert_uchar8(x);
-  y = select(y, (uchar8)0, as_uchar8(convert_char8(x < (float8)0)));
-  y = select(y, (uchar8)UCHAR_MAX, as_uchar8(convert_char8(x > (float8)UCHAR_MAX)));
+  y = select(y, (uchar8)0, as_uchar8(convert_char8(x < (float8)0.0f)));
+  y = select(y, (uchar8)UCHAR_MAX, as_uchar8(convert_char8(x >= (float8)(0x1p+8f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(float16 x)
 {
   uchar16 y = convert_uchar16(x);
-  y = select(y, (uchar16)0, as_uchar16(convert_char16(x < (float16)0)));
-  y = select(y, (uchar16)UCHAR_MAX, as_uchar16(convert_char16(x > (float16)UCHAR_MAX)));
+  y = select(y, (uchar16)0, as_uchar16(convert_char16(x < (float16)0.0f)));
+  y = select(y, (uchar16)UCHAR_MAX, as_uchar16(convert_char16(x >= (float16)(0x1p+8f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(float x)
 {
   short y = convert_short(x);
-  y = select(y, (short)SHRT_MIN, convert_short(x < (float)SHRT_MIN));
-  y = select(y, (short)SHRT_MAX, convert_short(x > (float)SHRT_MAX));
+  y = select(y, (short)SHRT_MIN, convert_short(x < (float)(-0x1p+15f)));
+  y = select(y, (short)SHRT_MAX, convert_short(x >= (float)(0x1p+15f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(float2 x)
 {
   short2 y = convert_short2(x);
-  y = select(y, (short2)SHRT_MIN, convert_short2(x < (float2)SHRT_MIN));
-  y = select(y, (short2)SHRT_MAX, convert_short2(x > (float2)SHRT_MAX));
+  y = select(y, (short2)SHRT_MIN, convert_short2(x < (float2)(-0x1p+15f)));
+  y = select(y, (short2)SHRT_MAX, convert_short2(x >= (float2)(0x1p+15f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(float3 x)
 {
   short3 y = convert_short3(x);
-  y = select(y, (short3)SHRT_MIN, convert_short3(x < (float3)SHRT_MIN));
-  y = select(y, (short3)SHRT_MAX, convert_short3(x > (float3)SHRT_MAX));
+  y = select(y, (short3)SHRT_MIN, convert_short3(x < (float3)(-0x1p+15f)));
+  y = select(y, (short3)SHRT_MAX, convert_short3(x >= (float3)(0x1p+15f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(float4 x)
 {
   short4 y = convert_short4(x);
-  y = select(y, (short4)SHRT_MIN, convert_short4(x < (float4)SHRT_MIN));
-  y = select(y, (short4)SHRT_MAX, convert_short4(x > (float4)SHRT_MAX));
+  y = select(y, (short4)SHRT_MIN, convert_short4(x < (float4)(-0x1p+15f)));
+  y = select(y, (short4)SHRT_MAX, convert_short4(x >= (float4)(0x1p+15f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(float8 x)
 {
   short8 y = convert_short8(x);
-  y = select(y, (short8)SHRT_MIN, convert_short8(x < (float8)SHRT_MIN));
-  y = select(y, (short8)SHRT_MAX, convert_short8(x > (float8)SHRT_MAX));
+  y = select(y, (short8)SHRT_MIN, convert_short8(x < (float8)(-0x1p+15f)));
+  y = select(y, (short8)SHRT_MAX, convert_short8(x >= (float8)(0x1p+15f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(float16 x)
 {
   short16 y = convert_short16(x);
-  y = select(y, (short16)SHRT_MIN, convert_short16(x < (float16)SHRT_MIN));
-  y = select(y, (short16)SHRT_MAX, convert_short16(x > (float16)SHRT_MAX));
+  y = select(y, (short16)SHRT_MIN, convert_short16(x < (float16)(-0x1p+15f)));
+  y = select(y, (short16)SHRT_MAX, convert_short16(x >= (float16)(0x1p+15f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(float x)
 {
   ushort y = convert_ushort(x);
-  y = select(y, (ushort)0, as_ushort(convert_short(x < (float)0)));
-  y = select(y, (ushort)USHRT_MAX, as_ushort(convert_short(x > (float)USHRT_MAX)));
+  y = select(y, (ushort)0, as_ushort(convert_short(x < (float)0.0f)));
+  y = select(y, (ushort)USHRT_MAX, as_ushort(convert_short(x >= (float)(0x1p+16f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(float2 x)
 {
   ushort2 y = convert_ushort2(x);
-  y = select(y, (ushort2)0, as_ushort2(convert_short2(x < (float2)0)));
-  y = select(y, (ushort2)USHRT_MAX, as_ushort2(convert_short2(x > (float2)USHRT_MAX)));
+  y = select(y, (ushort2)0, as_ushort2(convert_short2(x < (float2)0.0f)));
+  y = select(y, (ushort2)USHRT_MAX, as_ushort2(convert_short2(x >= (float2)(0x1p+16f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(float3 x)
 {
   ushort3 y = convert_ushort3(x);
-  y = select(y, (ushort3)0, as_ushort3(convert_short3(x < (float3)0)));
-  y = select(y, (ushort3)USHRT_MAX, as_ushort3(convert_short3(x > (float3)USHRT_MAX)));
+  y = select(y, (ushort3)0, as_ushort3(convert_short3(x < (float3)0.0f)));
+  y = select(y, (ushort3)USHRT_MAX, as_ushort3(convert_short3(x >= (float3)(0x1p+16f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(float4 x)
 {
   ushort4 y = convert_ushort4(x);
-  y = select(y, (ushort4)0, as_ushort4(convert_short4(x < (float4)0)));
-  y = select(y, (ushort4)USHRT_MAX, as_ushort4(convert_short4(x > (float4)USHRT_MAX)));
+  y = select(y, (ushort4)0, as_ushort4(convert_short4(x < (float4)0.0f)));
+  y = select(y, (ushort4)USHRT_MAX, as_ushort4(convert_short4(x >= (float4)(0x1p+16f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(float8 x)
 {
   ushort8 y = convert_ushort8(x);
-  y = select(y, (ushort8)0, as_ushort8(convert_short8(x < (float8)0)));
-  y = select(y, (ushort8)USHRT_MAX, as_ushort8(convert_short8(x > (float8)USHRT_MAX)));
+  y = select(y, (ushort8)0, as_ushort8(convert_short8(x < (float8)0.0f)));
+  y = select(y, (ushort8)USHRT_MAX, as_ushort8(convert_short8(x >= (float8)(0x1p+16f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(float16 x)
 {
   ushort16 y = convert_ushort16(x);
-  y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (float16)0)));
-  y = select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x > (float16)USHRT_MAX)));
+  y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (float16)0.0f)));
+  y = select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x >= (float16)(0x1p+16f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(float x)
 {
   int y = convert_int(x);
-  y = select(y, (int)INT_MIN, convert_int(x < (float)INT_MIN));
-  y = select(y, (int)INT_MAX, convert_int(x > (float)INT_MAX));
+  y = select(y, (int)INT_MIN, convert_int(x < (float)(-0x1p+31f)));
+  y = select(y, (int)INT_MAX, convert_int(x >= (float)(0x1p+31f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(float2 x)
 {
   int2 y = convert_int2(x);
-  y = select(y, (int2)INT_MIN, convert_int2(x < (float2)INT_MIN));
-  y = select(y, (int2)INT_MAX, convert_int2(x > (float2)INT_MAX));
+  y = select(y, (int2)INT_MIN, convert_int2(x < (float2)(-0x1p+31f)));
+  y = select(y, (int2)INT_MAX, convert_int2(x >= (float2)(0x1p+31f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(float3 x)
 {
   int3 y = convert_int3(x);
-  y = select(y, (int3)INT_MIN, convert_int3(x < (float3)INT_MIN));
-  y = select(y, (int3)INT_MAX, convert_int3(x > (float3)INT_MAX));
+  y = select(y, (int3)INT_MIN, convert_int3(x < (float3)(-0x1p+31f)));
+  y = select(y, (int3)INT_MAX, convert_int3(x >= (float3)(0x1p+31f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(float4 x)
 {
   int4 y = convert_int4(x);
-  y = select(y, (int4)INT_MIN, convert_int4(x < (float4)INT_MIN));
-  y = select(y, (int4)INT_MAX, convert_int4(x > (float4)INT_MAX));
+  y = select(y, (int4)INT_MIN, convert_int4(x < (float4)(-0x1p+31f)));
+  y = select(y, (int4)INT_MAX, convert_int4(x >= (float4)(0x1p+31f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(float8 x)
 {
   int8 y = convert_int8(x);
-  y = select(y, (int8)INT_MIN, convert_int8(x < (float8)INT_MIN));
-  y = select(y, (int8)INT_MAX, convert_int8(x > (float8)INT_MAX));
+  y = select(y, (int8)INT_MIN, convert_int8(x < (float8)(-0x1p+31f)));
+  y = select(y, (int8)INT_MAX, convert_int8(x >= (float8)(0x1p+31f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(float16 x)
 {
   int16 y = convert_int16(x);
-  y = select(y, (int16)INT_MIN, convert_int16(x < (float16)INT_MIN));
-  y = select(y, (int16)INT_MAX, convert_int16(x > (float16)INT_MAX));
+  y = select(y, (int16)INT_MIN, convert_int16(x < (float16)(-0x1p+31f)));
+  y = select(y, (int16)INT_MAX, convert_int16(x >= (float16)(0x1p+31f)));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(float x)
 {
   uint y = convert_uint(x);
-  y = select(y, (uint)0, as_uint(convert_int(x < (float)0)));
-  y = select(y, (uint)UINT_MAX, as_uint(convert_int(x > (float)UINT_MAX)));
+  y = select(y, (uint)0, as_uint(convert_int(x < (float)0.0f)));
+  y = select(y, (uint)UINT_MAX, as_uint(convert_int(x >= (float)(0x1p+32f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(float2 x)
 {
   uint2 y = convert_uint2(x);
-  y = select(y, (uint2)0, as_uint2(convert_int2(x < (float2)0)));
-  y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x > (float2)UINT_MAX)));
+  y = select(y, (uint2)0, as_uint2(convert_int2(x < (float2)0.0f)));
+  y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x >= (float2)(0x1p+32f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(float3 x)
 {
   uint3 y = convert_uint3(x);
-  y = select(y, (uint3)0, as_uint3(convert_int3(x < (float3)0)));
-  y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x > (float3)UINT_MAX)));
+  y = select(y, (uint3)0, as_uint3(convert_int3(x < (float3)0.0f)));
+  y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x >= (float3)(0x1p+32f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(float4 x)
 {
   uint4 y = convert_uint4(x);
-  y = select(y, (uint4)0, as_uint4(convert_int4(x < (float4)0)));
-  y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x > (float4)UINT_MAX)));
+  y = select(y, (uint4)0, as_uint4(convert_int4(x < (float4)0.0f)));
+  y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x >= (float4)(0x1p+32f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(float8 x)
 {
   uint8 y = convert_uint8(x);
-  y = select(y, (uint8)0, as_uint8(convert_int8(x < (float8)0)));
-  y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x > (float8)UINT_MAX)));
+  y = select(y, (uint8)0, as_uint8(convert_int8(x < (float8)0.0f)));
+  y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x >= (float8)(0x1p+32f))));
   return y;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(float16 x)
 {
   uint16 y = convert_uint16(x);
-  y = select(y, (uint16)0, as_uint16(convert_int16(x < (float16)0)));
-  y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x > (float16)UINT_MAX)));
+  y = select(y, (uint16)0, as_uint16(convert_int16(x < (float16)0.0f)));
+  y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x >= (float16)(0x1p+32f))));
   return y;
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(float x)
 {
   long y = convert_long(x);
-  y = select(y, (long)LONG_MIN, convert_long(x < (float)LONG_MIN));
-  y = select(y, (long)LONG_MAX, convert_long(x > (float)LONG_MAX));
+  y = select(y, (long)LONG_MIN, convert_long(x < (float)(-0x1p+63f)));
+  y = select(y, (long)LONG_MAX, convert_long(x >= (float)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(float2 x)
 {
   long2 y = convert_long2(x);
-  y = select(y, (long2)LONG_MIN, convert_long2(x < (float2)LONG_MIN));
-  y = select(y, (long2)LONG_MAX, convert_long2(x > (float2)LONG_MAX));
+  y = select(y, (long2)LONG_MIN, convert_long2(x < (float2)(-0x1p+63f)));
+  y = select(y, (long2)LONG_MAX, convert_long2(x >= (float2)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(float3 x)
 {
   long3 y = convert_long3(x);
-  y = select(y, (long3)LONG_MIN, convert_long3(x < (float3)LONG_MIN));
-  y = select(y, (long3)LONG_MAX, convert_long3(x > (float3)LONG_MAX));
+  y = select(y, (long3)LONG_MIN, convert_long3(x < (float3)(-0x1p+63f)));
+  y = select(y, (long3)LONG_MAX, convert_long3(x >= (float3)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(float4 x)
 {
   long4 y = convert_long4(x);
-  y = select(y, (long4)LONG_MIN, convert_long4(x < (float4)LONG_MIN));
-  y = select(y, (long4)LONG_MAX, convert_long4(x > (float4)LONG_MAX));
+  y = select(y, (long4)LONG_MIN, convert_long4(x < (float4)(-0x1p+63f)));
+  y = select(y, (long4)LONG_MAX, convert_long4(x >= (float4)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(float8 x)
 {
   long8 y = convert_long8(x);
-  y = select(y, (long8)LONG_MIN, convert_long8(x < (float8)LONG_MIN));
-  y = select(y, (long8)LONG_MAX, convert_long8(x > (float8)LONG_MAX));
+  y = select(y, (long8)LONG_MIN, convert_long8(x < (float8)(-0x1p+63f)));
+  y = select(y, (long8)LONG_MAX, convert_long8(x >= (float8)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(float16 x)
 {
   long16 y = convert_long16(x);
-  y = select(y, (long16)LONG_MIN, convert_long16(x < (float16)LONG_MIN));
-  y = select(y, (long16)LONG_MAX, convert_long16(x > (float16)LONG_MAX));
+  y = select(y, (long16)LONG_MIN, convert_long16(x < (float16)(-0x1p+63f)));
+  y = select(y, (long16)LONG_MAX, convert_long16(x >= (float16)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(float x)
 {
   ulong y = convert_ulong(x);
-  y = select(y, (ulong)0, as_ulong(convert_long(x < (float)0)));
-  y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x > (float)ULONG_MAX)));
+  y = select(y, (ulong)0, as_ulong(convert_long(x < (float)0.0f)));
+  y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x >= (float)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(float2 x)
 {
   ulong2 y = convert_ulong2(x);
-  y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (float2)0)));
-  y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x > (float2)ULONG_MAX)));
+  y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (float2)0.0f)));
+  y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x >= (float2)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(float3 x)
 {
   ulong3 y = convert_ulong3(x);
-  y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (float3)0)));
-  y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x > (float3)ULONG_MAX)));
+  y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (float3)0.0f)));
+  y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x >= (float3)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(float4 x)
 {
   ulong4 y = convert_ulong4(x);
-  y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (float4)0)));
-  y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x > (float4)ULONG_MAX)));
+  y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (float4)0.0f)));
+  y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x >= (float4)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(float8 x)
 {
   ulong8 y = convert_ulong8(x);
-  y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (float8)0)));
-  y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x > (float8)ULONG_MAX)));
+  y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (float8)0.0f)));
+  y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x >= (float8)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(float16 x)
 {
   ulong16 y = convert_ulong16(x);
-  y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (float16)0)));
-  y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x > (float16)ULONG_MAX)));
+  y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (float16)0.0f)));
+  y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x >= (float16)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat(double x)
 {
   char y = convert_char(x);
-  y = select(y, (char)CHAR_MIN, convert_char(x < (double)CHAR_MIN));
-  y = select(y, (char)CHAR_MAX, convert_char(x > (double)CHAR_MAX));
+  y = select(y, (char)CHAR_MIN, convert_char(x < (double)(-0x1p+7f)));
+  y = select(y, (char)CHAR_MAX, convert_char(x >= (double)(0x1p+7f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat(double2 x)
 {
   char2 y = convert_char2(x);
-  y = select(y, (char2)CHAR_MIN, convert_char2(x < (double2)CHAR_MIN));
-  y = select(y, (char2)CHAR_MAX, convert_char2(x > (double2)CHAR_MAX));
+  y = select(y, (char2)CHAR_MIN, convert_char2(x < (double2)(-0x1p+7f)));
+  y = select(y, (char2)CHAR_MAX, convert_char2(x >= (double2)(0x1p+7f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat(double3 x)
 {
   char3 y = convert_char3(x);
-  y = select(y, (char3)CHAR_MIN, convert_char3(x < (double3)CHAR_MIN));
-  y = select(y, (char3)CHAR_MAX, convert_char3(x > (double3)CHAR_MAX));
+  y = select(y, (char3)CHAR_MIN, convert_char3(x < (double3)(-0x1p+7f)));
+  y = select(y, (char3)CHAR_MAX, convert_char3(x >= (double3)(0x1p+7f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat(double4 x)
 {
   char4 y = convert_char4(x);
-  y = select(y, (char4)CHAR_MIN, convert_char4(x < (double4)CHAR_MIN));
-  y = select(y, (char4)CHAR_MAX, convert_char4(x > (double4)CHAR_MAX));
+  y = select(y, (char4)CHAR_MIN, convert_char4(x < (double4)(-0x1p+7f)));
+  y = select(y, (char4)CHAR_MAX, convert_char4(x >= (double4)(0x1p+7f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat(double8 x)
 {
   char8 y = convert_char8(x);
-  y = select(y, (char8)CHAR_MIN, convert_char8(x < (double8)CHAR_MIN));
-  y = select(y, (char8)CHAR_MAX, convert_char8(x > (double8)CHAR_MAX));
+  y = select(y, (char8)CHAR_MIN, convert_char8(x < (double8)(-0x1p+7f)));
+  y = select(y, (char8)CHAR_MAX, convert_char8(x >= (double8)(0x1p+7f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat(double16 x)
 {
   char16 y = convert_char16(x);
-  y = select(y, (char16)CHAR_MIN, convert_char16(x < (double16)CHAR_MIN));
-  y = select(y, (char16)CHAR_MAX, convert_char16(x > (double16)CHAR_MAX));
+  y = select(y, (char16)CHAR_MIN, convert_char16(x < (double16)(-0x1p+7f)));
+  y = select(y, (char16)CHAR_MAX, convert_char16(x >= (double16)(0x1p+7f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat(double x)
 {
   uchar y = convert_uchar(x);
-  y = select(y, (uchar)0, as_uchar(convert_char(x < (double)0)));
-  y = select(y, (uchar)UCHAR_MAX, as_uchar(convert_char(x > (double)UCHAR_MAX)));
+  y = select(y, (uchar)0, as_uchar(convert_char(x < (double)0.0f)));
+  y = select(y, (uchar)UCHAR_MAX, as_uchar(convert_char(x >= (double)(0x1p+8f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat(double2 x)
 {
   uchar2 y = convert_uchar2(x);
-  y = select(y, (uchar2)0, as_uchar2(convert_char2(x < (double2)0)));
-  y = select(y, (uchar2)UCHAR_MAX, as_uchar2(convert_char2(x > (double2)UCHAR_MAX)));
+  y = select(y, (uchar2)0, as_uchar2(convert_char2(x < (double2)0.0f)));
+  y = select(y, (uchar2)UCHAR_MAX, as_uchar2(convert_char2(x >= (double2)(0x1p+8f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat(double3 x)
 {
   uchar3 y = convert_uchar3(x);
-  y = select(y, (uchar3)0, as_uchar3(convert_char3(x < (double3)0)));
-  y = select(y, (uchar3)UCHAR_MAX, as_uchar3(convert_char3(x > (double3)UCHAR_MAX)));
+  y = select(y, (uchar3)0, as_uchar3(convert_char3(x < (double3)0.0f)));
+  y = select(y, (uchar3)UCHAR_MAX, as_uchar3(convert_char3(x >= (double3)(0x1p+8f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat(double4 x)
 {
   uchar4 y = convert_uchar4(x);
-  y = select(y, (uchar4)0, as_uchar4(convert_char4(x < (double4)0)));
-  y = select(y, (uchar4)UCHAR_MAX, as_uchar4(convert_char4(x > (double4)UCHAR_MAX)));
+  y = select(y, (uchar4)0, as_uchar4(convert_char4(x < (double4)0.0f)));
+  y = select(y, (uchar4)UCHAR_MAX, as_uchar4(convert_char4(x >= (double4)(0x1p+8f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat(double8 x)
 {
   uchar8 y = convert_uchar8(x);
-  y = select(y, (uchar8)0, as_uchar8(convert_char8(x < (double8)0)));
-  y = select(y, (uchar8)UCHAR_MAX, as_uchar8(convert_char8(x > (double8)UCHAR_MAX)));
+  y = select(y, (uchar8)0, as_uchar8(convert_char8(x < (double8)0.0f)));
+  y = select(y, (uchar8)UCHAR_MAX, as_uchar8(convert_char8(x >= (double8)(0x1p+8f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat(double16 x)
 {
   uchar16 y = convert_uchar16(x);
-  y = select(y, (uchar16)0, as_uchar16(convert_char16(x < (double16)0)));
-  y = select(y, (uchar16)UCHAR_MAX, as_uchar16(convert_char16(x > (double16)UCHAR_MAX)));
+  y = select(y, (uchar16)0, as_uchar16(convert_char16(x < (double16)0.0f)));
+  y = select(y, (uchar16)UCHAR_MAX, as_uchar16(convert_char16(x >= (double16)(0x1p+8f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat(double x)
 {
   short y = convert_short(x);
-  y = select(y, (short)SHRT_MIN, convert_short(x < (double)SHRT_MIN));
-  y = select(y, (short)SHRT_MAX, convert_short(x > (double)SHRT_MAX));
+  y = select(y, (short)SHRT_MIN, convert_short(x < (double)(-0x1p+15f)));
+  y = select(y, (short)SHRT_MAX, convert_short(x >= (double)(0x1p+15f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat(double2 x)
 {
   short2 y = convert_short2(x);
-  y = select(y, (short2)SHRT_MIN, convert_short2(x < (double2)SHRT_MIN));
-  y = select(y, (short2)SHRT_MAX, convert_short2(x > (double2)SHRT_MAX));
+  y = select(y, (short2)SHRT_MIN, convert_short2(x < (double2)(-0x1p+15f)));
+  y = select(y, (short2)SHRT_MAX, convert_short2(x >= (double2)(0x1p+15f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat(double3 x)
 {
   short3 y = convert_short3(x);
-  y = select(y, (short3)SHRT_MIN, convert_short3(x < (double3)SHRT_MIN));
-  y = select(y, (short3)SHRT_MAX, convert_short3(x > (double3)SHRT_MAX));
+  y = select(y, (short3)SHRT_MIN, convert_short3(x < (double3)(-0x1p+15f)));
+  y = select(y, (short3)SHRT_MAX, convert_short3(x >= (double3)(0x1p+15f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat(double4 x)
 {
   short4 y = convert_short4(x);
-  y = select(y, (short4)SHRT_MIN, convert_short4(x < (double4)SHRT_MIN));
-  y = select(y, (short4)SHRT_MAX, convert_short4(x > (double4)SHRT_MAX));
+  y = select(y, (short4)SHRT_MIN, convert_short4(x < (double4)(-0x1p+15f)));
+  y = select(y, (short4)SHRT_MAX, convert_short4(x >= (double4)(0x1p+15f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat(double8 x)
 {
   short8 y = convert_short8(x);
-  y = select(y, (short8)SHRT_MIN, convert_short8(x < (double8)SHRT_MIN));
-  y = select(y, (short8)SHRT_MAX, convert_short8(x > (double8)SHRT_MAX));
+  y = select(y, (short8)SHRT_MIN, convert_short8(x < (double8)(-0x1p+15f)));
+  y = select(y, (short8)SHRT_MAX, convert_short8(x >= (double8)(0x1p+15f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat(double16 x)
 {
   short16 y = convert_short16(x);
-  y = select(y, (short16)SHRT_MIN, convert_short16(x < (double16)SHRT_MIN));
-  y = select(y, (short16)SHRT_MAX, convert_short16(x > (double16)SHRT_MAX));
+  y = select(y, (short16)SHRT_MIN, convert_short16(x < (double16)(-0x1p+15f)));
+  y = select(y, (short16)SHRT_MAX, convert_short16(x >= (double16)(0x1p+15f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat(double x)
 {
   ushort y = convert_ushort(x);
-  y = select(y, (ushort)0, as_ushort(convert_short(x < (double)0)));
-  y = select(y, (ushort)USHRT_MAX, as_ushort(convert_short(x > (double)USHRT_MAX)));
+  y = select(y, (ushort)0, as_ushort(convert_short(x < (double)0.0f)));
+  y = select(y, (ushort)USHRT_MAX, as_ushort(convert_short(x >= (double)(0x1p+16f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat(double2 x)
 {
   ushort2 y = convert_ushort2(x);
-  y = select(y, (ushort2)0, as_ushort2(convert_short2(x < (double2)0)));
-  y = select(y, (ushort2)USHRT_MAX, as_ushort2(convert_short2(x > (double2)USHRT_MAX)));
+  y = select(y, (ushort2)0, as_ushort2(convert_short2(x < (double2)0.0f)));
+  y = select(y, (ushort2)USHRT_MAX, as_ushort2(convert_short2(x >= (double2)(0x1p+16f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat(double3 x)
 {
   ushort3 y = convert_ushort3(x);
-  y = select(y, (ushort3)0, as_ushort3(convert_short3(x < (double3)0)));
-  y = select(y, (ushort3)USHRT_MAX, as_ushort3(convert_short3(x > (double3)USHRT_MAX)));
+  y = select(y, (ushort3)0, as_ushort3(convert_short3(x < (double3)0.0f)));
+  y = select(y, (ushort3)USHRT_MAX, as_ushort3(convert_short3(x >= (double3)(0x1p+16f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat(double4 x)
 {
   ushort4 y = convert_ushort4(x);
-  y = select(y, (ushort4)0, as_ushort4(convert_short4(x < (double4)0)));
-  y = select(y, (ushort4)USHRT_MAX, as_ushort4(convert_short4(x > (double4)USHRT_MAX)));
+  y = select(y, (ushort4)0, as_ushort4(convert_short4(x < (double4)0.0f)));
+  y = select(y, (ushort4)USHRT_MAX, as_ushort4(convert_short4(x >= (double4)(0x1p+16f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat(double8 x)
 {
   ushort8 y = convert_ushort8(x);
-  y = select(y, (ushort8)0, as_ushort8(convert_short8(x < (double8)0)));
-  y = select(y, (ushort8)USHRT_MAX, as_ushort8(convert_short8(x > (double8)USHRT_MAX)));
+  y = select(y, (ushort8)0, as_ushort8(convert_short8(x < (double8)0.0f)));
+  y = select(y, (ushort8)USHRT_MAX, as_ushort8(convert_short8(x >= (double8)(0x1p+16f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat(double16 x)
 {
   ushort16 y = convert_ushort16(x);
-  y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (double16)0)));
-  y = select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x > (double16)USHRT_MAX)));
+  y = select(y, (ushort16)0, as_ushort16(convert_short16(x < (double16)0.0f)));
+  y = select(y, (ushort16)USHRT_MAX, as_ushort16(convert_short16(x >= (double16)(0x1p+16f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat(double x)
 {
   int y = convert_int(x);
-  y = select(y, (int)INT_MIN, convert_int(x < (double)INT_MIN));
-  y = select(y, (int)INT_MAX, convert_int(x > (double)INT_MAX));
+  y = select(y, (int)INT_MIN, convert_int(x < (double)(-0x1p+31f)));
+  y = select(y, (int)INT_MAX, convert_int(x >= (double)(0x1p+31f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat(double2 x)
 {
   int2 y = convert_int2(x);
-  y = select(y, (int2)INT_MIN, convert_int2(x < (double2)INT_MIN));
-  y = select(y, (int2)INT_MAX, convert_int2(x > (double2)INT_MAX));
+  y = select(y, (int2)INT_MIN, convert_int2(x < (double2)(-0x1p+31f)));
+  y = select(y, (int2)INT_MAX, convert_int2(x >= (double2)(0x1p+31f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat(double3 x)
 {
   int3 y = convert_int3(x);
-  y = select(y, (int3)INT_MIN, convert_int3(x < (double3)INT_MIN));
-  y = select(y, (int3)INT_MAX, convert_int3(x > (double3)INT_MAX));
+  y = select(y, (int3)INT_MIN, convert_int3(x < (double3)(-0x1p+31f)));
+  y = select(y, (int3)INT_MAX, convert_int3(x >= (double3)(0x1p+31f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat(double4 x)
 {
   int4 y = convert_int4(x);
-  y = select(y, (int4)INT_MIN, convert_int4(x < (double4)INT_MIN));
-  y = select(y, (int4)INT_MAX, convert_int4(x > (double4)INT_MAX));
+  y = select(y, (int4)INT_MIN, convert_int4(x < (double4)(-0x1p+31f)));
+  y = select(y, (int4)INT_MAX, convert_int4(x >= (double4)(0x1p+31f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat(double8 x)
 {
   int8 y = convert_int8(x);
-  y = select(y, (int8)INT_MIN, convert_int8(x < (double8)INT_MIN));
-  y = select(y, (int8)INT_MAX, convert_int8(x > (double8)INT_MAX));
+  y = select(y, (int8)INT_MIN, convert_int8(x < (double8)(-0x1p+31f)));
+  y = select(y, (int8)INT_MAX, convert_int8(x >= (double8)(0x1p+31f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat(double16 x)
 {
   int16 y = convert_int16(x);
-  y = select(y, (int16)INT_MIN, convert_int16(x < (double16)INT_MIN));
-  y = select(y, (int16)INT_MAX, convert_int16(x > (double16)INT_MAX));
+  y = select(y, (int16)INT_MIN, convert_int16(x < (double16)(-0x1p+31f)));
+  y = select(y, (int16)INT_MAX, convert_int16(x >= (double16)(0x1p+31f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat(double x)
 {
   uint y = convert_uint(x);
-  y = select(y, (uint)0, as_uint(convert_int(x < (double)0)));
-  y = select(y, (uint)UINT_MAX, as_uint(convert_int(x > (double)UINT_MAX)));
+  y = select(y, (uint)0, as_uint(convert_int(x < (double)0.0f)));
+  y = select(y, (uint)UINT_MAX, as_uint(convert_int(x >= (double)(0x1p+32f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat(double2 x)
 {
   uint2 y = convert_uint2(x);
-  y = select(y, (uint2)0, as_uint2(convert_int2(x < (double2)0)));
-  y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x > (double2)UINT_MAX)));
+  y = select(y, (uint2)0, as_uint2(convert_int2(x < (double2)0.0f)));
+  y = select(y, (uint2)UINT_MAX, as_uint2(convert_int2(x >= (double2)(0x1p+32f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat(double3 x)
 {
   uint3 y = convert_uint3(x);
-  y = select(y, (uint3)0, as_uint3(convert_int3(x < (double3)0)));
-  y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x > (double3)UINT_MAX)));
+  y = select(y, (uint3)0, as_uint3(convert_int3(x < (double3)0.0f)));
+  y = select(y, (uint3)UINT_MAX, as_uint3(convert_int3(x >= (double3)(0x1p+32f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat(double4 x)
 {
   uint4 y = convert_uint4(x);
-  y = select(y, (uint4)0, as_uint4(convert_int4(x < (double4)0)));
-  y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x > (double4)UINT_MAX)));
+  y = select(y, (uint4)0, as_uint4(convert_int4(x < (double4)0.0f)));
+  y = select(y, (uint4)UINT_MAX, as_uint4(convert_int4(x >= (double4)(0x1p+32f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat(double8 x)
 {
   uint8 y = convert_uint8(x);
-  y = select(y, (uint8)0, as_uint8(convert_int8(x < (double8)0)));
-  y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x > (double8)UINT_MAX)));
+  y = select(y, (uint8)0, as_uint8(convert_int8(x < (double8)0.0f)));
+  y = select(y, (uint8)UINT_MAX, as_uint8(convert_int8(x >= (double8)(0x1p+32f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat(double16 x)
 {
   uint16 y = convert_uint16(x);
-  y = select(y, (uint16)0, as_uint16(convert_int16(x < (double16)0)));
-  y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x > (double16)UINT_MAX)));
+  y = select(y, (uint16)0, as_uint16(convert_int16(x < (double16)0.0f)));
+  y = select(y, (uint16)UINT_MAX, as_uint16(convert_int16(x >= (double16)(0x1p+32f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat(double x)
 {
   long y = convert_long(x);
-  y = select(y, (long)LONG_MIN, convert_long(x < (double)LONG_MIN));
-  y = select(y, (long)LONG_MAX, convert_long(x > (double)LONG_MAX));
+  y = select(y, (long)LONG_MIN, convert_long(x < (double)(-0x1p+63f)));
+  y = select(y, (long)LONG_MAX, convert_long(x >= (double)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat(double2 x)
 {
   long2 y = convert_long2(x);
-  y = select(y, (long2)LONG_MIN, convert_long2(x < (double2)LONG_MIN));
-  y = select(y, (long2)LONG_MAX, convert_long2(x > (double2)LONG_MAX));
+  y = select(y, (long2)LONG_MIN, convert_long2(x < (double2)(-0x1p+63f)));
+  y = select(y, (long2)LONG_MAX, convert_long2(x >= (double2)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat(double3 x)
 {
   long3 y = convert_long3(x);
-  y = select(y, (long3)LONG_MIN, convert_long3(x < (double3)LONG_MIN));
-  y = select(y, (long3)LONG_MAX, convert_long3(x > (double3)LONG_MAX));
+  y = select(y, (long3)LONG_MIN, convert_long3(x < (double3)(-0x1p+63f)));
+  y = select(y, (long3)LONG_MAX, convert_long3(x >= (double3)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat(double4 x)
 {
   long4 y = convert_long4(x);
-  y = select(y, (long4)LONG_MIN, convert_long4(x < (double4)LONG_MIN));
-  y = select(y, (long4)LONG_MAX, convert_long4(x > (double4)LONG_MAX));
+  y = select(y, (long4)LONG_MIN, convert_long4(x < (double4)(-0x1p+63f)));
+  y = select(y, (long4)LONG_MAX, convert_long4(x >= (double4)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat(double8 x)
 {
   long8 y = convert_long8(x);
-  y = select(y, (long8)LONG_MIN, convert_long8(x < (double8)LONG_MIN));
-  y = select(y, (long8)LONG_MAX, convert_long8(x > (double8)LONG_MAX));
+  y = select(y, (long8)LONG_MIN, convert_long8(x < (double8)(-0x1p+63f)));
+  y = select(y, (long8)LONG_MAX, convert_long8(x >= (double8)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat(double16 x)
 {
   long16 y = convert_long16(x);
-  y = select(y, (long16)LONG_MIN, convert_long16(x < (double16)LONG_MIN));
-  y = select(y, (long16)LONG_MAX, convert_long16(x > (double16)LONG_MAX));
+  y = select(y, (long16)LONG_MIN, convert_long16(x < (double16)(-0x1p+63f)));
+  y = select(y, (long16)LONG_MAX, convert_long16(x >= (double16)(0x1p+63f)));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat(double x)
 {
   ulong y = convert_ulong(x);
-  y = select(y, (ulong)0, as_ulong(convert_long(x < (double)0)));
-  y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x > (double)ULONG_MAX)));
+  y = select(y, (ulong)0, as_ulong(convert_long(x < (double)0.0f)));
+  y = select(y, (ulong)ULONG_MAX, as_ulong(convert_long(x >= (double)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat(double2 x)
 {
   ulong2 y = convert_ulong2(x);
-  y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (double2)0)));
-  y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x > (double2)ULONG_MAX)));
+  y = select(y, (ulong2)0, as_ulong2(convert_long2(x < (double2)0.0f)));
+  y = select(y, (ulong2)ULONG_MAX, as_ulong2(convert_long2(x >= (double2)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat(double3 x)
 {
   ulong3 y = convert_ulong3(x);
-  y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (double3)0)));
-  y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x > (double3)ULONG_MAX)));
+  y = select(y, (ulong3)0, as_ulong3(convert_long3(x < (double3)0.0f)));
+  y = select(y, (ulong3)ULONG_MAX, as_ulong3(convert_long3(x >= (double3)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat(double4 x)
 {
   ulong4 y = convert_ulong4(x);
-  y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (double4)0)));
-  y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x > (double4)ULONG_MAX)));
+  y = select(y, (ulong4)0, as_ulong4(convert_long4(x < (double4)0.0f)));
+  y = select(y, (ulong4)ULONG_MAX, as_ulong4(convert_long4(x >= (double4)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat(double8 x)
 {
   ulong8 y = convert_ulong8(x);
-  y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (double8)0)));
-  y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x > (double8)ULONG_MAX)));
+  y = select(y, (ulong8)0, as_ulong8(convert_long8(x < (double8)0.0f)));
+  y = select(y, (ulong8)ULONG_MAX, as_ulong8(convert_long8(x >= (double8)(0x1p+64f))));
   return y;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat(double16 x)
 {
   ulong16 y = convert_ulong16(x);
-  y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (double16)0)));
-  y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x > (double16)ULONG_MAX)));
+  y = select(y, (ulong16)0, as_ulong16(convert_long16(x < (double16)0.0f)));
+  y = select(y, (ulong16)ULONG_MAX, as_ulong16(convert_long16(x >= (double16)(0x1p+64f))));
   return y;
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(char x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(char x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(char x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(char x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(char2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(char2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(char2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(char2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(char3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(char3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(char3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(char3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(char4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(char4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(char4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(char4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(char8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(char8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(char8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(char8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(char16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(char16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(char16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(char16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(char x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(char x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(char x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(char x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(char2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(char2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(char2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(char2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(char3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(char3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(char3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(char3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(char4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(char4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(char4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(char4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(char8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(char8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(char8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(char8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(char16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(char16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(char16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(char16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(char x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(char x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(char x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(char x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(char2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(char2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(char2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(char2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(char3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(char3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(char3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(char3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(char4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(char4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(char4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(char4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(char8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(char8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(char8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(char8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(char16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(char16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(char16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(char16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(char x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(char x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(char x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(char x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(char2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(char2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(char2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(char2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(char3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(char3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(char3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(char3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(char4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(char4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(char4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(char4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(char8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(char8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(char8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(char8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(char16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(char16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(char16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(char16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(char x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(char x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(char x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(char x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(char2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(char2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(char2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(char2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(char3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(char3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(char3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(char3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(char4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(char4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(char4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(char4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(char8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(char8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(char8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(char8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(char16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(char16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(char16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(char16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(char x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(char x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(char x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(char x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(char2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(char2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(char2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(char2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(char3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(char3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(char3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(char3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(char4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(char4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(char4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(char4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(char8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(char8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(char8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(char8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(char16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(char16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(char16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(char16 x)
 {
   return convert_uint16_sat(x);
@@ -19179,7 +19179,7 @@ uint16 convert_uint16_sat_rtn(char16 x)
 
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(char x)
 {
   return convert_long_sat(x);
@@ -19188,7 +19188,7 @@ long convert_long_sat_rtz(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(char x)
 {
   return convert_long_sat(x);
@@ -19197,7 +19197,7 @@ long convert_long_sat_rte(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(char x)
 {
   return convert_long_sat(x);
@@ -19206,7 +19206,7 @@ long convert_long_sat_rtp(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(char x)
 {
   return convert_long_sat(x);
@@ -19215,7 +19215,7 @@ long convert_long_sat_rtn(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(char2 x)
 {
   return convert_long2_sat(x);
@@ -19224,7 +19224,7 @@ long2 convert_long2_sat_rtz(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(char2 x)
 {
   return convert_long2_sat(x);
@@ -19233,7 +19233,7 @@ long2 convert_long2_sat_rte(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(char2 x)
 {
   return convert_long2_sat(x);
@@ -19242,7 +19242,7 @@ long2 convert_long2_sat_rtp(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(char2 x)
 {
   return convert_long2_sat(x);
@@ -19251,7 +19251,7 @@ long2 convert_long2_sat_rtn(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(char3 x)
 {
   return convert_long3_sat(x);
@@ -19260,7 +19260,7 @@ long3 convert_long3_sat_rtz(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(char3 x)
 {
   return convert_long3_sat(x);
@@ -19269,7 +19269,7 @@ long3 convert_long3_sat_rte(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(char3 x)
 {
   return convert_long3_sat(x);
@@ -19278,7 +19278,7 @@ long3 convert_long3_sat_rtp(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(char3 x)
 {
   return convert_long3_sat(x);
@@ -19287,7 +19287,7 @@ long3 convert_long3_sat_rtn(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(char4 x)
 {
   return convert_long4_sat(x);
@@ -19296,7 +19296,7 @@ long4 convert_long4_sat_rtz(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(char4 x)
 {
   return convert_long4_sat(x);
@@ -19305,7 +19305,7 @@ long4 convert_long4_sat_rte(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(char4 x)
 {
   return convert_long4_sat(x);
@@ -19314,7 +19314,7 @@ long4 convert_long4_sat_rtp(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(char4 x)
 {
   return convert_long4_sat(x);
@@ -19323,7 +19323,7 @@ long4 convert_long4_sat_rtn(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(char8 x)
 {
   return convert_long8_sat(x);
@@ -19332,7 +19332,7 @@ long8 convert_long8_sat_rtz(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(char8 x)
 {
   return convert_long8_sat(x);
@@ -19341,7 +19341,7 @@ long8 convert_long8_sat_rte(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(char8 x)
 {
   return convert_long8_sat(x);
@@ -19350,7 +19350,7 @@ long8 convert_long8_sat_rtp(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(char8 x)
 {
   return convert_long8_sat(x);
@@ -19359,7 +19359,7 @@ long8 convert_long8_sat_rtn(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(char16 x)
 {
   return convert_long16_sat(x);
@@ -19368,7 +19368,7 @@ long16 convert_long16_sat_rtz(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(char16 x)
 {
   return convert_long16_sat(x);
@@ -19377,7 +19377,7 @@ long16 convert_long16_sat_rte(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(char16 x)
 {
   return convert_long16_sat(x);
@@ -19386,7 +19386,7 @@ long16 convert_long16_sat_rtp(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(char16 x)
 {
   return convert_long16_sat(x);
@@ -19395,7 +19395,7 @@ long16 convert_long16_sat_rtn(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(char x)
 {
   return convert_ulong_sat(x);
@@ -19404,7 +19404,7 @@ ulong convert_ulong_sat_rtz(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(char x)
 {
   return convert_ulong_sat(x);
@@ -19413,7 +19413,7 @@ ulong convert_ulong_sat_rte(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(char x)
 {
   return convert_ulong_sat(x);
@@ -19422,7 +19422,7 @@ ulong convert_ulong_sat_rtp(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(char x)
 {
   return convert_ulong_sat(x);
@@ -19431,7 +19431,7 @@ ulong convert_ulong_sat_rtn(char x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(char2 x)
 {
   return convert_ulong2_sat(x);
@@ -19440,7 +19440,7 @@ ulong2 convert_ulong2_sat_rtz(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(char2 x)
 {
   return convert_ulong2_sat(x);
@@ -19449,7 +19449,7 @@ ulong2 convert_ulong2_sat_rte(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(char2 x)
 {
   return convert_ulong2_sat(x);
@@ -19458,7 +19458,7 @@ ulong2 convert_ulong2_sat_rtp(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(char2 x)
 {
   return convert_ulong2_sat(x);
@@ -19467,7 +19467,7 @@ ulong2 convert_ulong2_sat_rtn(char2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(char3 x)
 {
   return convert_ulong3_sat(x);
@@ -19476,7 +19476,7 @@ ulong3 convert_ulong3_sat_rtz(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(char3 x)
 {
   return convert_ulong3_sat(x);
@@ -19485,7 +19485,7 @@ ulong3 convert_ulong3_sat_rte(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(char3 x)
 {
   return convert_ulong3_sat(x);
@@ -19494,7 +19494,7 @@ ulong3 convert_ulong3_sat_rtp(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(char3 x)
 {
   return convert_ulong3_sat(x);
@@ -19503,7 +19503,7 @@ ulong3 convert_ulong3_sat_rtn(char3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(char4 x)
 {
   return convert_ulong4_sat(x);
@@ -19512,7 +19512,7 @@ ulong4 convert_ulong4_sat_rtz(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(char4 x)
 {
   return convert_ulong4_sat(x);
@@ -19521,7 +19521,7 @@ ulong4 convert_ulong4_sat_rte(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(char4 x)
 {
   return convert_ulong4_sat(x);
@@ -19530,7 +19530,7 @@ ulong4 convert_ulong4_sat_rtp(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(char4 x)
 {
   return convert_ulong4_sat(x);
@@ -19539,7 +19539,7 @@ ulong4 convert_ulong4_sat_rtn(char4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(char8 x)
 {
   return convert_ulong8_sat(x);
@@ -19548,7 +19548,7 @@ ulong8 convert_ulong8_sat_rtz(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(char8 x)
 {
   return convert_ulong8_sat(x);
@@ -19557,7 +19557,7 @@ ulong8 convert_ulong8_sat_rte(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(char8 x)
 {
   return convert_ulong8_sat(x);
@@ -19566,7 +19566,7 @@ ulong8 convert_ulong8_sat_rtp(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(char8 x)
 {
   return convert_ulong8_sat(x);
@@ -19575,7 +19575,7 @@ ulong8 convert_ulong8_sat_rtn(char8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(char16 x)
 {
   return convert_ulong16_sat(x);
@@ -19584,7 +19584,7 @@ ulong16 convert_ulong16_sat_rtz(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(char16 x)
 {
   return convert_ulong16_sat(x);
@@ -19593,7 +19593,7 @@ ulong16 convert_ulong16_sat_rte(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(char16 x)
 {
   return convert_ulong16_sat(x);
@@ -19602,7 +19602,7 @@ ulong16 convert_ulong16_sat_rtp(char16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(char16 x)
 {
   return convert_ulong16_sat(x);
@@ -19610,1008 +19610,1008 @@ ulong16 convert_ulong16_sat_rtn(char16 x)
 
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(uchar x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(uchar x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(uchar x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(uchar x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(uchar2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(uchar2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(uchar2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(uchar2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(uchar3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(uchar3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(uchar3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(uchar3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(uchar4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(uchar4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(uchar4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(uchar4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(uchar8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(uchar8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(uchar8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(uchar8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(uchar16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(uchar16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(uchar16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(uchar16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(uchar x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(uchar x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(uchar x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(uchar x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(uchar2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(uchar2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(uchar2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(uchar2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(uchar3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(uchar3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(uchar3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(uchar3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(uchar4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(uchar4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(uchar4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(uchar4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(uchar8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(uchar8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(uchar8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(uchar8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(uchar16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(uchar16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(uchar16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(uchar16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(uchar x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(uchar x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(uchar x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(uchar x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(uchar2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(uchar2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(uchar2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(uchar2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(uchar3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(uchar3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(uchar3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(uchar3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(uchar4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(uchar4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(uchar4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(uchar4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(uchar8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(uchar8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(uchar8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(uchar8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(uchar16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(uchar16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(uchar16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(uchar16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(uchar x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(uchar x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(uchar x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(uchar x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(uchar2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(uchar2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(uchar2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(uchar2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(uchar3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(uchar3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(uchar3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(uchar3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(uchar4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(uchar4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(uchar4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(uchar4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(uchar8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(uchar8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(uchar8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(uchar8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(uchar16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(uchar16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(uchar16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(uchar16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(uchar x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(uchar x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(uchar x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(uchar x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(uchar2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(uchar2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(uchar2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(uchar2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(uchar3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(uchar3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(uchar3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(uchar3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(uchar4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(uchar4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(uchar4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(uchar4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(uchar8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(uchar8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(uchar8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(uchar8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(uchar16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(uchar16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(uchar16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(uchar16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(uchar x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(uchar x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(uchar x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(uchar x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(uchar2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(uchar2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(uchar2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(uchar2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(uchar3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(uchar3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(uchar3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(uchar3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(uchar4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(uchar4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(uchar4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(uchar4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(uchar8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(uchar8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(uchar8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(uchar8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(uchar16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(uchar16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(uchar16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(uchar16 x)
 {
   return convert_uint16_sat(x);
@@ -20619,7 +20619,7 @@ uint16 convert_uint16_sat_rtn(uchar16 x)
 
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(uchar x)
 {
   return convert_long_sat(x);
@@ -20628,7 +20628,7 @@ long convert_long_sat_rtz(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(uchar x)
 {
   return convert_long_sat(x);
@@ -20637,7 +20637,7 @@ long convert_long_sat_rte(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(uchar x)
 {
   return convert_long_sat(x);
@@ -20646,7 +20646,7 @@ long convert_long_sat_rtp(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(uchar x)
 {
   return convert_long_sat(x);
@@ -20655,7 +20655,7 @@ long convert_long_sat_rtn(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(uchar2 x)
 {
   return convert_long2_sat(x);
@@ -20664,7 +20664,7 @@ long2 convert_long2_sat_rtz(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(uchar2 x)
 {
   return convert_long2_sat(x);
@@ -20673,7 +20673,7 @@ long2 convert_long2_sat_rte(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(uchar2 x)
 {
   return convert_long2_sat(x);
@@ -20682,7 +20682,7 @@ long2 convert_long2_sat_rtp(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(uchar2 x)
 {
   return convert_long2_sat(x);
@@ -20691,7 +20691,7 @@ long2 convert_long2_sat_rtn(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(uchar3 x)
 {
   return convert_long3_sat(x);
@@ -20700,7 +20700,7 @@ long3 convert_long3_sat_rtz(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(uchar3 x)
 {
   return convert_long3_sat(x);
@@ -20709,7 +20709,7 @@ long3 convert_long3_sat_rte(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(uchar3 x)
 {
   return convert_long3_sat(x);
@@ -20718,7 +20718,7 @@ long3 convert_long3_sat_rtp(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(uchar3 x)
 {
   return convert_long3_sat(x);
@@ -20727,7 +20727,7 @@ long3 convert_long3_sat_rtn(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(uchar4 x)
 {
   return convert_long4_sat(x);
@@ -20736,7 +20736,7 @@ long4 convert_long4_sat_rtz(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(uchar4 x)
 {
   return convert_long4_sat(x);
@@ -20745,7 +20745,7 @@ long4 convert_long4_sat_rte(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(uchar4 x)
 {
   return convert_long4_sat(x);
@@ -20754,7 +20754,7 @@ long4 convert_long4_sat_rtp(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(uchar4 x)
 {
   return convert_long4_sat(x);
@@ -20763,7 +20763,7 @@ long4 convert_long4_sat_rtn(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(uchar8 x)
 {
   return convert_long8_sat(x);
@@ -20772,7 +20772,7 @@ long8 convert_long8_sat_rtz(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(uchar8 x)
 {
   return convert_long8_sat(x);
@@ -20781,7 +20781,7 @@ long8 convert_long8_sat_rte(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(uchar8 x)
 {
   return convert_long8_sat(x);
@@ -20790,7 +20790,7 @@ long8 convert_long8_sat_rtp(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(uchar8 x)
 {
   return convert_long8_sat(x);
@@ -20799,7 +20799,7 @@ long8 convert_long8_sat_rtn(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(uchar16 x)
 {
   return convert_long16_sat(x);
@@ -20808,7 +20808,7 @@ long16 convert_long16_sat_rtz(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(uchar16 x)
 {
   return convert_long16_sat(x);
@@ -20817,7 +20817,7 @@ long16 convert_long16_sat_rte(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(uchar16 x)
 {
   return convert_long16_sat(x);
@@ -20826,7 +20826,7 @@ long16 convert_long16_sat_rtp(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(uchar16 x)
 {
   return convert_long16_sat(x);
@@ -20835,7 +20835,7 @@ long16 convert_long16_sat_rtn(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(uchar x)
 {
   return convert_ulong_sat(x);
@@ -20844,7 +20844,7 @@ ulong convert_ulong_sat_rtz(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(uchar x)
 {
   return convert_ulong_sat(x);
@@ -20853,7 +20853,7 @@ ulong convert_ulong_sat_rte(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(uchar x)
 {
   return convert_ulong_sat(x);
@@ -20862,7 +20862,7 @@ ulong convert_ulong_sat_rtp(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(uchar x)
 {
   return convert_ulong_sat(x);
@@ -20871,7 +20871,7 @@ ulong convert_ulong_sat_rtn(uchar x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(uchar2 x)
 {
   return convert_ulong2_sat(x);
@@ -20880,7 +20880,7 @@ ulong2 convert_ulong2_sat_rtz(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(uchar2 x)
 {
   return convert_ulong2_sat(x);
@@ -20889,7 +20889,7 @@ ulong2 convert_ulong2_sat_rte(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(uchar2 x)
 {
   return convert_ulong2_sat(x);
@@ -20898,7 +20898,7 @@ ulong2 convert_ulong2_sat_rtp(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(uchar2 x)
 {
   return convert_ulong2_sat(x);
@@ -20907,7 +20907,7 @@ ulong2 convert_ulong2_sat_rtn(uchar2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(uchar3 x)
 {
   return convert_ulong3_sat(x);
@@ -20916,7 +20916,7 @@ ulong3 convert_ulong3_sat_rtz(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(uchar3 x)
 {
   return convert_ulong3_sat(x);
@@ -20925,7 +20925,7 @@ ulong3 convert_ulong3_sat_rte(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(uchar3 x)
 {
   return convert_ulong3_sat(x);
@@ -20934,7 +20934,7 @@ ulong3 convert_ulong3_sat_rtp(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(uchar3 x)
 {
   return convert_ulong3_sat(x);
@@ -20943,7 +20943,7 @@ ulong3 convert_ulong3_sat_rtn(uchar3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(uchar4 x)
 {
   return convert_ulong4_sat(x);
@@ -20952,7 +20952,7 @@ ulong4 convert_ulong4_sat_rtz(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(uchar4 x)
 {
   return convert_ulong4_sat(x);
@@ -20961,7 +20961,7 @@ ulong4 convert_ulong4_sat_rte(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(uchar4 x)
 {
   return convert_ulong4_sat(x);
@@ -20970,7 +20970,7 @@ ulong4 convert_ulong4_sat_rtp(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(uchar4 x)
 {
   return convert_ulong4_sat(x);
@@ -20979,7 +20979,7 @@ ulong4 convert_ulong4_sat_rtn(uchar4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(uchar8 x)
 {
   return convert_ulong8_sat(x);
@@ -20988,7 +20988,7 @@ ulong8 convert_ulong8_sat_rtz(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(uchar8 x)
 {
   return convert_ulong8_sat(x);
@@ -20997,7 +20997,7 @@ ulong8 convert_ulong8_sat_rte(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(uchar8 x)
 {
   return convert_ulong8_sat(x);
@@ -21006,7 +21006,7 @@ ulong8 convert_ulong8_sat_rtp(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(uchar8 x)
 {
   return convert_ulong8_sat(x);
@@ -21015,7 +21015,7 @@ ulong8 convert_ulong8_sat_rtn(uchar8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(uchar16 x)
 {
   return convert_ulong16_sat(x);
@@ -21024,7 +21024,7 @@ ulong16 convert_ulong16_sat_rtz(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(uchar16 x)
 {
   return convert_ulong16_sat(x);
@@ -21033,7 +21033,7 @@ ulong16 convert_ulong16_sat_rte(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(uchar16 x)
 {
   return convert_ulong16_sat(x);
@@ -21042,7 +21042,7 @@ ulong16 convert_ulong16_sat_rtp(uchar16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(uchar16 x)
 {
   return convert_ulong16_sat(x);
@@ -21050,1008 +21050,1008 @@ ulong16 convert_ulong16_sat_rtn(uchar16 x)
 
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(short x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(short x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(short x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(short x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(short2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(short2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(short2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(short2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(short3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(short3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(short3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(short3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(short4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(short4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(short4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(short4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(short8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(short8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(short8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(short8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(short16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(short16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(short16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(short16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(short x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(short x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(short x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(short x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(short2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(short2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(short2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(short2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(short3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(short3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(short3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(short3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(short4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(short4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(short4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(short4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(short8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(short8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(short8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(short8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(short16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(short16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(short16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(short16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(short x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(short x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(short x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(short x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(short2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(short2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(short2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(short2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(short3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(short3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(short3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(short3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(short4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(short4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(short4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(short4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(short8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(short8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(short8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(short8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(short16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(short16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(short16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(short16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(short x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(short x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(short x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(short x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(short2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(short2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(short2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(short2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(short3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(short3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(short3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(short3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(short4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(short4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(short4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(short4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(short8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(short8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(short8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(short8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(short16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(short16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(short16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(short16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(short x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(short x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(short x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(short x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(short2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(short2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(short2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(short2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(short3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(short3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(short3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(short3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(short4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(short4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(short4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(short4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(short8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(short8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(short8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(short8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(short16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(short16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(short16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(short16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(short x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(short x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(short x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(short x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(short2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(short2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(short2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(short2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(short3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(short3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(short3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(short3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(short4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(short4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(short4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(short4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(short8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(short8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(short8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(short8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(short16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(short16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(short16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(short16 x)
 {
   return convert_uint16_sat(x);
@@ -22059,7 +22059,7 @@ uint16 convert_uint16_sat_rtn(short16 x)
 
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(short x)
 {
   return convert_long_sat(x);
@@ -22068,7 +22068,7 @@ long convert_long_sat_rtz(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(short x)
 {
   return convert_long_sat(x);
@@ -22077,7 +22077,7 @@ long convert_long_sat_rte(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(short x)
 {
   return convert_long_sat(x);
@@ -22086,7 +22086,7 @@ long convert_long_sat_rtp(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(short x)
 {
   return convert_long_sat(x);
@@ -22095,7 +22095,7 @@ long convert_long_sat_rtn(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(short2 x)
 {
   return convert_long2_sat(x);
@@ -22104,7 +22104,7 @@ long2 convert_long2_sat_rtz(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(short2 x)
 {
   return convert_long2_sat(x);
@@ -22113,7 +22113,7 @@ long2 convert_long2_sat_rte(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(short2 x)
 {
   return convert_long2_sat(x);
@@ -22122,7 +22122,7 @@ long2 convert_long2_sat_rtp(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(short2 x)
 {
   return convert_long2_sat(x);
@@ -22131,7 +22131,7 @@ long2 convert_long2_sat_rtn(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(short3 x)
 {
   return convert_long3_sat(x);
@@ -22140,7 +22140,7 @@ long3 convert_long3_sat_rtz(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(short3 x)
 {
   return convert_long3_sat(x);
@@ -22149,7 +22149,7 @@ long3 convert_long3_sat_rte(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(short3 x)
 {
   return convert_long3_sat(x);
@@ -22158,7 +22158,7 @@ long3 convert_long3_sat_rtp(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(short3 x)
 {
   return convert_long3_sat(x);
@@ -22167,7 +22167,7 @@ long3 convert_long3_sat_rtn(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(short4 x)
 {
   return convert_long4_sat(x);
@@ -22176,7 +22176,7 @@ long4 convert_long4_sat_rtz(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(short4 x)
 {
   return convert_long4_sat(x);
@@ -22185,7 +22185,7 @@ long4 convert_long4_sat_rte(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(short4 x)
 {
   return convert_long4_sat(x);
@@ -22194,7 +22194,7 @@ long4 convert_long4_sat_rtp(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(short4 x)
 {
   return convert_long4_sat(x);
@@ -22203,7 +22203,7 @@ long4 convert_long4_sat_rtn(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(short8 x)
 {
   return convert_long8_sat(x);
@@ -22212,7 +22212,7 @@ long8 convert_long8_sat_rtz(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(short8 x)
 {
   return convert_long8_sat(x);
@@ -22221,7 +22221,7 @@ long8 convert_long8_sat_rte(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(short8 x)
 {
   return convert_long8_sat(x);
@@ -22230,7 +22230,7 @@ long8 convert_long8_sat_rtp(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(short8 x)
 {
   return convert_long8_sat(x);
@@ -22239,7 +22239,7 @@ long8 convert_long8_sat_rtn(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(short16 x)
 {
   return convert_long16_sat(x);
@@ -22248,7 +22248,7 @@ long16 convert_long16_sat_rtz(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(short16 x)
 {
   return convert_long16_sat(x);
@@ -22257,7 +22257,7 @@ long16 convert_long16_sat_rte(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(short16 x)
 {
   return convert_long16_sat(x);
@@ -22266,7 +22266,7 @@ long16 convert_long16_sat_rtp(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(short16 x)
 {
   return convert_long16_sat(x);
@@ -22275,7 +22275,7 @@ long16 convert_long16_sat_rtn(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(short x)
 {
   return convert_ulong_sat(x);
@@ -22284,7 +22284,7 @@ ulong convert_ulong_sat_rtz(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(short x)
 {
   return convert_ulong_sat(x);
@@ -22293,7 +22293,7 @@ ulong convert_ulong_sat_rte(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(short x)
 {
   return convert_ulong_sat(x);
@@ -22302,7 +22302,7 @@ ulong convert_ulong_sat_rtp(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(short x)
 {
   return convert_ulong_sat(x);
@@ -22311,7 +22311,7 @@ ulong convert_ulong_sat_rtn(short x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(short2 x)
 {
   return convert_ulong2_sat(x);
@@ -22320,7 +22320,7 @@ ulong2 convert_ulong2_sat_rtz(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(short2 x)
 {
   return convert_ulong2_sat(x);
@@ -22329,7 +22329,7 @@ ulong2 convert_ulong2_sat_rte(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(short2 x)
 {
   return convert_ulong2_sat(x);
@@ -22338,7 +22338,7 @@ ulong2 convert_ulong2_sat_rtp(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(short2 x)
 {
   return convert_ulong2_sat(x);
@@ -22347,7 +22347,7 @@ ulong2 convert_ulong2_sat_rtn(short2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(short3 x)
 {
   return convert_ulong3_sat(x);
@@ -22356,7 +22356,7 @@ ulong3 convert_ulong3_sat_rtz(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(short3 x)
 {
   return convert_ulong3_sat(x);
@@ -22365,7 +22365,7 @@ ulong3 convert_ulong3_sat_rte(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(short3 x)
 {
   return convert_ulong3_sat(x);
@@ -22374,7 +22374,7 @@ ulong3 convert_ulong3_sat_rtp(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(short3 x)
 {
   return convert_ulong3_sat(x);
@@ -22383,7 +22383,7 @@ ulong3 convert_ulong3_sat_rtn(short3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(short4 x)
 {
   return convert_ulong4_sat(x);
@@ -22392,7 +22392,7 @@ ulong4 convert_ulong4_sat_rtz(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(short4 x)
 {
   return convert_ulong4_sat(x);
@@ -22401,7 +22401,7 @@ ulong4 convert_ulong4_sat_rte(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(short4 x)
 {
   return convert_ulong4_sat(x);
@@ -22410,7 +22410,7 @@ ulong4 convert_ulong4_sat_rtp(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(short4 x)
 {
   return convert_ulong4_sat(x);
@@ -22419,7 +22419,7 @@ ulong4 convert_ulong4_sat_rtn(short4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(short8 x)
 {
   return convert_ulong8_sat(x);
@@ -22428,7 +22428,7 @@ ulong8 convert_ulong8_sat_rtz(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(short8 x)
 {
   return convert_ulong8_sat(x);
@@ -22437,7 +22437,7 @@ ulong8 convert_ulong8_sat_rte(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(short8 x)
 {
   return convert_ulong8_sat(x);
@@ -22446,7 +22446,7 @@ ulong8 convert_ulong8_sat_rtp(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(short8 x)
 {
   return convert_ulong8_sat(x);
@@ -22455,7 +22455,7 @@ ulong8 convert_ulong8_sat_rtn(short8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(short16 x)
 {
   return convert_ulong16_sat(x);
@@ -22464,7 +22464,7 @@ ulong16 convert_ulong16_sat_rtz(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(short16 x)
 {
   return convert_ulong16_sat(x);
@@ -22473,7 +22473,7 @@ ulong16 convert_ulong16_sat_rte(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(short16 x)
 {
   return convert_ulong16_sat(x);
@@ -22482,7 +22482,7 @@ ulong16 convert_ulong16_sat_rtp(short16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(short16 x)
 {
   return convert_ulong16_sat(x);
@@ -22490,1008 +22490,1008 @@ ulong16 convert_ulong16_sat_rtn(short16 x)
 
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(ushort x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(ushort x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(ushort x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(ushort x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(ushort2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(ushort2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(ushort2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(ushort2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(ushort3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(ushort3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(ushort3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(ushort3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(ushort4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(ushort4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(ushort4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(ushort4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(ushort8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(ushort8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(ushort8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(ushort8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(ushort16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(ushort16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(ushort16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(ushort16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(ushort x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(ushort x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(ushort x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(ushort x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(ushort2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(ushort2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(ushort2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(ushort2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(ushort3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(ushort3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(ushort3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(ushort3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(ushort4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(ushort4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(ushort4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(ushort4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(ushort8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(ushort8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(ushort8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(ushort8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(ushort16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(ushort16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(ushort16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(ushort16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(ushort x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(ushort x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(ushort x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(ushort x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(ushort2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(ushort2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(ushort2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(ushort2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(ushort3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(ushort3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(ushort3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(ushort3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(ushort4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(ushort4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(ushort4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(ushort4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(ushort8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(ushort8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(ushort8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(ushort8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(ushort16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(ushort16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(ushort16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(ushort16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(ushort x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(ushort x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(ushort x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(ushort x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(ushort2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(ushort2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(ushort2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(ushort2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(ushort3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(ushort3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(ushort3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(ushort3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(ushort4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(ushort4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(ushort4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(ushort4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(ushort8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(ushort8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(ushort8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(ushort8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(ushort16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(ushort16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(ushort16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(ushort16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(ushort x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(ushort x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(ushort x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(ushort x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(ushort2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(ushort2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(ushort2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(ushort2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(ushort3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(ushort3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(ushort3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(ushort3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(ushort4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(ushort4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(ushort4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(ushort4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(ushort8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(ushort8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(ushort8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(ushort8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(ushort16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(ushort16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(ushort16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(ushort16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(ushort x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(ushort x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(ushort x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(ushort x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(ushort2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(ushort2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(ushort2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(ushort2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(ushort3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(ushort3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(ushort3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(ushort3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(ushort4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(ushort4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(ushort4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(ushort4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(ushort8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(ushort8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(ushort8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(ushort8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(ushort16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(ushort16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(ushort16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(ushort16 x)
 {
   return convert_uint16_sat(x);
@@ -23499,7 +23499,7 @@ uint16 convert_uint16_sat_rtn(ushort16 x)
 
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(ushort x)
 {
   return convert_long_sat(x);
@@ -23508,7 +23508,7 @@ long convert_long_sat_rtz(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(ushort x)
 {
   return convert_long_sat(x);
@@ -23517,7 +23517,7 @@ long convert_long_sat_rte(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(ushort x)
 {
   return convert_long_sat(x);
@@ -23526,7 +23526,7 @@ long convert_long_sat_rtp(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(ushort x)
 {
   return convert_long_sat(x);
@@ -23535,7 +23535,7 @@ long convert_long_sat_rtn(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(ushort2 x)
 {
   return convert_long2_sat(x);
@@ -23544,7 +23544,7 @@ long2 convert_long2_sat_rtz(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(ushort2 x)
 {
   return convert_long2_sat(x);
@@ -23553,7 +23553,7 @@ long2 convert_long2_sat_rte(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(ushort2 x)
 {
   return convert_long2_sat(x);
@@ -23562,7 +23562,7 @@ long2 convert_long2_sat_rtp(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(ushort2 x)
 {
   return convert_long2_sat(x);
@@ -23571,7 +23571,7 @@ long2 convert_long2_sat_rtn(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(ushort3 x)
 {
   return convert_long3_sat(x);
@@ -23580,7 +23580,7 @@ long3 convert_long3_sat_rtz(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(ushort3 x)
 {
   return convert_long3_sat(x);
@@ -23589,7 +23589,7 @@ long3 convert_long3_sat_rte(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(ushort3 x)
 {
   return convert_long3_sat(x);
@@ -23598,7 +23598,7 @@ long3 convert_long3_sat_rtp(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(ushort3 x)
 {
   return convert_long3_sat(x);
@@ -23607,7 +23607,7 @@ long3 convert_long3_sat_rtn(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(ushort4 x)
 {
   return convert_long4_sat(x);
@@ -23616,7 +23616,7 @@ long4 convert_long4_sat_rtz(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(ushort4 x)
 {
   return convert_long4_sat(x);
@@ -23625,7 +23625,7 @@ long4 convert_long4_sat_rte(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(ushort4 x)
 {
   return convert_long4_sat(x);
@@ -23634,7 +23634,7 @@ long4 convert_long4_sat_rtp(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(ushort4 x)
 {
   return convert_long4_sat(x);
@@ -23643,7 +23643,7 @@ long4 convert_long4_sat_rtn(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(ushort8 x)
 {
   return convert_long8_sat(x);
@@ -23652,7 +23652,7 @@ long8 convert_long8_sat_rtz(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(ushort8 x)
 {
   return convert_long8_sat(x);
@@ -23661,7 +23661,7 @@ long8 convert_long8_sat_rte(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(ushort8 x)
 {
   return convert_long8_sat(x);
@@ -23670,7 +23670,7 @@ long8 convert_long8_sat_rtp(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(ushort8 x)
 {
   return convert_long8_sat(x);
@@ -23679,7 +23679,7 @@ long8 convert_long8_sat_rtn(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(ushort16 x)
 {
   return convert_long16_sat(x);
@@ -23688,7 +23688,7 @@ long16 convert_long16_sat_rtz(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(ushort16 x)
 {
   return convert_long16_sat(x);
@@ -23697,7 +23697,7 @@ long16 convert_long16_sat_rte(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(ushort16 x)
 {
   return convert_long16_sat(x);
@@ -23706,7 +23706,7 @@ long16 convert_long16_sat_rtp(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(ushort16 x)
 {
   return convert_long16_sat(x);
@@ -23715,7 +23715,7 @@ long16 convert_long16_sat_rtn(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(ushort x)
 {
   return convert_ulong_sat(x);
@@ -23724,7 +23724,7 @@ ulong convert_ulong_sat_rtz(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(ushort x)
 {
   return convert_ulong_sat(x);
@@ -23733,7 +23733,7 @@ ulong convert_ulong_sat_rte(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(ushort x)
 {
   return convert_ulong_sat(x);
@@ -23742,7 +23742,7 @@ ulong convert_ulong_sat_rtp(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(ushort x)
 {
   return convert_ulong_sat(x);
@@ -23751,7 +23751,7 @@ ulong convert_ulong_sat_rtn(ushort x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(ushort2 x)
 {
   return convert_ulong2_sat(x);
@@ -23760,7 +23760,7 @@ ulong2 convert_ulong2_sat_rtz(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(ushort2 x)
 {
   return convert_ulong2_sat(x);
@@ -23769,7 +23769,7 @@ ulong2 convert_ulong2_sat_rte(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(ushort2 x)
 {
   return convert_ulong2_sat(x);
@@ -23778,7 +23778,7 @@ ulong2 convert_ulong2_sat_rtp(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(ushort2 x)
 {
   return convert_ulong2_sat(x);
@@ -23787,7 +23787,7 @@ ulong2 convert_ulong2_sat_rtn(ushort2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(ushort3 x)
 {
   return convert_ulong3_sat(x);
@@ -23796,7 +23796,7 @@ ulong3 convert_ulong3_sat_rtz(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(ushort3 x)
 {
   return convert_ulong3_sat(x);
@@ -23805,7 +23805,7 @@ ulong3 convert_ulong3_sat_rte(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(ushort3 x)
 {
   return convert_ulong3_sat(x);
@@ -23814,7 +23814,7 @@ ulong3 convert_ulong3_sat_rtp(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(ushort3 x)
 {
   return convert_ulong3_sat(x);
@@ -23823,7 +23823,7 @@ ulong3 convert_ulong3_sat_rtn(ushort3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(ushort4 x)
 {
   return convert_ulong4_sat(x);
@@ -23832,7 +23832,7 @@ ulong4 convert_ulong4_sat_rtz(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(ushort4 x)
 {
   return convert_ulong4_sat(x);
@@ -23841,7 +23841,7 @@ ulong4 convert_ulong4_sat_rte(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(ushort4 x)
 {
   return convert_ulong4_sat(x);
@@ -23850,7 +23850,7 @@ ulong4 convert_ulong4_sat_rtp(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(ushort4 x)
 {
   return convert_ulong4_sat(x);
@@ -23859,7 +23859,7 @@ ulong4 convert_ulong4_sat_rtn(ushort4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(ushort8 x)
 {
   return convert_ulong8_sat(x);
@@ -23868,7 +23868,7 @@ ulong8 convert_ulong8_sat_rtz(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(ushort8 x)
 {
   return convert_ulong8_sat(x);
@@ -23877,7 +23877,7 @@ ulong8 convert_ulong8_sat_rte(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(ushort8 x)
 {
   return convert_ulong8_sat(x);
@@ -23886,7 +23886,7 @@ ulong8 convert_ulong8_sat_rtp(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(ushort8 x)
 {
   return convert_ulong8_sat(x);
@@ -23895,7 +23895,7 @@ ulong8 convert_ulong8_sat_rtn(ushort8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(ushort16 x)
 {
   return convert_ulong16_sat(x);
@@ -23904,7 +23904,7 @@ ulong16 convert_ulong16_sat_rtz(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(ushort16 x)
 {
   return convert_ulong16_sat(x);
@@ -23913,7 +23913,7 @@ ulong16 convert_ulong16_sat_rte(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(ushort16 x)
 {
   return convert_ulong16_sat(x);
@@ -23922,7 +23922,7 @@ ulong16 convert_ulong16_sat_rtp(ushort16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(ushort16 x)
 {
   return convert_ulong16_sat(x);
@@ -23930,1008 +23930,1008 @@ ulong16 convert_ulong16_sat_rtn(ushort16 x)
 
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(int x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(int x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(int x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(int x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(int2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(int2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(int2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(int2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(int3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(int3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(int3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(int3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(int4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(int4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(int4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(int4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(int8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(int8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(int8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(int8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(int16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(int16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(int16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(int16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(int x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(int x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(int x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(int x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(int2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(int2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(int2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(int2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(int3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(int3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(int3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(int3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(int4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(int4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(int4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(int4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(int8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(int8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(int8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(int8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(int16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(int16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(int16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(int16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(int x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(int x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(int x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(int x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(int2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(int2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(int2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(int2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(int3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(int3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(int3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(int3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(int4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(int4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(int4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(int4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(int8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(int8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(int8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(int8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(int16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(int16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(int16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(int16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(int x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(int x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(int x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(int x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(int2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(int2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(int2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(int2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(int3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(int3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(int3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(int3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(int4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(int4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(int4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(int4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(int8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(int8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(int8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(int8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(int16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(int16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(int16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(int16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(int x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(int x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(int x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(int x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(int2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(int2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(int2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(int2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(int3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(int3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(int3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(int3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(int4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(int4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(int4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(int4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(int8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(int8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(int8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(int8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(int16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(int16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(int16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(int16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(int x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(int x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(int x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(int x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(int2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(int2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(int2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(int2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(int3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(int3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(int3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(int3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(int4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(int4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(int4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(int4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(int8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(int8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(int8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(int8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(int16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(int16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(int16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(int16 x)
 {
   return convert_uint16_sat(x);
@@ -24939,7 +24939,7 @@ uint16 convert_uint16_sat_rtn(int16 x)
 
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(int x)
 {
   return convert_long_sat(x);
@@ -24948,7 +24948,7 @@ long convert_long_sat_rtz(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(int x)
 {
   return convert_long_sat(x);
@@ -24957,7 +24957,7 @@ long convert_long_sat_rte(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(int x)
 {
   return convert_long_sat(x);
@@ -24966,7 +24966,7 @@ long convert_long_sat_rtp(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(int x)
 {
   return convert_long_sat(x);
@@ -24975,7 +24975,7 @@ long convert_long_sat_rtn(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(int2 x)
 {
   return convert_long2_sat(x);
@@ -24984,7 +24984,7 @@ long2 convert_long2_sat_rtz(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(int2 x)
 {
   return convert_long2_sat(x);
@@ -24993,7 +24993,7 @@ long2 convert_long2_sat_rte(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(int2 x)
 {
   return convert_long2_sat(x);
@@ -25002,7 +25002,7 @@ long2 convert_long2_sat_rtp(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(int2 x)
 {
   return convert_long2_sat(x);
@@ -25011,7 +25011,7 @@ long2 convert_long2_sat_rtn(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(int3 x)
 {
   return convert_long3_sat(x);
@@ -25020,7 +25020,7 @@ long3 convert_long3_sat_rtz(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(int3 x)
 {
   return convert_long3_sat(x);
@@ -25029,7 +25029,7 @@ long3 convert_long3_sat_rte(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(int3 x)
 {
   return convert_long3_sat(x);
@@ -25038,7 +25038,7 @@ long3 convert_long3_sat_rtp(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(int3 x)
 {
   return convert_long3_sat(x);
@@ -25047,7 +25047,7 @@ long3 convert_long3_sat_rtn(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(int4 x)
 {
   return convert_long4_sat(x);
@@ -25056,7 +25056,7 @@ long4 convert_long4_sat_rtz(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(int4 x)
 {
   return convert_long4_sat(x);
@@ -25065,7 +25065,7 @@ long4 convert_long4_sat_rte(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(int4 x)
 {
   return convert_long4_sat(x);
@@ -25074,7 +25074,7 @@ long4 convert_long4_sat_rtp(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(int4 x)
 {
   return convert_long4_sat(x);
@@ -25083,7 +25083,7 @@ long4 convert_long4_sat_rtn(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(int8 x)
 {
   return convert_long8_sat(x);
@@ -25092,7 +25092,7 @@ long8 convert_long8_sat_rtz(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(int8 x)
 {
   return convert_long8_sat(x);
@@ -25101,7 +25101,7 @@ long8 convert_long8_sat_rte(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(int8 x)
 {
   return convert_long8_sat(x);
@@ -25110,7 +25110,7 @@ long8 convert_long8_sat_rtp(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(int8 x)
 {
   return convert_long8_sat(x);
@@ -25119,7 +25119,7 @@ long8 convert_long8_sat_rtn(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(int16 x)
 {
   return convert_long16_sat(x);
@@ -25128,7 +25128,7 @@ long16 convert_long16_sat_rtz(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(int16 x)
 {
   return convert_long16_sat(x);
@@ -25137,7 +25137,7 @@ long16 convert_long16_sat_rte(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(int16 x)
 {
   return convert_long16_sat(x);
@@ -25146,7 +25146,7 @@ long16 convert_long16_sat_rtp(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(int16 x)
 {
   return convert_long16_sat(x);
@@ -25155,7 +25155,7 @@ long16 convert_long16_sat_rtn(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(int x)
 {
   return convert_ulong_sat(x);
@@ -25164,7 +25164,7 @@ ulong convert_ulong_sat_rtz(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(int x)
 {
   return convert_ulong_sat(x);
@@ -25173,7 +25173,7 @@ ulong convert_ulong_sat_rte(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(int x)
 {
   return convert_ulong_sat(x);
@@ -25182,7 +25182,7 @@ ulong convert_ulong_sat_rtp(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(int x)
 {
   return convert_ulong_sat(x);
@@ -25191,7 +25191,7 @@ ulong convert_ulong_sat_rtn(int x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(int2 x)
 {
   return convert_ulong2_sat(x);
@@ -25200,7 +25200,7 @@ ulong2 convert_ulong2_sat_rtz(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(int2 x)
 {
   return convert_ulong2_sat(x);
@@ -25209,7 +25209,7 @@ ulong2 convert_ulong2_sat_rte(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(int2 x)
 {
   return convert_ulong2_sat(x);
@@ -25218,7 +25218,7 @@ ulong2 convert_ulong2_sat_rtp(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(int2 x)
 {
   return convert_ulong2_sat(x);
@@ -25227,7 +25227,7 @@ ulong2 convert_ulong2_sat_rtn(int2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(int3 x)
 {
   return convert_ulong3_sat(x);
@@ -25236,7 +25236,7 @@ ulong3 convert_ulong3_sat_rtz(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(int3 x)
 {
   return convert_ulong3_sat(x);
@@ -25245,7 +25245,7 @@ ulong3 convert_ulong3_sat_rte(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(int3 x)
 {
   return convert_ulong3_sat(x);
@@ -25254,7 +25254,7 @@ ulong3 convert_ulong3_sat_rtp(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(int3 x)
 {
   return convert_ulong3_sat(x);
@@ -25263,7 +25263,7 @@ ulong3 convert_ulong3_sat_rtn(int3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(int4 x)
 {
   return convert_ulong4_sat(x);
@@ -25272,7 +25272,7 @@ ulong4 convert_ulong4_sat_rtz(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(int4 x)
 {
   return convert_ulong4_sat(x);
@@ -25281,7 +25281,7 @@ ulong4 convert_ulong4_sat_rte(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(int4 x)
 {
   return convert_ulong4_sat(x);
@@ -25290,7 +25290,7 @@ ulong4 convert_ulong4_sat_rtp(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(int4 x)
 {
   return convert_ulong4_sat(x);
@@ -25299,7 +25299,7 @@ ulong4 convert_ulong4_sat_rtn(int4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(int8 x)
 {
   return convert_ulong8_sat(x);
@@ -25308,7 +25308,7 @@ ulong8 convert_ulong8_sat_rtz(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(int8 x)
 {
   return convert_ulong8_sat(x);
@@ -25317,7 +25317,7 @@ ulong8 convert_ulong8_sat_rte(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(int8 x)
 {
   return convert_ulong8_sat(x);
@@ -25326,7 +25326,7 @@ ulong8 convert_ulong8_sat_rtp(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(int8 x)
 {
   return convert_ulong8_sat(x);
@@ -25335,7 +25335,7 @@ ulong8 convert_ulong8_sat_rtn(int8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(int16 x)
 {
   return convert_ulong16_sat(x);
@@ -25344,7 +25344,7 @@ ulong16 convert_ulong16_sat_rtz(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(int16 x)
 {
   return convert_ulong16_sat(x);
@@ -25353,7 +25353,7 @@ ulong16 convert_ulong16_sat_rte(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(int16 x)
 {
   return convert_ulong16_sat(x);
@@ -25362,7 +25362,7 @@ ulong16 convert_ulong16_sat_rtp(int16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(int16 x)
 {
   return convert_ulong16_sat(x);
@@ -25370,1008 +25370,1008 @@ ulong16 convert_ulong16_sat_rtn(int16 x)
 
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(uint x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(uint x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(uint x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(uint x)
 {
   return convert_char_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(uint2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(uint2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(uint2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(uint2 x)
 {
   return convert_char2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(uint3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(uint3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(uint3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(uint3 x)
 {
   return convert_char3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(uint4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(uint4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(uint4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(uint4 x)
 {
   return convert_char4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(uint8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(uint8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(uint8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(uint8 x)
 {
   return convert_char8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(uint16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(uint16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(uint16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(uint16 x)
 {
   return convert_char16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(uint x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(uint x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(uint x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(uint x)
 {
   return convert_uchar_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(uint2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(uint2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(uint2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(uint2 x)
 {
   return convert_uchar2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(uint3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(uint3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(uint3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(uint3 x)
 {
   return convert_uchar3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(uint4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(uint4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(uint4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(uint4 x)
 {
   return convert_uchar4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(uint8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(uint8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(uint8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(uint8 x)
 {
   return convert_uchar8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(uint16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(uint16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(uint16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(uint16 x)
 {
   return convert_uchar16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(uint x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(uint x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(uint x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(uint x)
 {
   return convert_short_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(uint2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(uint2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(uint2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(uint2 x)
 {
   return convert_short2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(uint3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(uint3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(uint3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(uint3 x)
 {
   return convert_short3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(uint4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(uint4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(uint4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(uint4 x)
 {
   return convert_short4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(uint8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(uint8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(uint8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(uint8 x)
 {
   return convert_short8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(uint16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(uint16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(uint16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(uint16 x)
 {
   return convert_short16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(uint x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(uint x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(uint x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(uint x)
 {
   return convert_ushort_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(uint2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(uint2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(uint2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(uint2 x)
 {
   return convert_ushort2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(uint3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(uint3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(uint3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(uint3 x)
 {
   return convert_ushort3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(uint4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(uint4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(uint4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(uint4 x)
 {
   return convert_ushort4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(uint8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(uint8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(uint8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(uint8 x)
 {
   return convert_ushort8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(uint16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(uint16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(uint16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(uint16 x)
 {
   return convert_ushort16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(uint x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(uint x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(uint x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(uint x)
 {
   return convert_int_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(uint2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(uint2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(uint2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(uint2 x)
 {
   return convert_int2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(uint3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(uint3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(uint3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(uint3 x)
 {
   return convert_int3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(uint4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(uint4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(uint4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(uint4 x)
 {
   return convert_int4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(uint8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(uint8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(uint8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(uint8 x)
 {
   return convert_int8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(uint16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(uint16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(uint16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(uint16 x)
 {
   return convert_int16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(uint x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(uint x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(uint x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(uint x)
 {
   return convert_uint_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(uint2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(uint2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(uint2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(uint2 x)
 {
   return convert_uint2_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(uint3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(uint3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(uint3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(uint3 x)
 {
   return convert_uint3_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(uint4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(uint4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(uint4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(uint4 x)
 {
   return convert_uint4_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(uint8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(uint8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(uint8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(uint8 x)
 {
   return convert_uint8_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(uint16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(uint16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(uint16 x)
 {
   return convert_uint16_sat(x);
 }
 
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(uint16 x)
 {
   return convert_uint16_sat(x);
@@ -26379,7 +26379,7 @@ uint16 convert_uint16_sat_rtn(uint16 x)
 
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(uint x)
 {
   return convert_long_sat(x);
@@ -26388,7 +26388,7 @@ long convert_long_sat_rtz(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(uint x)
 {
   return convert_long_sat(x);
@@ -26397,7 +26397,7 @@ long convert_long_sat_rte(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(uint x)
 {
   return convert_long_sat(x);
@@ -26406,7 +26406,7 @@ long convert_long_sat_rtp(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(uint x)
 {
   return convert_long_sat(x);
@@ -26415,7 +26415,7 @@ long convert_long_sat_rtn(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(uint2 x)
 {
   return convert_long2_sat(x);
@@ -26424,7 +26424,7 @@ long2 convert_long2_sat_rtz(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(uint2 x)
 {
   return convert_long2_sat(x);
@@ -26433,7 +26433,7 @@ long2 convert_long2_sat_rte(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(uint2 x)
 {
   return convert_long2_sat(x);
@@ -26442,7 +26442,7 @@ long2 convert_long2_sat_rtp(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(uint2 x)
 {
   return convert_long2_sat(x);
@@ -26451,7 +26451,7 @@ long2 convert_long2_sat_rtn(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(uint3 x)
 {
   return convert_long3_sat(x);
@@ -26460,7 +26460,7 @@ long3 convert_long3_sat_rtz(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(uint3 x)
 {
   return convert_long3_sat(x);
@@ -26469,7 +26469,7 @@ long3 convert_long3_sat_rte(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(uint3 x)
 {
   return convert_long3_sat(x);
@@ -26478,7 +26478,7 @@ long3 convert_long3_sat_rtp(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(uint3 x)
 {
   return convert_long3_sat(x);
@@ -26487,7 +26487,7 @@ long3 convert_long3_sat_rtn(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(uint4 x)
 {
   return convert_long4_sat(x);
@@ -26496,7 +26496,7 @@ long4 convert_long4_sat_rtz(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(uint4 x)
 {
   return convert_long4_sat(x);
@@ -26505,7 +26505,7 @@ long4 convert_long4_sat_rte(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(uint4 x)
 {
   return convert_long4_sat(x);
@@ -26514,7 +26514,7 @@ long4 convert_long4_sat_rtp(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(uint4 x)
 {
   return convert_long4_sat(x);
@@ -26523,7 +26523,7 @@ long4 convert_long4_sat_rtn(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(uint8 x)
 {
   return convert_long8_sat(x);
@@ -26532,7 +26532,7 @@ long8 convert_long8_sat_rtz(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(uint8 x)
 {
   return convert_long8_sat(x);
@@ -26541,7 +26541,7 @@ long8 convert_long8_sat_rte(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(uint8 x)
 {
   return convert_long8_sat(x);
@@ -26550,7 +26550,7 @@ long8 convert_long8_sat_rtp(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(uint8 x)
 {
   return convert_long8_sat(x);
@@ -26559,7 +26559,7 @@ long8 convert_long8_sat_rtn(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(uint16 x)
 {
   return convert_long16_sat(x);
@@ -26568,7 +26568,7 @@ long16 convert_long16_sat_rtz(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(uint16 x)
 {
   return convert_long16_sat(x);
@@ -26577,7 +26577,7 @@ long16 convert_long16_sat_rte(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(uint16 x)
 {
   return convert_long16_sat(x);
@@ -26586,7 +26586,7 @@ long16 convert_long16_sat_rtp(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(uint16 x)
 {
   return convert_long16_sat(x);
@@ -26595,7 +26595,7 @@ long16 convert_long16_sat_rtn(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(uint x)
 {
   return convert_ulong_sat(x);
@@ -26604,7 +26604,7 @@ ulong convert_ulong_sat_rtz(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(uint x)
 {
   return convert_ulong_sat(x);
@@ -26613,7 +26613,7 @@ ulong convert_ulong_sat_rte(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(uint x)
 {
   return convert_ulong_sat(x);
@@ -26622,7 +26622,7 @@ ulong convert_ulong_sat_rtp(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(uint x)
 {
   return convert_ulong_sat(x);
@@ -26631,7 +26631,7 @@ ulong convert_ulong_sat_rtn(uint x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(uint2 x)
 {
   return convert_ulong2_sat(x);
@@ -26640,7 +26640,7 @@ ulong2 convert_ulong2_sat_rtz(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(uint2 x)
 {
   return convert_ulong2_sat(x);
@@ -26649,7 +26649,7 @@ ulong2 convert_ulong2_sat_rte(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(uint2 x)
 {
   return convert_ulong2_sat(x);
@@ -26658,7 +26658,7 @@ ulong2 convert_ulong2_sat_rtp(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(uint2 x)
 {
   return convert_ulong2_sat(x);
@@ -26667,7 +26667,7 @@ ulong2 convert_ulong2_sat_rtn(uint2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(uint3 x)
 {
   return convert_ulong3_sat(x);
@@ -26676,7 +26676,7 @@ ulong3 convert_ulong3_sat_rtz(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(uint3 x)
 {
   return convert_ulong3_sat(x);
@@ -26685,7 +26685,7 @@ ulong3 convert_ulong3_sat_rte(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(uint3 x)
 {
   return convert_ulong3_sat(x);
@@ -26694,7 +26694,7 @@ ulong3 convert_ulong3_sat_rtp(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(uint3 x)
 {
   return convert_ulong3_sat(x);
@@ -26703,7 +26703,7 @@ ulong3 convert_ulong3_sat_rtn(uint3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(uint4 x)
 {
   return convert_ulong4_sat(x);
@@ -26712,7 +26712,7 @@ ulong4 convert_ulong4_sat_rtz(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(uint4 x)
 {
   return convert_ulong4_sat(x);
@@ -26721,7 +26721,7 @@ ulong4 convert_ulong4_sat_rte(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(uint4 x)
 {
   return convert_ulong4_sat(x);
@@ -26730,7 +26730,7 @@ ulong4 convert_ulong4_sat_rtp(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(uint4 x)
 {
   return convert_ulong4_sat(x);
@@ -26739,7 +26739,7 @@ ulong4 convert_ulong4_sat_rtn(uint4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(uint8 x)
 {
   return convert_ulong8_sat(x);
@@ -26748,7 +26748,7 @@ ulong8 convert_ulong8_sat_rtz(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(uint8 x)
 {
   return convert_ulong8_sat(x);
@@ -26757,7 +26757,7 @@ ulong8 convert_ulong8_sat_rte(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(uint8 x)
 {
   return convert_ulong8_sat(x);
@@ -26766,7 +26766,7 @@ ulong8 convert_ulong8_sat_rtp(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(uint8 x)
 {
   return convert_ulong8_sat(x);
@@ -26775,7 +26775,7 @@ ulong8 convert_ulong8_sat_rtn(uint8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(uint16 x)
 {
   return convert_ulong16_sat(x);
@@ -26784,7 +26784,7 @@ ulong16 convert_ulong16_sat_rtz(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(uint16 x)
 {
   return convert_ulong16_sat(x);
@@ -26793,7 +26793,7 @@ ulong16 convert_ulong16_sat_rte(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(uint16 x)
 {
   return convert_ulong16_sat(x);
@@ -26802,7 +26802,7 @@ ulong16 convert_ulong16_sat_rtp(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(uint16 x)
 {
   return convert_ulong16_sat(x);
@@ -26811,7 +26811,7 @@ ulong16 convert_ulong16_sat_rtn(uint16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(long x)
 {
   return convert_char_sat(x);
@@ -26820,7 +26820,7 @@ char convert_char_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(long x)
 {
   return convert_char_sat(x);
@@ -26829,7 +26829,7 @@ char convert_char_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(long x)
 {
   return convert_char_sat(x);
@@ -26838,7 +26838,7 @@ char convert_char_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(long x)
 {
   return convert_char_sat(x);
@@ -26847,7 +26847,7 @@ char convert_char_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(long2 x)
 {
   return convert_char2_sat(x);
@@ -26856,7 +26856,7 @@ char2 convert_char2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(long2 x)
 {
   return convert_char2_sat(x);
@@ -26865,7 +26865,7 @@ char2 convert_char2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(long2 x)
 {
   return convert_char2_sat(x);
@@ -26874,7 +26874,7 @@ char2 convert_char2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(long2 x)
 {
   return convert_char2_sat(x);
@@ -26883,7 +26883,7 @@ char2 convert_char2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(long3 x)
 {
   return convert_char3_sat(x);
@@ -26892,7 +26892,7 @@ char3 convert_char3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(long3 x)
 {
   return convert_char3_sat(x);
@@ -26901,7 +26901,7 @@ char3 convert_char3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(long3 x)
 {
   return convert_char3_sat(x);
@@ -26910,7 +26910,7 @@ char3 convert_char3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(long3 x)
 {
   return convert_char3_sat(x);
@@ -26919,7 +26919,7 @@ char3 convert_char3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(long4 x)
 {
   return convert_char4_sat(x);
@@ -26928,7 +26928,7 @@ char4 convert_char4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(long4 x)
 {
   return convert_char4_sat(x);
@@ -26937,7 +26937,7 @@ char4 convert_char4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(long4 x)
 {
   return convert_char4_sat(x);
@@ -26946,7 +26946,7 @@ char4 convert_char4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(long4 x)
 {
   return convert_char4_sat(x);
@@ -26955,7 +26955,7 @@ char4 convert_char4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(long8 x)
 {
   return convert_char8_sat(x);
@@ -26964,7 +26964,7 @@ char8 convert_char8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(long8 x)
 {
   return convert_char8_sat(x);
@@ -26973,7 +26973,7 @@ char8 convert_char8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(long8 x)
 {
   return convert_char8_sat(x);
@@ -26982,7 +26982,7 @@ char8 convert_char8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(long8 x)
 {
   return convert_char8_sat(x);
@@ -26991,7 +26991,7 @@ char8 convert_char8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(long16 x)
 {
   return convert_char16_sat(x);
@@ -27000,7 +27000,7 @@ char16 convert_char16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(long16 x)
 {
   return convert_char16_sat(x);
@@ -27009,7 +27009,7 @@ char16 convert_char16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(long16 x)
 {
   return convert_char16_sat(x);
@@ -27018,7 +27018,7 @@ char16 convert_char16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(long16 x)
 {
   return convert_char16_sat(x);
@@ -27027,7 +27027,7 @@ char16 convert_char16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(long x)
 {
   return convert_uchar_sat(x);
@@ -27036,7 +27036,7 @@ uchar convert_uchar_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(long x)
 {
   return convert_uchar_sat(x);
@@ -27045,7 +27045,7 @@ uchar convert_uchar_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(long x)
 {
   return convert_uchar_sat(x);
@@ -27054,7 +27054,7 @@ uchar convert_uchar_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(long x)
 {
   return convert_uchar_sat(x);
@@ -27063,7 +27063,7 @@ uchar convert_uchar_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(long2 x)
 {
   return convert_uchar2_sat(x);
@@ -27072,7 +27072,7 @@ uchar2 convert_uchar2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(long2 x)
 {
   return convert_uchar2_sat(x);
@@ -27081,7 +27081,7 @@ uchar2 convert_uchar2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(long2 x)
 {
   return convert_uchar2_sat(x);
@@ -27090,7 +27090,7 @@ uchar2 convert_uchar2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(long2 x)
 {
   return convert_uchar2_sat(x);
@@ -27099,7 +27099,7 @@ uchar2 convert_uchar2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(long3 x)
 {
   return convert_uchar3_sat(x);
@@ -27108,7 +27108,7 @@ uchar3 convert_uchar3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(long3 x)
 {
   return convert_uchar3_sat(x);
@@ -27117,7 +27117,7 @@ uchar3 convert_uchar3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(long3 x)
 {
   return convert_uchar3_sat(x);
@@ -27126,7 +27126,7 @@ uchar3 convert_uchar3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(long3 x)
 {
   return convert_uchar3_sat(x);
@@ -27135,7 +27135,7 @@ uchar3 convert_uchar3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(long4 x)
 {
   return convert_uchar4_sat(x);
@@ -27144,7 +27144,7 @@ uchar4 convert_uchar4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(long4 x)
 {
   return convert_uchar4_sat(x);
@@ -27153,7 +27153,7 @@ uchar4 convert_uchar4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(long4 x)
 {
   return convert_uchar4_sat(x);
@@ -27162,7 +27162,7 @@ uchar4 convert_uchar4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(long4 x)
 {
   return convert_uchar4_sat(x);
@@ -27171,7 +27171,7 @@ uchar4 convert_uchar4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(long8 x)
 {
   return convert_uchar8_sat(x);
@@ -27180,7 +27180,7 @@ uchar8 convert_uchar8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(long8 x)
 {
   return convert_uchar8_sat(x);
@@ -27189,7 +27189,7 @@ uchar8 convert_uchar8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(long8 x)
 {
   return convert_uchar8_sat(x);
@@ -27198,7 +27198,7 @@ uchar8 convert_uchar8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(long8 x)
 {
   return convert_uchar8_sat(x);
@@ -27207,7 +27207,7 @@ uchar8 convert_uchar8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(long16 x)
 {
   return convert_uchar16_sat(x);
@@ -27216,7 +27216,7 @@ uchar16 convert_uchar16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(long16 x)
 {
   return convert_uchar16_sat(x);
@@ -27225,7 +27225,7 @@ uchar16 convert_uchar16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(long16 x)
 {
   return convert_uchar16_sat(x);
@@ -27234,7 +27234,7 @@ uchar16 convert_uchar16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(long16 x)
 {
   return convert_uchar16_sat(x);
@@ -27243,7 +27243,7 @@ uchar16 convert_uchar16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(long x)
 {
   return convert_short_sat(x);
@@ -27252,7 +27252,7 @@ short convert_short_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(long x)
 {
   return convert_short_sat(x);
@@ -27261,7 +27261,7 @@ short convert_short_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(long x)
 {
   return convert_short_sat(x);
@@ -27270,7 +27270,7 @@ short convert_short_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(long x)
 {
   return convert_short_sat(x);
@@ -27279,7 +27279,7 @@ short convert_short_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(long2 x)
 {
   return convert_short2_sat(x);
@@ -27288,7 +27288,7 @@ short2 convert_short2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(long2 x)
 {
   return convert_short2_sat(x);
@@ -27297,7 +27297,7 @@ short2 convert_short2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(long2 x)
 {
   return convert_short2_sat(x);
@@ -27306,7 +27306,7 @@ short2 convert_short2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(long2 x)
 {
   return convert_short2_sat(x);
@@ -27315,7 +27315,7 @@ short2 convert_short2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(long3 x)
 {
   return convert_short3_sat(x);
@@ -27324,7 +27324,7 @@ short3 convert_short3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(long3 x)
 {
   return convert_short3_sat(x);
@@ -27333,7 +27333,7 @@ short3 convert_short3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(long3 x)
 {
   return convert_short3_sat(x);
@@ -27342,7 +27342,7 @@ short3 convert_short3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(long3 x)
 {
   return convert_short3_sat(x);
@@ -27351,7 +27351,7 @@ short3 convert_short3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(long4 x)
 {
   return convert_short4_sat(x);
@@ -27360,7 +27360,7 @@ short4 convert_short4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(long4 x)
 {
   return convert_short4_sat(x);
@@ -27369,7 +27369,7 @@ short4 convert_short4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(long4 x)
 {
   return convert_short4_sat(x);
@@ -27378,7 +27378,7 @@ short4 convert_short4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(long4 x)
 {
   return convert_short4_sat(x);
@@ -27387,7 +27387,7 @@ short4 convert_short4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(long8 x)
 {
   return convert_short8_sat(x);
@@ -27396,7 +27396,7 @@ short8 convert_short8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(long8 x)
 {
   return convert_short8_sat(x);
@@ -27405,7 +27405,7 @@ short8 convert_short8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(long8 x)
 {
   return convert_short8_sat(x);
@@ -27414,7 +27414,7 @@ short8 convert_short8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(long8 x)
 {
   return convert_short8_sat(x);
@@ -27423,7 +27423,7 @@ short8 convert_short8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(long16 x)
 {
   return convert_short16_sat(x);
@@ -27432,7 +27432,7 @@ short16 convert_short16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(long16 x)
 {
   return convert_short16_sat(x);
@@ -27441,7 +27441,7 @@ short16 convert_short16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(long16 x)
 {
   return convert_short16_sat(x);
@@ -27450,7 +27450,7 @@ short16 convert_short16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(long16 x)
 {
   return convert_short16_sat(x);
@@ -27459,7 +27459,7 @@ short16 convert_short16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(long x)
 {
   return convert_ushort_sat(x);
@@ -27468,7 +27468,7 @@ ushort convert_ushort_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(long x)
 {
   return convert_ushort_sat(x);
@@ -27477,7 +27477,7 @@ ushort convert_ushort_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(long x)
 {
   return convert_ushort_sat(x);
@@ -27486,7 +27486,7 @@ ushort convert_ushort_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(long x)
 {
   return convert_ushort_sat(x);
@@ -27495,7 +27495,7 @@ ushort convert_ushort_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(long2 x)
 {
   return convert_ushort2_sat(x);
@@ -27504,7 +27504,7 @@ ushort2 convert_ushort2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(long2 x)
 {
   return convert_ushort2_sat(x);
@@ -27513,7 +27513,7 @@ ushort2 convert_ushort2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(long2 x)
 {
   return convert_ushort2_sat(x);
@@ -27522,7 +27522,7 @@ ushort2 convert_ushort2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(long2 x)
 {
   return convert_ushort2_sat(x);
@@ -27531,7 +27531,7 @@ ushort2 convert_ushort2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(long3 x)
 {
   return convert_ushort3_sat(x);
@@ -27540,7 +27540,7 @@ ushort3 convert_ushort3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(long3 x)
 {
   return convert_ushort3_sat(x);
@@ -27549,7 +27549,7 @@ ushort3 convert_ushort3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(long3 x)
 {
   return convert_ushort3_sat(x);
@@ -27558,7 +27558,7 @@ ushort3 convert_ushort3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(long3 x)
 {
   return convert_ushort3_sat(x);
@@ -27567,7 +27567,7 @@ ushort3 convert_ushort3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(long4 x)
 {
   return convert_ushort4_sat(x);
@@ -27576,7 +27576,7 @@ ushort4 convert_ushort4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(long4 x)
 {
   return convert_ushort4_sat(x);
@@ -27585,7 +27585,7 @@ ushort4 convert_ushort4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(long4 x)
 {
   return convert_ushort4_sat(x);
@@ -27594,7 +27594,7 @@ ushort4 convert_ushort4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(long4 x)
 {
   return convert_ushort4_sat(x);
@@ -27603,7 +27603,7 @@ ushort4 convert_ushort4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(long8 x)
 {
   return convert_ushort8_sat(x);
@@ -27612,7 +27612,7 @@ ushort8 convert_ushort8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(long8 x)
 {
   return convert_ushort8_sat(x);
@@ -27621,7 +27621,7 @@ ushort8 convert_ushort8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(long8 x)
 {
   return convert_ushort8_sat(x);
@@ -27630,7 +27630,7 @@ ushort8 convert_ushort8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(long8 x)
 {
   return convert_ushort8_sat(x);
@@ -27639,7 +27639,7 @@ ushort8 convert_ushort8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(long16 x)
 {
   return convert_ushort16_sat(x);
@@ -27648,7 +27648,7 @@ ushort16 convert_ushort16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(long16 x)
 {
   return convert_ushort16_sat(x);
@@ -27657,7 +27657,7 @@ ushort16 convert_ushort16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(long16 x)
 {
   return convert_ushort16_sat(x);
@@ -27666,7 +27666,7 @@ ushort16 convert_ushort16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(long16 x)
 {
   return convert_ushort16_sat(x);
@@ -27675,7 +27675,7 @@ ushort16 convert_ushort16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(long x)
 {
   return convert_int_sat(x);
@@ -27684,7 +27684,7 @@ int convert_int_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(long x)
 {
   return convert_int_sat(x);
@@ -27693,7 +27693,7 @@ int convert_int_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(long x)
 {
   return convert_int_sat(x);
@@ -27702,7 +27702,7 @@ int convert_int_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(long x)
 {
   return convert_int_sat(x);
@@ -27711,7 +27711,7 @@ int convert_int_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(long2 x)
 {
   return convert_int2_sat(x);
@@ -27720,7 +27720,7 @@ int2 convert_int2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(long2 x)
 {
   return convert_int2_sat(x);
@@ -27729,7 +27729,7 @@ int2 convert_int2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(long2 x)
 {
   return convert_int2_sat(x);
@@ -27738,7 +27738,7 @@ int2 convert_int2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(long2 x)
 {
   return convert_int2_sat(x);
@@ -27747,7 +27747,7 @@ int2 convert_int2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(long3 x)
 {
   return convert_int3_sat(x);
@@ -27756,7 +27756,7 @@ int3 convert_int3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(long3 x)
 {
   return convert_int3_sat(x);
@@ -27765,7 +27765,7 @@ int3 convert_int3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(long3 x)
 {
   return convert_int3_sat(x);
@@ -27774,7 +27774,7 @@ int3 convert_int3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(long3 x)
 {
   return convert_int3_sat(x);
@@ -27783,7 +27783,7 @@ int3 convert_int3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(long4 x)
 {
   return convert_int4_sat(x);
@@ -27792,7 +27792,7 @@ int4 convert_int4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(long4 x)
 {
   return convert_int4_sat(x);
@@ -27801,7 +27801,7 @@ int4 convert_int4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(long4 x)
 {
   return convert_int4_sat(x);
@@ -27810,7 +27810,7 @@ int4 convert_int4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(long4 x)
 {
   return convert_int4_sat(x);
@@ -27819,7 +27819,7 @@ int4 convert_int4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(long8 x)
 {
   return convert_int8_sat(x);
@@ -27828,7 +27828,7 @@ int8 convert_int8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(long8 x)
 {
   return convert_int8_sat(x);
@@ -27837,7 +27837,7 @@ int8 convert_int8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(long8 x)
 {
   return convert_int8_sat(x);
@@ -27846,7 +27846,7 @@ int8 convert_int8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(long8 x)
 {
   return convert_int8_sat(x);
@@ -27855,7 +27855,7 @@ int8 convert_int8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(long16 x)
 {
   return convert_int16_sat(x);
@@ -27864,7 +27864,7 @@ int16 convert_int16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(long16 x)
 {
   return convert_int16_sat(x);
@@ -27873,7 +27873,7 @@ int16 convert_int16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(long16 x)
 {
   return convert_int16_sat(x);
@@ -27882,7 +27882,7 @@ int16 convert_int16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(long16 x)
 {
   return convert_int16_sat(x);
@@ -27891,7 +27891,7 @@ int16 convert_int16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(long x)
 {
   return convert_uint_sat(x);
@@ -27900,7 +27900,7 @@ uint convert_uint_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(long x)
 {
   return convert_uint_sat(x);
@@ -27909,7 +27909,7 @@ uint convert_uint_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(long x)
 {
   return convert_uint_sat(x);
@@ -27918,7 +27918,7 @@ uint convert_uint_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(long x)
 {
   return convert_uint_sat(x);
@@ -27927,7 +27927,7 @@ uint convert_uint_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(long2 x)
 {
   return convert_uint2_sat(x);
@@ -27936,7 +27936,7 @@ uint2 convert_uint2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(long2 x)
 {
   return convert_uint2_sat(x);
@@ -27945,7 +27945,7 @@ uint2 convert_uint2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(long2 x)
 {
   return convert_uint2_sat(x);
@@ -27954,7 +27954,7 @@ uint2 convert_uint2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(long2 x)
 {
   return convert_uint2_sat(x);
@@ -27963,7 +27963,7 @@ uint2 convert_uint2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(long3 x)
 {
   return convert_uint3_sat(x);
@@ -27972,7 +27972,7 @@ uint3 convert_uint3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(long3 x)
 {
   return convert_uint3_sat(x);
@@ -27981,7 +27981,7 @@ uint3 convert_uint3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(long3 x)
 {
   return convert_uint3_sat(x);
@@ -27990,7 +27990,7 @@ uint3 convert_uint3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(long3 x)
 {
   return convert_uint3_sat(x);
@@ -27999,7 +27999,7 @@ uint3 convert_uint3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(long4 x)
 {
   return convert_uint4_sat(x);
@@ -28008,7 +28008,7 @@ uint4 convert_uint4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(long4 x)
 {
   return convert_uint4_sat(x);
@@ -28017,7 +28017,7 @@ uint4 convert_uint4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(long4 x)
 {
   return convert_uint4_sat(x);
@@ -28026,7 +28026,7 @@ uint4 convert_uint4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(long4 x)
 {
   return convert_uint4_sat(x);
@@ -28035,7 +28035,7 @@ uint4 convert_uint4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(long8 x)
 {
   return convert_uint8_sat(x);
@@ -28044,7 +28044,7 @@ uint8 convert_uint8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(long8 x)
 {
   return convert_uint8_sat(x);
@@ -28053,7 +28053,7 @@ uint8 convert_uint8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(long8 x)
 {
   return convert_uint8_sat(x);
@@ -28062,7 +28062,7 @@ uint8 convert_uint8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(long8 x)
 {
   return convert_uint8_sat(x);
@@ -28071,7 +28071,7 @@ uint8 convert_uint8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(long16 x)
 {
   return convert_uint16_sat(x);
@@ -28080,7 +28080,7 @@ uint16 convert_uint16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(long16 x)
 {
   return convert_uint16_sat(x);
@@ -28089,7 +28089,7 @@ uint16 convert_uint16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(long16 x)
 {
   return convert_uint16_sat(x);
@@ -28098,7 +28098,7 @@ uint16 convert_uint16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(long16 x)
 {
   return convert_uint16_sat(x);
@@ -28107,7 +28107,7 @@ uint16 convert_uint16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(long x)
 {
   return convert_long_sat(x);
@@ -28116,7 +28116,7 @@ long convert_long_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(long x)
 {
   return convert_long_sat(x);
@@ -28125,7 +28125,7 @@ long convert_long_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(long x)
 {
   return convert_long_sat(x);
@@ -28134,7 +28134,7 @@ long convert_long_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(long x)
 {
   return convert_long_sat(x);
@@ -28143,7 +28143,7 @@ long convert_long_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(long2 x)
 {
   return convert_long2_sat(x);
@@ -28152,7 +28152,7 @@ long2 convert_long2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(long2 x)
 {
   return convert_long2_sat(x);
@@ -28161,7 +28161,7 @@ long2 convert_long2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(long2 x)
 {
   return convert_long2_sat(x);
@@ -28170,7 +28170,7 @@ long2 convert_long2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(long2 x)
 {
   return convert_long2_sat(x);
@@ -28179,7 +28179,7 @@ long2 convert_long2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(long3 x)
 {
   return convert_long3_sat(x);
@@ -28188,7 +28188,7 @@ long3 convert_long3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(long3 x)
 {
   return convert_long3_sat(x);
@@ -28197,7 +28197,7 @@ long3 convert_long3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(long3 x)
 {
   return convert_long3_sat(x);
@@ -28206,7 +28206,7 @@ long3 convert_long3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(long3 x)
 {
   return convert_long3_sat(x);
@@ -28215,7 +28215,7 @@ long3 convert_long3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(long4 x)
 {
   return convert_long4_sat(x);
@@ -28224,7 +28224,7 @@ long4 convert_long4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(long4 x)
 {
   return convert_long4_sat(x);
@@ -28233,7 +28233,7 @@ long4 convert_long4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(long4 x)
 {
   return convert_long4_sat(x);
@@ -28242,7 +28242,7 @@ long4 convert_long4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(long4 x)
 {
   return convert_long4_sat(x);
@@ -28251,7 +28251,7 @@ long4 convert_long4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(long8 x)
 {
   return convert_long8_sat(x);
@@ -28260,7 +28260,7 @@ long8 convert_long8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(long8 x)
 {
   return convert_long8_sat(x);
@@ -28269,7 +28269,7 @@ long8 convert_long8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(long8 x)
 {
   return convert_long8_sat(x);
@@ -28278,7 +28278,7 @@ long8 convert_long8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(long8 x)
 {
   return convert_long8_sat(x);
@@ -28287,7 +28287,7 @@ long8 convert_long8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(long16 x)
 {
   return convert_long16_sat(x);
@@ -28296,7 +28296,7 @@ long16 convert_long16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(long16 x)
 {
   return convert_long16_sat(x);
@@ -28305,7 +28305,7 @@ long16 convert_long16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(long16 x)
 {
   return convert_long16_sat(x);
@@ -28314,7 +28314,7 @@ long16 convert_long16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(long16 x)
 {
   return convert_long16_sat(x);
@@ -28323,7 +28323,7 @@ long16 convert_long16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(long x)
 {
   return convert_ulong_sat(x);
@@ -28332,7 +28332,7 @@ ulong convert_ulong_sat_rtz(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(long x)
 {
   return convert_ulong_sat(x);
@@ -28341,7 +28341,7 @@ ulong convert_ulong_sat_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(long x)
 {
   return convert_ulong_sat(x);
@@ -28350,7 +28350,7 @@ ulong convert_ulong_sat_rtp(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(long x)
 {
   return convert_ulong_sat(x);
@@ -28359,7 +28359,7 @@ ulong convert_ulong_sat_rtn(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(long2 x)
 {
   return convert_ulong2_sat(x);
@@ -28368,7 +28368,7 @@ ulong2 convert_ulong2_sat_rtz(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(long2 x)
 {
   return convert_ulong2_sat(x);
@@ -28377,7 +28377,7 @@ ulong2 convert_ulong2_sat_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(long2 x)
 {
   return convert_ulong2_sat(x);
@@ -28386,7 +28386,7 @@ ulong2 convert_ulong2_sat_rtp(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(long2 x)
 {
   return convert_ulong2_sat(x);
@@ -28395,7 +28395,7 @@ ulong2 convert_ulong2_sat_rtn(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(long3 x)
 {
   return convert_ulong3_sat(x);
@@ -28404,7 +28404,7 @@ ulong3 convert_ulong3_sat_rtz(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(long3 x)
 {
   return convert_ulong3_sat(x);
@@ -28413,7 +28413,7 @@ ulong3 convert_ulong3_sat_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(long3 x)
 {
   return convert_ulong3_sat(x);
@@ -28422,7 +28422,7 @@ ulong3 convert_ulong3_sat_rtp(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(long3 x)
 {
   return convert_ulong3_sat(x);
@@ -28431,7 +28431,7 @@ ulong3 convert_ulong3_sat_rtn(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(long4 x)
 {
   return convert_ulong4_sat(x);
@@ -28440,7 +28440,7 @@ ulong4 convert_ulong4_sat_rtz(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(long4 x)
 {
   return convert_ulong4_sat(x);
@@ -28449,7 +28449,7 @@ ulong4 convert_ulong4_sat_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(long4 x)
 {
   return convert_ulong4_sat(x);
@@ -28458,7 +28458,7 @@ ulong4 convert_ulong4_sat_rtp(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(long4 x)
 {
   return convert_ulong4_sat(x);
@@ -28467,7 +28467,7 @@ ulong4 convert_ulong4_sat_rtn(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(long8 x)
 {
   return convert_ulong8_sat(x);
@@ -28476,7 +28476,7 @@ ulong8 convert_ulong8_sat_rtz(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(long8 x)
 {
   return convert_ulong8_sat(x);
@@ -28485,7 +28485,7 @@ ulong8 convert_ulong8_sat_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(long8 x)
 {
   return convert_ulong8_sat(x);
@@ -28494,7 +28494,7 @@ ulong8 convert_ulong8_sat_rtp(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(long8 x)
 {
   return convert_ulong8_sat(x);
@@ -28503,7 +28503,7 @@ ulong8 convert_ulong8_sat_rtn(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(long16 x)
 {
   return convert_ulong16_sat(x);
@@ -28512,7 +28512,7 @@ ulong16 convert_ulong16_sat_rtz(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(long16 x)
 {
   return convert_ulong16_sat(x);
@@ -28521,7 +28521,7 @@ ulong16 convert_ulong16_sat_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(long16 x)
 {
   return convert_ulong16_sat(x);
@@ -28530,7 +28530,7 @@ ulong16 convert_ulong16_sat_rtp(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(long16 x)
 {
   return convert_ulong16_sat(x);
@@ -28539,7 +28539,7 @@ ulong16 convert_ulong16_sat_rtn(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(ulong x)
 {
   return convert_char_sat(x);
@@ -28548,7 +28548,7 @@ char convert_char_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(ulong x)
 {
   return convert_char_sat(x);
@@ -28557,7 +28557,7 @@ char convert_char_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(ulong x)
 {
   return convert_char_sat(x);
@@ -28566,7 +28566,7 @@ char convert_char_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(ulong x)
 {
   return convert_char_sat(x);
@@ -28575,7 +28575,7 @@ char convert_char_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(ulong2 x)
 {
   return convert_char2_sat(x);
@@ -28584,7 +28584,7 @@ char2 convert_char2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(ulong2 x)
 {
   return convert_char2_sat(x);
@@ -28593,7 +28593,7 @@ char2 convert_char2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(ulong2 x)
 {
   return convert_char2_sat(x);
@@ -28602,7 +28602,7 @@ char2 convert_char2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(ulong2 x)
 {
   return convert_char2_sat(x);
@@ -28611,7 +28611,7 @@ char2 convert_char2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(ulong3 x)
 {
   return convert_char3_sat(x);
@@ -28620,7 +28620,7 @@ char3 convert_char3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(ulong3 x)
 {
   return convert_char3_sat(x);
@@ -28629,7 +28629,7 @@ char3 convert_char3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(ulong3 x)
 {
   return convert_char3_sat(x);
@@ -28638,7 +28638,7 @@ char3 convert_char3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(ulong3 x)
 {
   return convert_char3_sat(x);
@@ -28647,7 +28647,7 @@ char3 convert_char3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(ulong4 x)
 {
   return convert_char4_sat(x);
@@ -28656,7 +28656,7 @@ char4 convert_char4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(ulong4 x)
 {
   return convert_char4_sat(x);
@@ -28665,7 +28665,7 @@ char4 convert_char4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(ulong4 x)
 {
   return convert_char4_sat(x);
@@ -28674,7 +28674,7 @@ char4 convert_char4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(ulong4 x)
 {
   return convert_char4_sat(x);
@@ -28683,7 +28683,7 @@ char4 convert_char4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(ulong8 x)
 {
   return convert_char8_sat(x);
@@ -28692,7 +28692,7 @@ char8 convert_char8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(ulong8 x)
 {
   return convert_char8_sat(x);
@@ -28701,7 +28701,7 @@ char8 convert_char8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(ulong8 x)
 {
   return convert_char8_sat(x);
@@ -28710,7 +28710,7 @@ char8 convert_char8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(ulong8 x)
 {
   return convert_char8_sat(x);
@@ -28719,7 +28719,7 @@ char8 convert_char8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(ulong16 x)
 {
   return convert_char16_sat(x);
@@ -28728,7 +28728,7 @@ char16 convert_char16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(ulong16 x)
 {
   return convert_char16_sat(x);
@@ -28737,7 +28737,7 @@ char16 convert_char16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(ulong16 x)
 {
   return convert_char16_sat(x);
@@ -28746,7 +28746,7 @@ char16 convert_char16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(ulong16 x)
 {
   return convert_char16_sat(x);
@@ -28755,7 +28755,7 @@ char16 convert_char16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(ulong x)
 {
   return convert_uchar_sat(x);
@@ -28764,7 +28764,7 @@ uchar convert_uchar_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(ulong x)
 {
   return convert_uchar_sat(x);
@@ -28773,7 +28773,7 @@ uchar convert_uchar_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(ulong x)
 {
   return convert_uchar_sat(x);
@@ -28782,7 +28782,7 @@ uchar convert_uchar_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(ulong x)
 {
   return convert_uchar_sat(x);
@@ -28791,7 +28791,7 @@ uchar convert_uchar_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(ulong2 x)
 {
   return convert_uchar2_sat(x);
@@ -28800,7 +28800,7 @@ uchar2 convert_uchar2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(ulong2 x)
 {
   return convert_uchar2_sat(x);
@@ -28809,7 +28809,7 @@ uchar2 convert_uchar2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(ulong2 x)
 {
   return convert_uchar2_sat(x);
@@ -28818,7 +28818,7 @@ uchar2 convert_uchar2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(ulong2 x)
 {
   return convert_uchar2_sat(x);
@@ -28827,7 +28827,7 @@ uchar2 convert_uchar2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(ulong3 x)
 {
   return convert_uchar3_sat(x);
@@ -28836,7 +28836,7 @@ uchar3 convert_uchar3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(ulong3 x)
 {
   return convert_uchar3_sat(x);
@@ -28845,7 +28845,7 @@ uchar3 convert_uchar3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(ulong3 x)
 {
   return convert_uchar3_sat(x);
@@ -28854,7 +28854,7 @@ uchar3 convert_uchar3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(ulong3 x)
 {
   return convert_uchar3_sat(x);
@@ -28863,7 +28863,7 @@ uchar3 convert_uchar3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(ulong4 x)
 {
   return convert_uchar4_sat(x);
@@ -28872,7 +28872,7 @@ uchar4 convert_uchar4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(ulong4 x)
 {
   return convert_uchar4_sat(x);
@@ -28881,7 +28881,7 @@ uchar4 convert_uchar4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(ulong4 x)
 {
   return convert_uchar4_sat(x);
@@ -28890,7 +28890,7 @@ uchar4 convert_uchar4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(ulong4 x)
 {
   return convert_uchar4_sat(x);
@@ -28899,7 +28899,7 @@ uchar4 convert_uchar4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(ulong8 x)
 {
   return convert_uchar8_sat(x);
@@ -28908,7 +28908,7 @@ uchar8 convert_uchar8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(ulong8 x)
 {
   return convert_uchar8_sat(x);
@@ -28917,7 +28917,7 @@ uchar8 convert_uchar8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(ulong8 x)
 {
   return convert_uchar8_sat(x);
@@ -28926,7 +28926,7 @@ uchar8 convert_uchar8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(ulong8 x)
 {
   return convert_uchar8_sat(x);
@@ -28935,7 +28935,7 @@ uchar8 convert_uchar8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(ulong16 x)
 {
   return convert_uchar16_sat(x);
@@ -28944,7 +28944,7 @@ uchar16 convert_uchar16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(ulong16 x)
 {
   return convert_uchar16_sat(x);
@@ -28953,7 +28953,7 @@ uchar16 convert_uchar16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(ulong16 x)
 {
   return convert_uchar16_sat(x);
@@ -28962,7 +28962,7 @@ uchar16 convert_uchar16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(ulong16 x)
 {
   return convert_uchar16_sat(x);
@@ -28971,7 +28971,7 @@ uchar16 convert_uchar16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(ulong x)
 {
   return convert_short_sat(x);
@@ -28980,7 +28980,7 @@ short convert_short_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(ulong x)
 {
   return convert_short_sat(x);
@@ -28989,7 +28989,7 @@ short convert_short_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(ulong x)
 {
   return convert_short_sat(x);
@@ -28998,7 +28998,7 @@ short convert_short_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(ulong x)
 {
   return convert_short_sat(x);
@@ -29007,7 +29007,7 @@ short convert_short_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(ulong2 x)
 {
   return convert_short2_sat(x);
@@ -29016,7 +29016,7 @@ short2 convert_short2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(ulong2 x)
 {
   return convert_short2_sat(x);
@@ -29025,7 +29025,7 @@ short2 convert_short2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(ulong2 x)
 {
   return convert_short2_sat(x);
@@ -29034,7 +29034,7 @@ short2 convert_short2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(ulong2 x)
 {
   return convert_short2_sat(x);
@@ -29043,7 +29043,7 @@ short2 convert_short2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(ulong3 x)
 {
   return convert_short3_sat(x);
@@ -29052,7 +29052,7 @@ short3 convert_short3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(ulong3 x)
 {
   return convert_short3_sat(x);
@@ -29061,7 +29061,7 @@ short3 convert_short3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(ulong3 x)
 {
   return convert_short3_sat(x);
@@ -29070,7 +29070,7 @@ short3 convert_short3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(ulong3 x)
 {
   return convert_short3_sat(x);
@@ -29079,7 +29079,7 @@ short3 convert_short3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(ulong4 x)
 {
   return convert_short4_sat(x);
@@ -29088,7 +29088,7 @@ short4 convert_short4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(ulong4 x)
 {
   return convert_short4_sat(x);
@@ -29097,7 +29097,7 @@ short4 convert_short4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(ulong4 x)
 {
   return convert_short4_sat(x);
@@ -29106,7 +29106,7 @@ short4 convert_short4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(ulong4 x)
 {
   return convert_short4_sat(x);
@@ -29115,7 +29115,7 @@ short4 convert_short4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(ulong8 x)
 {
   return convert_short8_sat(x);
@@ -29124,7 +29124,7 @@ short8 convert_short8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(ulong8 x)
 {
   return convert_short8_sat(x);
@@ -29133,7 +29133,7 @@ short8 convert_short8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(ulong8 x)
 {
   return convert_short8_sat(x);
@@ -29142,7 +29142,7 @@ short8 convert_short8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(ulong8 x)
 {
   return convert_short8_sat(x);
@@ -29151,7 +29151,7 @@ short8 convert_short8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(ulong16 x)
 {
   return convert_short16_sat(x);
@@ -29160,7 +29160,7 @@ short16 convert_short16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(ulong16 x)
 {
   return convert_short16_sat(x);
@@ -29169,7 +29169,7 @@ short16 convert_short16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(ulong16 x)
 {
   return convert_short16_sat(x);
@@ -29178,7 +29178,7 @@ short16 convert_short16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(ulong16 x)
 {
   return convert_short16_sat(x);
@@ -29187,7 +29187,7 @@ short16 convert_short16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(ulong x)
 {
   return convert_ushort_sat(x);
@@ -29196,7 +29196,7 @@ ushort convert_ushort_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(ulong x)
 {
   return convert_ushort_sat(x);
@@ -29205,7 +29205,7 @@ ushort convert_ushort_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(ulong x)
 {
   return convert_ushort_sat(x);
@@ -29214,7 +29214,7 @@ ushort convert_ushort_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(ulong x)
 {
   return convert_ushort_sat(x);
@@ -29223,7 +29223,7 @@ ushort convert_ushort_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(ulong2 x)
 {
   return convert_ushort2_sat(x);
@@ -29232,7 +29232,7 @@ ushort2 convert_ushort2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(ulong2 x)
 {
   return convert_ushort2_sat(x);
@@ -29241,7 +29241,7 @@ ushort2 convert_ushort2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(ulong2 x)
 {
   return convert_ushort2_sat(x);
@@ -29250,7 +29250,7 @@ ushort2 convert_ushort2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(ulong2 x)
 {
   return convert_ushort2_sat(x);
@@ -29259,7 +29259,7 @@ ushort2 convert_ushort2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(ulong3 x)
 {
   return convert_ushort3_sat(x);
@@ -29268,7 +29268,7 @@ ushort3 convert_ushort3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(ulong3 x)
 {
   return convert_ushort3_sat(x);
@@ -29277,7 +29277,7 @@ ushort3 convert_ushort3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(ulong3 x)
 {
   return convert_ushort3_sat(x);
@@ -29286,7 +29286,7 @@ ushort3 convert_ushort3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(ulong3 x)
 {
   return convert_ushort3_sat(x);
@@ -29295,7 +29295,7 @@ ushort3 convert_ushort3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(ulong4 x)
 {
   return convert_ushort4_sat(x);
@@ -29304,7 +29304,7 @@ ushort4 convert_ushort4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(ulong4 x)
 {
   return convert_ushort4_sat(x);
@@ -29313,7 +29313,7 @@ ushort4 convert_ushort4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(ulong4 x)
 {
   return convert_ushort4_sat(x);
@@ -29322,7 +29322,7 @@ ushort4 convert_ushort4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(ulong4 x)
 {
   return convert_ushort4_sat(x);
@@ -29331,7 +29331,7 @@ ushort4 convert_ushort4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(ulong8 x)
 {
   return convert_ushort8_sat(x);
@@ -29340,7 +29340,7 @@ ushort8 convert_ushort8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(ulong8 x)
 {
   return convert_ushort8_sat(x);
@@ -29349,7 +29349,7 @@ ushort8 convert_ushort8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(ulong8 x)
 {
   return convert_ushort8_sat(x);
@@ -29358,7 +29358,7 @@ ushort8 convert_ushort8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(ulong8 x)
 {
   return convert_ushort8_sat(x);
@@ -29367,7 +29367,7 @@ ushort8 convert_ushort8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(ulong16 x)
 {
   return convert_ushort16_sat(x);
@@ -29376,7 +29376,7 @@ ushort16 convert_ushort16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(ulong16 x)
 {
   return convert_ushort16_sat(x);
@@ -29385,7 +29385,7 @@ ushort16 convert_ushort16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(ulong16 x)
 {
   return convert_ushort16_sat(x);
@@ -29394,7 +29394,7 @@ ushort16 convert_ushort16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(ulong16 x)
 {
   return convert_ushort16_sat(x);
@@ -29403,7 +29403,7 @@ ushort16 convert_ushort16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(ulong x)
 {
   return convert_int_sat(x);
@@ -29412,7 +29412,7 @@ int convert_int_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(ulong x)
 {
   return convert_int_sat(x);
@@ -29421,7 +29421,7 @@ int convert_int_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(ulong x)
 {
   return convert_int_sat(x);
@@ -29430,7 +29430,7 @@ int convert_int_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(ulong x)
 {
   return convert_int_sat(x);
@@ -29439,7 +29439,7 @@ int convert_int_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(ulong2 x)
 {
   return convert_int2_sat(x);
@@ -29448,7 +29448,7 @@ int2 convert_int2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(ulong2 x)
 {
   return convert_int2_sat(x);
@@ -29457,7 +29457,7 @@ int2 convert_int2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(ulong2 x)
 {
   return convert_int2_sat(x);
@@ -29466,7 +29466,7 @@ int2 convert_int2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(ulong2 x)
 {
   return convert_int2_sat(x);
@@ -29475,7 +29475,7 @@ int2 convert_int2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(ulong3 x)
 {
   return convert_int3_sat(x);
@@ -29484,7 +29484,7 @@ int3 convert_int3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(ulong3 x)
 {
   return convert_int3_sat(x);
@@ -29493,7 +29493,7 @@ int3 convert_int3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(ulong3 x)
 {
   return convert_int3_sat(x);
@@ -29502,7 +29502,7 @@ int3 convert_int3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(ulong3 x)
 {
   return convert_int3_sat(x);
@@ -29511,7 +29511,7 @@ int3 convert_int3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(ulong4 x)
 {
   return convert_int4_sat(x);
@@ -29520,7 +29520,7 @@ int4 convert_int4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(ulong4 x)
 {
   return convert_int4_sat(x);
@@ -29529,7 +29529,7 @@ int4 convert_int4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(ulong4 x)
 {
   return convert_int4_sat(x);
@@ -29538,7 +29538,7 @@ int4 convert_int4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(ulong4 x)
 {
   return convert_int4_sat(x);
@@ -29547,7 +29547,7 @@ int4 convert_int4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(ulong8 x)
 {
   return convert_int8_sat(x);
@@ -29556,7 +29556,7 @@ int8 convert_int8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(ulong8 x)
 {
   return convert_int8_sat(x);
@@ -29565,7 +29565,7 @@ int8 convert_int8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(ulong8 x)
 {
   return convert_int8_sat(x);
@@ -29574,7 +29574,7 @@ int8 convert_int8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(ulong8 x)
 {
   return convert_int8_sat(x);
@@ -29583,7 +29583,7 @@ int8 convert_int8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(ulong16 x)
 {
   return convert_int16_sat(x);
@@ -29592,7 +29592,7 @@ int16 convert_int16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(ulong16 x)
 {
   return convert_int16_sat(x);
@@ -29601,7 +29601,7 @@ int16 convert_int16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(ulong16 x)
 {
   return convert_int16_sat(x);
@@ -29610,7 +29610,7 @@ int16 convert_int16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(ulong16 x)
 {
   return convert_int16_sat(x);
@@ -29619,7 +29619,7 @@ int16 convert_int16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(ulong x)
 {
   return convert_uint_sat(x);
@@ -29628,7 +29628,7 @@ uint convert_uint_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(ulong x)
 {
   return convert_uint_sat(x);
@@ -29637,7 +29637,7 @@ uint convert_uint_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(ulong x)
 {
   return convert_uint_sat(x);
@@ -29646,7 +29646,7 @@ uint convert_uint_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(ulong x)
 {
   return convert_uint_sat(x);
@@ -29655,7 +29655,7 @@ uint convert_uint_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(ulong2 x)
 {
   return convert_uint2_sat(x);
@@ -29664,7 +29664,7 @@ uint2 convert_uint2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(ulong2 x)
 {
   return convert_uint2_sat(x);
@@ -29673,7 +29673,7 @@ uint2 convert_uint2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(ulong2 x)
 {
   return convert_uint2_sat(x);
@@ -29682,7 +29682,7 @@ uint2 convert_uint2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(ulong2 x)
 {
   return convert_uint2_sat(x);
@@ -29691,7 +29691,7 @@ uint2 convert_uint2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(ulong3 x)
 {
   return convert_uint3_sat(x);
@@ -29700,7 +29700,7 @@ uint3 convert_uint3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(ulong3 x)
 {
   return convert_uint3_sat(x);
@@ -29709,7 +29709,7 @@ uint3 convert_uint3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(ulong3 x)
 {
   return convert_uint3_sat(x);
@@ -29718,7 +29718,7 @@ uint3 convert_uint3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(ulong3 x)
 {
   return convert_uint3_sat(x);
@@ -29727,7 +29727,7 @@ uint3 convert_uint3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(ulong4 x)
 {
   return convert_uint4_sat(x);
@@ -29736,7 +29736,7 @@ uint4 convert_uint4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(ulong4 x)
 {
   return convert_uint4_sat(x);
@@ -29745,7 +29745,7 @@ uint4 convert_uint4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(ulong4 x)
 {
   return convert_uint4_sat(x);
@@ -29754,7 +29754,7 @@ uint4 convert_uint4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(ulong4 x)
 {
   return convert_uint4_sat(x);
@@ -29763,7 +29763,7 @@ uint4 convert_uint4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(ulong8 x)
 {
   return convert_uint8_sat(x);
@@ -29772,7 +29772,7 @@ uint8 convert_uint8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(ulong8 x)
 {
   return convert_uint8_sat(x);
@@ -29781,7 +29781,7 @@ uint8 convert_uint8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(ulong8 x)
 {
   return convert_uint8_sat(x);
@@ -29790,7 +29790,7 @@ uint8 convert_uint8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(ulong8 x)
 {
   return convert_uint8_sat(x);
@@ -29799,7 +29799,7 @@ uint8 convert_uint8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(ulong16 x)
 {
   return convert_uint16_sat(x);
@@ -29808,7 +29808,7 @@ uint16 convert_uint16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(ulong16 x)
 {
   return convert_uint16_sat(x);
@@ -29817,7 +29817,7 @@ uint16 convert_uint16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(ulong16 x)
 {
   return convert_uint16_sat(x);
@@ -29826,7 +29826,7 @@ uint16 convert_uint16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(ulong16 x)
 {
   return convert_uint16_sat(x);
@@ -29835,7 +29835,7 @@ uint16 convert_uint16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(ulong x)
 {
   return convert_long_sat(x);
@@ -29844,7 +29844,7 @@ long convert_long_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(ulong x)
 {
   return convert_long_sat(x);
@@ -29853,7 +29853,7 @@ long convert_long_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(ulong x)
 {
   return convert_long_sat(x);
@@ -29862,7 +29862,7 @@ long convert_long_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(ulong x)
 {
   return convert_long_sat(x);
@@ -29871,7 +29871,7 @@ long convert_long_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(ulong2 x)
 {
   return convert_long2_sat(x);
@@ -29880,7 +29880,7 @@ long2 convert_long2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(ulong2 x)
 {
   return convert_long2_sat(x);
@@ -29889,7 +29889,7 @@ long2 convert_long2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(ulong2 x)
 {
   return convert_long2_sat(x);
@@ -29898,7 +29898,7 @@ long2 convert_long2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(ulong2 x)
 {
   return convert_long2_sat(x);
@@ -29907,7 +29907,7 @@ long2 convert_long2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(ulong3 x)
 {
   return convert_long3_sat(x);
@@ -29916,7 +29916,7 @@ long3 convert_long3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(ulong3 x)
 {
   return convert_long3_sat(x);
@@ -29925,7 +29925,7 @@ long3 convert_long3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(ulong3 x)
 {
   return convert_long3_sat(x);
@@ -29934,7 +29934,7 @@ long3 convert_long3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(ulong3 x)
 {
   return convert_long3_sat(x);
@@ -29943,7 +29943,7 @@ long3 convert_long3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(ulong4 x)
 {
   return convert_long4_sat(x);
@@ -29952,7 +29952,7 @@ long4 convert_long4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(ulong4 x)
 {
   return convert_long4_sat(x);
@@ -29961,7 +29961,7 @@ long4 convert_long4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(ulong4 x)
 {
   return convert_long4_sat(x);
@@ -29970,7 +29970,7 @@ long4 convert_long4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(ulong4 x)
 {
   return convert_long4_sat(x);
@@ -29979,7 +29979,7 @@ long4 convert_long4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(ulong8 x)
 {
   return convert_long8_sat(x);
@@ -29988,7 +29988,7 @@ long8 convert_long8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(ulong8 x)
 {
   return convert_long8_sat(x);
@@ -29997,7 +29997,7 @@ long8 convert_long8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(ulong8 x)
 {
   return convert_long8_sat(x);
@@ -30006,7 +30006,7 @@ long8 convert_long8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(ulong8 x)
 {
   return convert_long8_sat(x);
@@ -30015,7 +30015,7 @@ long8 convert_long8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(ulong16 x)
 {
   return convert_long16_sat(x);
@@ -30024,7 +30024,7 @@ long16 convert_long16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(ulong16 x)
 {
   return convert_long16_sat(x);
@@ -30033,7 +30033,7 @@ long16 convert_long16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(ulong16 x)
 {
   return convert_long16_sat(x);
@@ -30042,7 +30042,7 @@ long16 convert_long16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(ulong16 x)
 {
   return convert_long16_sat(x);
@@ -30051,7 +30051,7 @@ long16 convert_long16_sat_rtn(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(ulong x)
 {
   return convert_ulong_sat(x);
@@ -30060,7 +30060,7 @@ ulong convert_ulong_sat_rtz(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(ulong x)
 {
   return convert_ulong_sat(x);
@@ -30069,7 +30069,7 @@ ulong convert_ulong_sat_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(ulong x)
 {
   return convert_ulong_sat(x);
@@ -30078,7 +30078,7 @@ ulong convert_ulong_sat_rtp(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(ulong x)
 {
   return convert_ulong_sat(x);
@@ -30087,7 +30087,7 @@ ulong convert_ulong_sat_rtn(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(ulong2 x)
 {
   return convert_ulong2_sat(x);
@@ -30096,7 +30096,7 @@ ulong2 convert_ulong2_sat_rtz(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(ulong2 x)
 {
   return convert_ulong2_sat(x);
@@ -30105,7 +30105,7 @@ ulong2 convert_ulong2_sat_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(ulong2 x)
 {
   return convert_ulong2_sat(x);
@@ -30114,7 +30114,7 @@ ulong2 convert_ulong2_sat_rtp(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(ulong2 x)
 {
   return convert_ulong2_sat(x);
@@ -30123,7 +30123,7 @@ ulong2 convert_ulong2_sat_rtn(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(ulong3 x)
 {
   return convert_ulong3_sat(x);
@@ -30132,7 +30132,7 @@ ulong3 convert_ulong3_sat_rtz(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(ulong3 x)
 {
   return convert_ulong3_sat(x);
@@ -30141,7 +30141,7 @@ ulong3 convert_ulong3_sat_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(ulong3 x)
 {
   return convert_ulong3_sat(x);
@@ -30150,7 +30150,7 @@ ulong3 convert_ulong3_sat_rtp(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(ulong3 x)
 {
   return convert_ulong3_sat(x);
@@ -30159,7 +30159,7 @@ ulong3 convert_ulong3_sat_rtn(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(ulong4 x)
 {
   return convert_ulong4_sat(x);
@@ -30168,7 +30168,7 @@ ulong4 convert_ulong4_sat_rtz(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(ulong4 x)
 {
   return convert_ulong4_sat(x);
@@ -30177,7 +30177,7 @@ ulong4 convert_ulong4_sat_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(ulong4 x)
 {
   return convert_ulong4_sat(x);
@@ -30186,7 +30186,7 @@ ulong4 convert_ulong4_sat_rtp(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(ulong4 x)
 {
   return convert_ulong4_sat(x);
@@ -30195,7 +30195,7 @@ ulong4 convert_ulong4_sat_rtn(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(ulong8 x)
 {
   return convert_ulong8_sat(x);
@@ -30204,7 +30204,7 @@ ulong8 convert_ulong8_sat_rtz(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(ulong8 x)
 {
   return convert_ulong8_sat(x);
@@ -30213,7 +30213,7 @@ ulong8 convert_ulong8_sat_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(ulong8 x)
 {
   return convert_ulong8_sat(x);
@@ -30222,7 +30222,7 @@ ulong8 convert_ulong8_sat_rtp(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(ulong8 x)
 {
   return convert_ulong8_sat(x);
@@ -30231,7 +30231,7 @@ ulong8 convert_ulong8_sat_rtn(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(ulong16 x)
 {
   return convert_ulong16_sat(x);
@@ -30240,7 +30240,7 @@ ulong16 convert_ulong16_sat_rtz(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(ulong16 x)
 {
   return convert_ulong16_sat(x);
@@ -30249,7 +30249,7 @@ ulong16 convert_ulong16_sat_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(ulong16 x)
 {
   return convert_ulong16_sat(x);
@@ -30258,7 +30258,7 @@ ulong16 convert_ulong16_sat_rtp(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(ulong16 x)
 {
   return convert_ulong16_sat(x);
@@ -30266,1944 +30266,1944 @@ ulong16 convert_ulong16_sat_rtn(ulong16 x)
 
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(float x)
 {
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(float x)
 {
   return convert_char_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(float x)
 {
   x = rint(x);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(float x)
 {
   x = rint(x);
   return convert_char_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(float x)
 {
   x = ceil(x);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(float x)
 {
   x = ceil(x);
   return convert_char_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(float x)
 {
   x = floor(x);
   return convert_char(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(float x)
 {
   x = floor(x);
   return convert_char_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(float2 x)
 {
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(float2 x)
 {
   return convert_char2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(float2 x)
 {
   x = rint(x);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(float2 x)
 {
   x = rint(x);
   return convert_char2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(float2 x)
 {
   x = ceil(x);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(float2 x)
 {
   x = ceil(x);
   return convert_char2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(float2 x)
 {
   x = floor(x);
   return convert_char2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(float2 x)
 {
   x = floor(x);
   return convert_char2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(float3 x)
 {
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(float3 x)
 {
   return convert_char3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(float3 x)
 {
   x = rint(x);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(float3 x)
 {
   x = rint(x);
   return convert_char3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(float3 x)
 {
   x = ceil(x);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(float3 x)
 {
   x = ceil(x);
   return convert_char3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(float3 x)
 {
   x = floor(x);
   return convert_char3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(float3 x)
 {
   x = floor(x);
   return convert_char3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(float4 x)
 {
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(float4 x)
 {
   return convert_char4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(float4 x)
 {
   x = rint(x);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(float4 x)
 {
   x = rint(x);
   return convert_char4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(float4 x)
 {
   x = ceil(x);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(float4 x)
 {
   x = ceil(x);
   return convert_char4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(float4 x)
 {
   x = floor(x);
   return convert_char4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(float4 x)
 {
   x = floor(x);
   return convert_char4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(float8 x)
 {
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(float8 x)
 {
   return convert_char8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(float8 x)
 {
   x = rint(x);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(float8 x)
 {
   x = rint(x);
   return convert_char8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(float8 x)
 {
   x = ceil(x);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(float8 x)
 {
   x = ceil(x);
   return convert_char8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(float8 x)
 {
   x = floor(x);
   return convert_char8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(float8 x)
 {
   x = floor(x);
   return convert_char8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(float16 x)
 {
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(float16 x)
 {
   return convert_char16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(float16 x)
 {
   x = rint(x);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(float16 x)
 {
   x = rint(x);
   return convert_char16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(float16 x)
 {
   x = ceil(x);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(float16 x)
 {
   x = ceil(x);
   return convert_char16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(float16 x)
 {
   x = floor(x);
   return convert_char16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(float16 x)
 {
   x = floor(x);
   return convert_char16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(float x)
 {
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(float x)
 {
   return convert_uchar_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(float x)
 {
   x = rint(x);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(float x)
 {
   x = rint(x);
   return convert_uchar_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(float x)
 {
   x = ceil(x);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(float x)
 {
   x = ceil(x);
   return convert_uchar_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(float x)
 {
   x = floor(x);
   return convert_uchar(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(float x)
 {
   x = floor(x);
   return convert_uchar_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(float2 x)
 {
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(float2 x)
 {
   return convert_uchar2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(float2 x)
 {
   x = rint(x);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(float2 x)
 {
   x = rint(x);
   return convert_uchar2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(float2 x)
 {
   x = ceil(x);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(float2 x)
 {
   x = ceil(x);
   return convert_uchar2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(float2 x)
 {
   x = floor(x);
   return convert_uchar2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(float2 x)
 {
   x = floor(x);
   return convert_uchar2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(float3 x)
 {
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(float3 x)
 {
   return convert_uchar3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(float3 x)
 {
   x = rint(x);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(float3 x)
 {
   x = rint(x);
   return convert_uchar3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(float3 x)
 {
   x = ceil(x);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(float3 x)
 {
   x = ceil(x);
   return convert_uchar3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(float3 x)
 {
   x = floor(x);
   return convert_uchar3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(float3 x)
 {
   x = floor(x);
   return convert_uchar3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(float4 x)
 {
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(float4 x)
 {
   return convert_uchar4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(float4 x)
 {
   x = rint(x);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(float4 x)
 {
   x = rint(x);
   return convert_uchar4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(float4 x)
 {
   x = ceil(x);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(float4 x)
 {
   x = ceil(x);
   return convert_uchar4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(float4 x)
 {
   x = floor(x);
   return convert_uchar4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(float4 x)
 {
   x = floor(x);
   return convert_uchar4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(float8 x)
 {
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(float8 x)
 {
   return convert_uchar8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(float8 x)
 {
   x = rint(x);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(float8 x)
 {
   x = rint(x);
   return convert_uchar8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(float8 x)
 {
   x = ceil(x);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(float8 x)
 {
   x = ceil(x);
   return convert_uchar8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(float8 x)
 {
   x = floor(x);
   return convert_uchar8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(float8 x)
 {
   x = floor(x);
   return convert_uchar8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(float16 x)
 {
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(float16 x)
 {
   return convert_uchar16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(float16 x)
 {
   x = rint(x);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(float16 x)
 {
   x = rint(x);
   return convert_uchar16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(float16 x)
 {
   x = ceil(x);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(float16 x)
 {
   x = ceil(x);
   return convert_uchar16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(float16 x)
 {
   x = floor(x);
   return convert_uchar16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(float16 x)
 {
   x = floor(x);
   return convert_uchar16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(float x)
 {
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(float x)
 {
   return convert_short_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(float x)
 {
   x = rint(x);
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(float x)
 {
   x = rint(x);
   return convert_short_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(float x)
 {
   x = ceil(x);
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(float x)
 {
   x = ceil(x);
   return convert_short_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(float x)
 {
   x = floor(x);
   return convert_short(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(float x)
 {
   x = floor(x);
   return convert_short_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(float2 x)
 {
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(float2 x)
 {
   return convert_short2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(float2 x)
 {
   x = rint(x);
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(float2 x)
 {
   x = rint(x);
   return convert_short2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(float2 x)
 {
   x = ceil(x);
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(float2 x)
 {
   x = ceil(x);
   return convert_short2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(float2 x)
 {
   x = floor(x);
   return convert_short2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(float2 x)
 {
   x = floor(x);
   return convert_short2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(float3 x)
 {
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(float3 x)
 {
   return convert_short3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(float3 x)
 {
   x = rint(x);
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(float3 x)
 {
   x = rint(x);
   return convert_short3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(float3 x)
 {
   x = ceil(x);
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(float3 x)
 {
   x = ceil(x);
   return convert_short3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(float3 x)
 {
   x = floor(x);
   return convert_short3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(float3 x)
 {
   x = floor(x);
   return convert_short3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(float4 x)
 {
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(float4 x)
 {
   return convert_short4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(float4 x)
 {
   x = rint(x);
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(float4 x)
 {
   x = rint(x);
   return convert_short4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(float4 x)
 {
   x = ceil(x);
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(float4 x)
 {
   x = ceil(x);
   return convert_short4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(float4 x)
 {
   x = floor(x);
   return convert_short4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(float4 x)
 {
   x = floor(x);
   return convert_short4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(float8 x)
 {
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(float8 x)
 {
   return convert_short8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(float8 x)
 {
   x = rint(x);
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(float8 x)
 {
   x = rint(x);
   return convert_short8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(float8 x)
 {
   x = ceil(x);
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(float8 x)
 {
   x = ceil(x);
   return convert_short8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(float8 x)
 {
   x = floor(x);
   return convert_short8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(float8 x)
 {
   x = floor(x);
   return convert_short8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(float16 x)
 {
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(float16 x)
 {
   return convert_short16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(float16 x)
 {
   x = rint(x);
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(float16 x)
 {
   x = rint(x);
   return convert_short16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(float16 x)
 {
   x = ceil(x);
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(float16 x)
 {
   x = ceil(x);
   return convert_short16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(float16 x)
 {
   x = floor(x);
   return convert_short16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(float16 x)
 {
   x = floor(x);
   return convert_short16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(float x)
 {
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(float x)
 {
   return convert_ushort_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(float x)
 {
   x = rint(x);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(float x)
 {
   x = rint(x);
   return convert_ushort_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(float x)
 {
   x = ceil(x);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(float x)
 {
   x = ceil(x);
   return convert_ushort_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(float x)
 {
   x = floor(x);
   return convert_ushort(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(float x)
 {
   x = floor(x);
   return convert_ushort_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(float2 x)
 {
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(float2 x)
 {
   return convert_ushort2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(float2 x)
 {
   x = rint(x);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(float2 x)
 {
   x = rint(x);
   return convert_ushort2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(float2 x)
 {
   x = ceil(x);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(float2 x)
 {
   x = ceil(x);
   return convert_ushort2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(float2 x)
 {
   x = floor(x);
   return convert_ushort2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(float2 x)
 {
   x = floor(x);
   return convert_ushort2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(float3 x)
 {
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(float3 x)
 {
   return convert_ushort3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(float3 x)
 {
   x = rint(x);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(float3 x)
 {
   x = rint(x);
   return convert_ushort3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(float3 x)
 {
   x = ceil(x);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(float3 x)
 {
   x = ceil(x);
   return convert_ushort3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(float3 x)
 {
   x = floor(x);
   return convert_ushort3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(float3 x)
 {
   x = floor(x);
   return convert_ushort3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(float4 x)
 {
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(float4 x)
 {
   return convert_ushort4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(float4 x)
 {
   x = rint(x);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(float4 x)
 {
   x = rint(x);
   return convert_ushort4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(float4 x)
 {
   x = ceil(x);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(float4 x)
 {
   x = ceil(x);
   return convert_ushort4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(float4 x)
 {
   x = floor(x);
   return convert_ushort4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(float4 x)
 {
   x = floor(x);
   return convert_ushort4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(float8 x)
 {
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(float8 x)
 {
   return convert_ushort8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(float8 x)
 {
   x = rint(x);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(float8 x)
 {
   x = rint(x);
   return convert_ushort8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(float8 x)
 {
   x = ceil(x);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(float8 x)
 {
   x = ceil(x);
   return convert_ushort8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(float8 x)
 {
   x = floor(x);
   return convert_ushort8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(float8 x)
 {
   x = floor(x);
   return convert_ushort8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(float16 x)
 {
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(float16 x)
 {
   return convert_ushort16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(float16 x)
 {
   x = rint(x);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(float16 x)
 {
   x = rint(x);
   return convert_ushort16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(float16 x)
 {
   x = ceil(x);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(float16 x)
 {
   x = ceil(x);
   return convert_ushort16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(float16 x)
 {
   x = floor(x);
   return convert_ushort16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(float16 x)
 {
   x = floor(x);
   return convert_ushort16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(float x)
 {
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(float x)
 {
   return convert_int_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(float x)
 {
   x = rint(x);
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(float x)
 {
   x = rint(x);
   return convert_int_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(float x)
 {
   x = ceil(x);
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(float x)
 {
   x = ceil(x);
   return convert_int_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(float x)
 {
   x = floor(x);
   return convert_int(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(float x)
 {
   x = floor(x);
   return convert_int_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(float2 x)
 {
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(float2 x)
 {
   return convert_int2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(float2 x)
 {
   x = rint(x);
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(float2 x)
 {
   x = rint(x);
   return convert_int2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(float2 x)
 {
   x = ceil(x);
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(float2 x)
 {
   x = ceil(x);
   return convert_int2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(float2 x)
 {
   x = floor(x);
   return convert_int2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(float2 x)
 {
   x = floor(x);
   return convert_int2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(float3 x)
 {
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(float3 x)
 {
   return convert_int3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(float3 x)
 {
   x = rint(x);
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(float3 x)
 {
   x = rint(x);
   return convert_int3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(float3 x)
 {
   x = ceil(x);
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(float3 x)
 {
   x = ceil(x);
   return convert_int3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(float3 x)
 {
   x = floor(x);
   return convert_int3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(float3 x)
 {
   x = floor(x);
   return convert_int3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(float4 x)
 {
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(float4 x)
 {
   return convert_int4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(float4 x)
 {
   x = rint(x);
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(float4 x)
 {
   x = rint(x);
   return convert_int4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(float4 x)
 {
   x = ceil(x);
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(float4 x)
 {
   x = ceil(x);
   return convert_int4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(float4 x)
 {
   x = floor(x);
   return convert_int4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(float4 x)
 {
   x = floor(x);
   return convert_int4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(float8 x)
 {
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(float8 x)
 {
   return convert_int8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(float8 x)
 {
   x = rint(x);
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(float8 x)
 {
   x = rint(x);
   return convert_int8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(float8 x)
 {
   x = ceil(x);
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(float8 x)
 {
   x = ceil(x);
   return convert_int8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(float8 x)
 {
   x = floor(x);
   return convert_int8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(float8 x)
 {
   x = floor(x);
   return convert_int8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(float16 x)
 {
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(float16 x)
 {
   return convert_int16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(float16 x)
 {
   x = rint(x);
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(float16 x)
 {
   x = rint(x);
   return convert_int16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(float16 x)
 {
   x = ceil(x);
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(float16 x)
 {
   x = ceil(x);
   return convert_int16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(float16 x)
 {
   x = floor(x);
   return convert_int16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(float16 x)
 {
   x = floor(x);
   return convert_int16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(float x)
 {
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(float x)
 {
   return convert_uint_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(float x)
 {
   x = rint(x);
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(float x)
 {
   x = rint(x);
   return convert_uint_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(float x)
 {
   x = ceil(x);
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(float x)
 {
   x = ceil(x);
   return convert_uint_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(float x)
 {
   x = floor(x);
   return convert_uint(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(float x)
 {
   x = floor(x);
   return convert_uint_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(float2 x)
 {
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(float2 x)
 {
   return convert_uint2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(float2 x)
 {
   x = rint(x);
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(float2 x)
 {
   x = rint(x);
   return convert_uint2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(float2 x)
 {
   x = ceil(x);
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(float2 x)
 {
   x = ceil(x);
   return convert_uint2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(float2 x)
 {
   x = floor(x);
   return convert_uint2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(float2 x)
 {
   x = floor(x);
   return convert_uint2_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(float3 x)
 {
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(float3 x)
 {
   return convert_uint3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(float3 x)
 {
   x = rint(x);
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(float3 x)
 {
   x = rint(x);
   return convert_uint3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(float3 x)
 {
   x = ceil(x);
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(float3 x)
 {
   x = ceil(x);
   return convert_uint3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(float3 x)
 {
   x = floor(x);
   return convert_uint3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(float3 x)
 {
   x = floor(x);
   return convert_uint3_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(float4 x)
 {
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(float4 x)
 {
   return convert_uint4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(float4 x)
 {
   x = rint(x);
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(float4 x)
 {
   x = rint(x);
   return convert_uint4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(float4 x)
 {
   x = ceil(x);
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(float4 x)
 {
   x = ceil(x);
   return convert_uint4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(float4 x)
 {
   x = floor(x);
   return convert_uint4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(float4 x)
 {
   x = floor(x);
   return convert_uint4_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(float8 x)
 {
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(float8 x)
 {
   return convert_uint8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(float8 x)
 {
   x = rint(x);
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(float8 x)
 {
   x = rint(x);
   return convert_uint8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(float8 x)
 {
   x = ceil(x);
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(float8 x)
 {
   x = ceil(x);
   return convert_uint8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(float8 x)
 {
   x = floor(x);
   return convert_uint8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(float8 x)
 {
   x = floor(x);
   return convert_uint8_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(float16 x)
 {
   return convert_uint16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(float16 x)
 {
   return convert_uint16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(float16 x)
 {
   x = rint(x);
   return convert_uint16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(float16 x)
 {
   x = rint(x);
   return convert_uint16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(float16 x)
 {
   x = ceil(x);
   return convert_uint16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(float16 x)
 {
   x = ceil(x);
   return convert_uint16_sat(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(float16 x)
 {
   x = floor(x);
   return convert_uint16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(float16 x)
 {
   x = floor(x);
@@ -32211,7 +32211,7 @@ uint16 convert_uint16_sat_rtn(float16 x)
 }
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(float x)
 {
   return convert_long(x);
@@ -32219,7 +32219,7 @@ long convert_long_rtz(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(float x)
 {
   return convert_long_sat(x);
@@ -32227,7 +32227,7 @@ long convert_long_sat_rtz(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(float x)
 {
   x = rint(x);
@@ -32236,7 +32236,7 @@ long convert_long_rte(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(float x)
 {
   x = rint(x);
@@ -32245,7 +32245,7 @@ long convert_long_sat_rte(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(float x)
 {
   x = ceil(x);
@@ -32254,7 +32254,7 @@ long convert_long_rtp(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(float x)
 {
   x = ceil(x);
@@ -32263,7 +32263,7 @@ long convert_long_sat_rtp(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(float x)
 {
   x = floor(x);
@@ -32272,7 +32272,7 @@ long convert_long_rtn(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(float x)
 {
   x = floor(x);
@@ -32281,7 +32281,7 @@ long convert_long_sat_rtn(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(float2 x)
 {
   return convert_long2(x);
@@ -32289,7 +32289,7 @@ long2 convert_long2_rtz(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(float2 x)
 {
   return convert_long2_sat(x);
@@ -32297,7 +32297,7 @@ long2 convert_long2_sat_rtz(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(float2 x)
 {
   x = rint(x);
@@ -32306,7 +32306,7 @@ long2 convert_long2_rte(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(float2 x)
 {
   x = rint(x);
@@ -32315,7 +32315,7 @@ long2 convert_long2_sat_rte(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(float2 x)
 {
   x = ceil(x);
@@ -32324,7 +32324,7 @@ long2 convert_long2_rtp(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(float2 x)
 {
   x = ceil(x);
@@ -32333,7 +32333,7 @@ long2 convert_long2_sat_rtp(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(float2 x)
 {
   x = floor(x);
@@ -32342,7 +32342,7 @@ long2 convert_long2_rtn(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(float2 x)
 {
   x = floor(x);
@@ -32351,7 +32351,7 @@ long2 convert_long2_sat_rtn(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(float3 x)
 {
   return convert_long3(x);
@@ -32359,7 +32359,7 @@ long3 convert_long3_rtz(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(float3 x)
 {
   return convert_long3_sat(x);
@@ -32367,7 +32367,7 @@ long3 convert_long3_sat_rtz(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(float3 x)
 {
   x = rint(x);
@@ -32376,7 +32376,7 @@ long3 convert_long3_rte(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(float3 x)
 {
   x = rint(x);
@@ -32385,7 +32385,7 @@ long3 convert_long3_sat_rte(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(float3 x)
 {
   x = ceil(x);
@@ -32394,7 +32394,7 @@ long3 convert_long3_rtp(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(float3 x)
 {
   x = ceil(x);
@@ -32403,7 +32403,7 @@ long3 convert_long3_sat_rtp(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(float3 x)
 {
   x = floor(x);
@@ -32412,7 +32412,7 @@ long3 convert_long3_rtn(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(float3 x)
 {
   x = floor(x);
@@ -32421,7 +32421,7 @@ long3 convert_long3_sat_rtn(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(float4 x)
 {
   return convert_long4(x);
@@ -32429,7 +32429,7 @@ long4 convert_long4_rtz(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(float4 x)
 {
   return convert_long4_sat(x);
@@ -32437,7 +32437,7 @@ long4 convert_long4_sat_rtz(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(float4 x)
 {
   x = rint(x);
@@ -32446,7 +32446,7 @@ long4 convert_long4_rte(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(float4 x)
 {
   x = rint(x);
@@ -32455,7 +32455,7 @@ long4 convert_long4_sat_rte(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(float4 x)
 {
   x = ceil(x);
@@ -32464,7 +32464,7 @@ long4 convert_long4_rtp(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(float4 x)
 {
   x = ceil(x);
@@ -32473,7 +32473,7 @@ long4 convert_long4_sat_rtp(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(float4 x)
 {
   x = floor(x);
@@ -32482,7 +32482,7 @@ long4 convert_long4_rtn(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(float4 x)
 {
   x = floor(x);
@@ -32491,7 +32491,7 @@ long4 convert_long4_sat_rtn(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(float8 x)
 {
   return convert_long8(x);
@@ -32499,7 +32499,7 @@ long8 convert_long8_rtz(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(float8 x)
 {
   return convert_long8_sat(x);
@@ -32507,7 +32507,7 @@ long8 convert_long8_sat_rtz(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(float8 x)
 {
   x = rint(x);
@@ -32516,7 +32516,7 @@ long8 convert_long8_rte(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(float8 x)
 {
   x = rint(x);
@@ -32525,7 +32525,7 @@ long8 convert_long8_sat_rte(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(float8 x)
 {
   x = ceil(x);
@@ -32534,7 +32534,7 @@ long8 convert_long8_rtp(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(float8 x)
 {
   x = ceil(x);
@@ -32543,7 +32543,7 @@ long8 convert_long8_sat_rtp(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(float8 x)
 {
   x = floor(x);
@@ -32552,7 +32552,7 @@ long8 convert_long8_rtn(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(float8 x)
 {
   x = floor(x);
@@ -32561,7 +32561,7 @@ long8 convert_long8_sat_rtn(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(float16 x)
 {
   return convert_long16(x);
@@ -32569,7 +32569,7 @@ long16 convert_long16_rtz(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(float16 x)
 {
   return convert_long16_sat(x);
@@ -32577,7 +32577,7 @@ long16 convert_long16_sat_rtz(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(float16 x)
 {
   x = rint(x);
@@ -32586,7 +32586,7 @@ long16 convert_long16_rte(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(float16 x)
 {
   x = rint(x);
@@ -32595,7 +32595,7 @@ long16 convert_long16_sat_rte(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(float16 x)
 {
   x = ceil(x);
@@ -32604,7 +32604,7 @@ long16 convert_long16_rtp(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(float16 x)
 {
   x = ceil(x);
@@ -32613,7 +32613,7 @@ long16 convert_long16_sat_rtp(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(float16 x)
 {
   x = floor(x);
@@ -32622,7 +32622,7 @@ long16 convert_long16_rtn(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(float16 x)
 {
   x = floor(x);
@@ -32631,7 +32631,7 @@ long16 convert_long16_sat_rtn(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(float x)
 {
   return convert_ulong(x);
@@ -32639,7 +32639,7 @@ ulong convert_ulong_rtz(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(float x)
 {
   return convert_ulong_sat(x);
@@ -32647,7 +32647,7 @@ ulong convert_ulong_sat_rtz(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(float x)
 {
   x = rint(x);
@@ -32656,7 +32656,7 @@ ulong convert_ulong_rte(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(float x)
 {
   x = rint(x);
@@ -32665,7 +32665,7 @@ ulong convert_ulong_sat_rte(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(float x)
 {
   x = ceil(x);
@@ -32674,7 +32674,7 @@ ulong convert_ulong_rtp(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(float x)
 {
   x = ceil(x);
@@ -32683,7 +32683,7 @@ ulong convert_ulong_sat_rtp(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(float x)
 {
   x = floor(x);
@@ -32692,7 +32692,7 @@ ulong convert_ulong_rtn(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(float x)
 {
   x = floor(x);
@@ -32701,7 +32701,7 @@ ulong convert_ulong_sat_rtn(float x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(float2 x)
 {
   return convert_ulong2(x);
@@ -32709,7 +32709,7 @@ ulong2 convert_ulong2_rtz(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(float2 x)
 {
   return convert_ulong2_sat(x);
@@ -32717,7 +32717,7 @@ ulong2 convert_ulong2_sat_rtz(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(float2 x)
 {
   x = rint(x);
@@ -32726,7 +32726,7 @@ ulong2 convert_ulong2_rte(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(float2 x)
 {
   x = rint(x);
@@ -32735,7 +32735,7 @@ ulong2 convert_ulong2_sat_rte(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(float2 x)
 {
   x = ceil(x);
@@ -32744,7 +32744,7 @@ ulong2 convert_ulong2_rtp(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(float2 x)
 {
   x = ceil(x);
@@ -32753,7 +32753,7 @@ ulong2 convert_ulong2_sat_rtp(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(float2 x)
 {
   x = floor(x);
@@ -32762,7 +32762,7 @@ ulong2 convert_ulong2_rtn(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(float2 x)
 {
   x = floor(x);
@@ -32771,7 +32771,7 @@ ulong2 convert_ulong2_sat_rtn(float2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(float3 x)
 {
   return convert_ulong3(x);
@@ -32779,7 +32779,7 @@ ulong3 convert_ulong3_rtz(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(float3 x)
 {
   return convert_ulong3_sat(x);
@@ -32787,7 +32787,7 @@ ulong3 convert_ulong3_sat_rtz(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(float3 x)
 {
   x = rint(x);
@@ -32796,7 +32796,7 @@ ulong3 convert_ulong3_rte(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(float3 x)
 {
   x = rint(x);
@@ -32805,7 +32805,7 @@ ulong3 convert_ulong3_sat_rte(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(float3 x)
 {
   x = ceil(x);
@@ -32814,7 +32814,7 @@ ulong3 convert_ulong3_rtp(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(float3 x)
 {
   x = ceil(x);
@@ -32823,7 +32823,7 @@ ulong3 convert_ulong3_sat_rtp(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(float3 x)
 {
   x = floor(x);
@@ -32832,7 +32832,7 @@ ulong3 convert_ulong3_rtn(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(float3 x)
 {
   x = floor(x);
@@ -32841,7 +32841,7 @@ ulong3 convert_ulong3_sat_rtn(float3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(float4 x)
 {
   return convert_ulong4(x);
@@ -32849,7 +32849,7 @@ ulong4 convert_ulong4_rtz(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(float4 x)
 {
   return convert_ulong4_sat(x);
@@ -32857,7 +32857,7 @@ ulong4 convert_ulong4_sat_rtz(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(float4 x)
 {
   x = rint(x);
@@ -32866,7 +32866,7 @@ ulong4 convert_ulong4_rte(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(float4 x)
 {
   x = rint(x);
@@ -32875,7 +32875,7 @@ ulong4 convert_ulong4_sat_rte(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(float4 x)
 {
   x = ceil(x);
@@ -32884,7 +32884,7 @@ ulong4 convert_ulong4_rtp(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(float4 x)
 {
   x = ceil(x);
@@ -32893,7 +32893,7 @@ ulong4 convert_ulong4_sat_rtp(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(float4 x)
 {
   x = floor(x);
@@ -32902,7 +32902,7 @@ ulong4 convert_ulong4_rtn(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(float4 x)
 {
   x = floor(x);
@@ -32911,7 +32911,7 @@ ulong4 convert_ulong4_sat_rtn(float4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(float8 x)
 {
   return convert_ulong8(x);
@@ -32919,7 +32919,7 @@ ulong8 convert_ulong8_rtz(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(float8 x)
 {
   return convert_ulong8_sat(x);
@@ -32927,7 +32927,7 @@ ulong8 convert_ulong8_sat_rtz(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(float8 x)
 {
   x = rint(x);
@@ -32936,7 +32936,7 @@ ulong8 convert_ulong8_rte(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(float8 x)
 {
   x = rint(x);
@@ -32945,7 +32945,7 @@ ulong8 convert_ulong8_sat_rte(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(float8 x)
 {
   x = ceil(x);
@@ -32954,7 +32954,7 @@ ulong8 convert_ulong8_rtp(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(float8 x)
 {
   x = ceil(x);
@@ -32963,7 +32963,7 @@ ulong8 convert_ulong8_sat_rtp(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(float8 x)
 {
   x = floor(x);
@@ -32972,7 +32972,7 @@ ulong8 convert_ulong8_rtn(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(float8 x)
 {
   x = floor(x);
@@ -32981,7 +32981,7 @@ ulong8 convert_ulong8_sat_rtn(float8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(float16 x)
 {
   return convert_ulong16(x);
@@ -32989,7 +32989,7 @@ ulong16 convert_ulong16_rtz(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(float16 x)
 {
   return convert_ulong16_sat(x);
@@ -32997,7 +32997,7 @@ ulong16 convert_ulong16_sat_rtz(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(float16 x)
 {
   x = rint(x);
@@ -33006,7 +33006,7 @@ ulong16 convert_ulong16_rte(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(float16 x)
 {
   x = rint(x);
@@ -33015,7 +33015,7 @@ ulong16 convert_ulong16_sat_rte(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(float16 x)
 {
   x = ceil(x);
@@ -33024,7 +33024,7 @@ ulong16 convert_ulong16_rtp(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(float16 x)
 {
   x = ceil(x);
@@ -33033,7 +33033,7 @@ ulong16 convert_ulong16_sat_rtp(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(float16 x)
 {
   x = floor(x);
@@ -33042,7 +33042,7 @@ ulong16 convert_ulong16_rtn(float16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(float16 x)
 {
   x = floor(x);
@@ -33051,7 +33051,7 @@ ulong16 convert_ulong16_sat_rtn(float16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtz(double x)
 {
   return convert_char(x);
@@ -33059,7 +33059,7 @@ char convert_char_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtz(double x)
 {
   return convert_char_sat(x);
@@ -33067,7 +33067,7 @@ char convert_char_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rte(double x)
 {
   x = rint(x);
@@ -33076,7 +33076,7 @@ char convert_char_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rte(double x)
 {
   x = rint(x);
@@ -33085,7 +33085,7 @@ char convert_char_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtp(double x)
 {
   x = ceil(x);
@@ -33094,7 +33094,7 @@ char convert_char_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtp(double x)
 {
   x = ceil(x);
@@ -33103,7 +33103,7 @@ char convert_char_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_rtn(double x)
 {
   x = floor(x);
@@ -33112,7 +33112,7 @@ char convert_char_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char convert_char_sat_rtn(double x)
 {
   x = floor(x);
@@ -33121,7 +33121,7 @@ char convert_char_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtz(double2 x)
 {
   return convert_char2(x);
@@ -33129,7 +33129,7 @@ char2 convert_char2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtz(double2 x)
 {
   return convert_char2_sat(x);
@@ -33137,7 +33137,7 @@ char2 convert_char2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rte(double2 x)
 {
   x = rint(x);
@@ -33146,7 +33146,7 @@ char2 convert_char2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -33155,7 +33155,7 @@ char2 convert_char2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtp(double2 x)
 {
   x = ceil(x);
@@ -33164,7 +33164,7 @@ char2 convert_char2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -33173,7 +33173,7 @@ char2 convert_char2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_rtn(double2 x)
 {
   x = floor(x);
@@ -33182,7 +33182,7 @@ char2 convert_char2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char2 convert_char2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -33191,7 +33191,7 @@ char2 convert_char2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtz(double3 x)
 {
   return convert_char3(x);
@@ -33199,7 +33199,7 @@ char3 convert_char3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtz(double3 x)
 {
   return convert_char3_sat(x);
@@ -33207,7 +33207,7 @@ char3 convert_char3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rte(double3 x)
 {
   x = rint(x);
@@ -33216,7 +33216,7 @@ char3 convert_char3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -33225,7 +33225,7 @@ char3 convert_char3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtp(double3 x)
 {
   x = ceil(x);
@@ -33234,7 +33234,7 @@ char3 convert_char3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -33243,7 +33243,7 @@ char3 convert_char3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_rtn(double3 x)
 {
   x = floor(x);
@@ -33252,7 +33252,7 @@ char3 convert_char3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char3 convert_char3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -33261,7 +33261,7 @@ char3 convert_char3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtz(double4 x)
 {
   return convert_char4(x);
@@ -33269,7 +33269,7 @@ char4 convert_char4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtz(double4 x)
 {
   return convert_char4_sat(x);
@@ -33277,7 +33277,7 @@ char4 convert_char4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rte(double4 x)
 {
   x = rint(x);
@@ -33286,7 +33286,7 @@ char4 convert_char4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -33295,7 +33295,7 @@ char4 convert_char4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtp(double4 x)
 {
   x = ceil(x);
@@ -33304,7 +33304,7 @@ char4 convert_char4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -33313,7 +33313,7 @@ char4 convert_char4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_rtn(double4 x)
 {
   x = floor(x);
@@ -33322,7 +33322,7 @@ char4 convert_char4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char4 convert_char4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -33331,7 +33331,7 @@ char4 convert_char4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtz(double8 x)
 {
   return convert_char8(x);
@@ -33339,7 +33339,7 @@ char8 convert_char8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtz(double8 x)
 {
   return convert_char8_sat(x);
@@ -33347,7 +33347,7 @@ char8 convert_char8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rte(double8 x)
 {
   x = rint(x);
@@ -33356,7 +33356,7 @@ char8 convert_char8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -33365,7 +33365,7 @@ char8 convert_char8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtp(double8 x)
 {
   x = ceil(x);
@@ -33374,7 +33374,7 @@ char8 convert_char8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -33383,7 +33383,7 @@ char8 convert_char8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_rtn(double8 x)
 {
   x = floor(x);
@@ -33392,7 +33392,7 @@ char8 convert_char8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char8 convert_char8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -33401,7 +33401,7 @@ char8 convert_char8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtz(double16 x)
 {
   return convert_char16(x);
@@ -33409,7 +33409,7 @@ char16 convert_char16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtz(double16 x)
 {
   return convert_char16_sat(x);
@@ -33417,7 +33417,7 @@ char16 convert_char16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rte(double16 x)
 {
   x = rint(x);
@@ -33426,7 +33426,7 @@ char16 convert_char16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -33435,7 +33435,7 @@ char16 convert_char16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtp(double16 x)
 {
   x = ceil(x);
@@ -33444,7 +33444,7 @@ char16 convert_char16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -33453,7 +33453,7 @@ char16 convert_char16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_rtn(double16 x)
 {
   x = floor(x);
@@ -33462,7 +33462,7 @@ char16 convert_char16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 char16 convert_char16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -33471,7 +33471,7 @@ char16 convert_char16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtz(double x)
 {
   return convert_uchar(x);
@@ -33479,7 +33479,7 @@ uchar convert_uchar_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtz(double x)
 {
   return convert_uchar_sat(x);
@@ -33487,7 +33487,7 @@ uchar convert_uchar_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rte(double x)
 {
   x = rint(x);
@@ -33496,7 +33496,7 @@ uchar convert_uchar_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rte(double x)
 {
   x = rint(x);
@@ -33505,7 +33505,7 @@ uchar convert_uchar_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtp(double x)
 {
   x = ceil(x);
@@ -33514,7 +33514,7 @@ uchar convert_uchar_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtp(double x)
 {
   x = ceil(x);
@@ -33523,7 +33523,7 @@ uchar convert_uchar_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_rtn(double x)
 {
   x = floor(x);
@@ -33532,7 +33532,7 @@ uchar convert_uchar_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar convert_uchar_sat_rtn(double x)
 {
   x = floor(x);
@@ -33541,7 +33541,7 @@ uchar convert_uchar_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtz(double2 x)
 {
   return convert_uchar2(x);
@@ -33549,7 +33549,7 @@ uchar2 convert_uchar2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtz(double2 x)
 {
   return convert_uchar2_sat(x);
@@ -33557,7 +33557,7 @@ uchar2 convert_uchar2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rte(double2 x)
 {
   x = rint(x);
@@ -33566,7 +33566,7 @@ uchar2 convert_uchar2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -33575,7 +33575,7 @@ uchar2 convert_uchar2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtp(double2 x)
 {
   x = ceil(x);
@@ -33584,7 +33584,7 @@ uchar2 convert_uchar2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -33593,7 +33593,7 @@ uchar2 convert_uchar2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_rtn(double2 x)
 {
   x = floor(x);
@@ -33602,7 +33602,7 @@ uchar2 convert_uchar2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar2 convert_uchar2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -33611,7 +33611,7 @@ uchar2 convert_uchar2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtz(double3 x)
 {
   return convert_uchar3(x);
@@ -33619,7 +33619,7 @@ uchar3 convert_uchar3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtz(double3 x)
 {
   return convert_uchar3_sat(x);
@@ -33627,7 +33627,7 @@ uchar3 convert_uchar3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rte(double3 x)
 {
   x = rint(x);
@@ -33636,7 +33636,7 @@ uchar3 convert_uchar3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -33645,7 +33645,7 @@ uchar3 convert_uchar3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtp(double3 x)
 {
   x = ceil(x);
@@ -33654,7 +33654,7 @@ uchar3 convert_uchar3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -33663,7 +33663,7 @@ uchar3 convert_uchar3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_rtn(double3 x)
 {
   x = floor(x);
@@ -33672,7 +33672,7 @@ uchar3 convert_uchar3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar3 convert_uchar3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -33681,7 +33681,7 @@ uchar3 convert_uchar3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtz(double4 x)
 {
   return convert_uchar4(x);
@@ -33689,7 +33689,7 @@ uchar4 convert_uchar4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtz(double4 x)
 {
   return convert_uchar4_sat(x);
@@ -33697,7 +33697,7 @@ uchar4 convert_uchar4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rte(double4 x)
 {
   x = rint(x);
@@ -33706,7 +33706,7 @@ uchar4 convert_uchar4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -33715,7 +33715,7 @@ uchar4 convert_uchar4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtp(double4 x)
 {
   x = ceil(x);
@@ -33724,7 +33724,7 @@ uchar4 convert_uchar4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -33733,7 +33733,7 @@ uchar4 convert_uchar4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_rtn(double4 x)
 {
   x = floor(x);
@@ -33742,7 +33742,7 @@ uchar4 convert_uchar4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar4 convert_uchar4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -33751,7 +33751,7 @@ uchar4 convert_uchar4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtz(double8 x)
 {
   return convert_uchar8(x);
@@ -33759,7 +33759,7 @@ uchar8 convert_uchar8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtz(double8 x)
 {
   return convert_uchar8_sat(x);
@@ -33767,7 +33767,7 @@ uchar8 convert_uchar8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rte(double8 x)
 {
   x = rint(x);
@@ -33776,7 +33776,7 @@ uchar8 convert_uchar8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -33785,7 +33785,7 @@ uchar8 convert_uchar8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtp(double8 x)
 {
   x = ceil(x);
@@ -33794,7 +33794,7 @@ uchar8 convert_uchar8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -33803,7 +33803,7 @@ uchar8 convert_uchar8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_rtn(double8 x)
 {
   x = floor(x);
@@ -33812,7 +33812,7 @@ uchar8 convert_uchar8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar8 convert_uchar8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -33821,7 +33821,7 @@ uchar8 convert_uchar8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtz(double16 x)
 {
   return convert_uchar16(x);
@@ -33829,7 +33829,7 @@ uchar16 convert_uchar16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtz(double16 x)
 {
   return convert_uchar16_sat(x);
@@ -33837,7 +33837,7 @@ uchar16 convert_uchar16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rte(double16 x)
 {
   x = rint(x);
@@ -33846,7 +33846,7 @@ uchar16 convert_uchar16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -33855,7 +33855,7 @@ uchar16 convert_uchar16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtp(double16 x)
 {
   x = ceil(x);
@@ -33864,7 +33864,7 @@ uchar16 convert_uchar16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -33873,7 +33873,7 @@ uchar16 convert_uchar16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_rtn(double16 x)
 {
   x = floor(x);
@@ -33882,7 +33882,7 @@ uchar16 convert_uchar16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uchar16 convert_uchar16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -33891,7 +33891,7 @@ uchar16 convert_uchar16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtz(double x)
 {
   return convert_short(x);
@@ -33899,7 +33899,7 @@ short convert_short_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtz(double x)
 {
   return convert_short_sat(x);
@@ -33907,7 +33907,7 @@ short convert_short_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rte(double x)
 {
   x = rint(x);
@@ -33916,7 +33916,7 @@ short convert_short_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rte(double x)
 {
   x = rint(x);
@@ -33925,7 +33925,7 @@ short convert_short_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtp(double x)
 {
   x = ceil(x);
@@ -33934,7 +33934,7 @@ short convert_short_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtp(double x)
 {
   x = ceil(x);
@@ -33943,7 +33943,7 @@ short convert_short_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_rtn(double x)
 {
   x = floor(x);
@@ -33952,7 +33952,7 @@ short convert_short_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short convert_short_sat_rtn(double x)
 {
   x = floor(x);
@@ -33961,7 +33961,7 @@ short convert_short_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtz(double2 x)
 {
   return convert_short2(x);
@@ -33969,7 +33969,7 @@ short2 convert_short2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtz(double2 x)
 {
   return convert_short2_sat(x);
@@ -33977,7 +33977,7 @@ short2 convert_short2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rte(double2 x)
 {
   x = rint(x);
@@ -33986,7 +33986,7 @@ short2 convert_short2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -33995,7 +33995,7 @@ short2 convert_short2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtp(double2 x)
 {
   x = ceil(x);
@@ -34004,7 +34004,7 @@ short2 convert_short2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -34013,7 +34013,7 @@ short2 convert_short2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_rtn(double2 x)
 {
   x = floor(x);
@@ -34022,7 +34022,7 @@ short2 convert_short2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short2 convert_short2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -34031,7 +34031,7 @@ short2 convert_short2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtz(double3 x)
 {
   return convert_short3(x);
@@ -34039,7 +34039,7 @@ short3 convert_short3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtz(double3 x)
 {
   return convert_short3_sat(x);
@@ -34047,7 +34047,7 @@ short3 convert_short3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rte(double3 x)
 {
   x = rint(x);
@@ -34056,7 +34056,7 @@ short3 convert_short3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -34065,7 +34065,7 @@ short3 convert_short3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtp(double3 x)
 {
   x = ceil(x);
@@ -34074,7 +34074,7 @@ short3 convert_short3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -34083,7 +34083,7 @@ short3 convert_short3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_rtn(double3 x)
 {
   x = floor(x);
@@ -34092,7 +34092,7 @@ short3 convert_short3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short3 convert_short3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -34101,7 +34101,7 @@ short3 convert_short3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtz(double4 x)
 {
   return convert_short4(x);
@@ -34109,7 +34109,7 @@ short4 convert_short4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtz(double4 x)
 {
   return convert_short4_sat(x);
@@ -34117,7 +34117,7 @@ short4 convert_short4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rte(double4 x)
 {
   x = rint(x);
@@ -34126,7 +34126,7 @@ short4 convert_short4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -34135,7 +34135,7 @@ short4 convert_short4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtp(double4 x)
 {
   x = ceil(x);
@@ -34144,7 +34144,7 @@ short4 convert_short4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -34153,7 +34153,7 @@ short4 convert_short4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_rtn(double4 x)
 {
   x = floor(x);
@@ -34162,7 +34162,7 @@ short4 convert_short4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short4 convert_short4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -34171,7 +34171,7 @@ short4 convert_short4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtz(double8 x)
 {
   return convert_short8(x);
@@ -34179,7 +34179,7 @@ short8 convert_short8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtz(double8 x)
 {
   return convert_short8_sat(x);
@@ -34187,7 +34187,7 @@ short8 convert_short8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rte(double8 x)
 {
   x = rint(x);
@@ -34196,7 +34196,7 @@ short8 convert_short8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -34205,7 +34205,7 @@ short8 convert_short8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtp(double8 x)
 {
   x = ceil(x);
@@ -34214,7 +34214,7 @@ short8 convert_short8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -34223,7 +34223,7 @@ short8 convert_short8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_rtn(double8 x)
 {
   x = floor(x);
@@ -34232,7 +34232,7 @@ short8 convert_short8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short8 convert_short8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -34241,7 +34241,7 @@ short8 convert_short8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtz(double16 x)
 {
   return convert_short16(x);
@@ -34249,7 +34249,7 @@ short16 convert_short16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtz(double16 x)
 {
   return convert_short16_sat(x);
@@ -34257,7 +34257,7 @@ short16 convert_short16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rte(double16 x)
 {
   x = rint(x);
@@ -34266,7 +34266,7 @@ short16 convert_short16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -34275,7 +34275,7 @@ short16 convert_short16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtp(double16 x)
 {
   x = ceil(x);
@@ -34284,7 +34284,7 @@ short16 convert_short16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -34293,7 +34293,7 @@ short16 convert_short16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_rtn(double16 x)
 {
   x = floor(x);
@@ -34302,7 +34302,7 @@ short16 convert_short16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 short16 convert_short16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -34311,7 +34311,7 @@ short16 convert_short16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtz(double x)
 {
   return convert_ushort(x);
@@ -34319,7 +34319,7 @@ ushort convert_ushort_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtz(double x)
 {
   return convert_ushort_sat(x);
@@ -34327,7 +34327,7 @@ ushort convert_ushort_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rte(double x)
 {
   x = rint(x);
@@ -34336,7 +34336,7 @@ ushort convert_ushort_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rte(double x)
 {
   x = rint(x);
@@ -34345,7 +34345,7 @@ ushort convert_ushort_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtp(double x)
 {
   x = ceil(x);
@@ -34354,7 +34354,7 @@ ushort convert_ushort_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtp(double x)
 {
   x = ceil(x);
@@ -34363,7 +34363,7 @@ ushort convert_ushort_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_rtn(double x)
 {
   x = floor(x);
@@ -34372,7 +34372,7 @@ ushort convert_ushort_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort convert_ushort_sat_rtn(double x)
 {
   x = floor(x);
@@ -34381,7 +34381,7 @@ ushort convert_ushort_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtz(double2 x)
 {
   return convert_ushort2(x);
@@ -34389,7 +34389,7 @@ ushort2 convert_ushort2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtz(double2 x)
 {
   return convert_ushort2_sat(x);
@@ -34397,7 +34397,7 @@ ushort2 convert_ushort2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rte(double2 x)
 {
   x = rint(x);
@@ -34406,7 +34406,7 @@ ushort2 convert_ushort2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -34415,7 +34415,7 @@ ushort2 convert_ushort2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtp(double2 x)
 {
   x = ceil(x);
@@ -34424,7 +34424,7 @@ ushort2 convert_ushort2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -34433,7 +34433,7 @@ ushort2 convert_ushort2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_rtn(double2 x)
 {
   x = floor(x);
@@ -34442,7 +34442,7 @@ ushort2 convert_ushort2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort2 convert_ushort2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -34451,7 +34451,7 @@ ushort2 convert_ushort2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtz(double3 x)
 {
   return convert_ushort3(x);
@@ -34459,7 +34459,7 @@ ushort3 convert_ushort3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtz(double3 x)
 {
   return convert_ushort3_sat(x);
@@ -34467,7 +34467,7 @@ ushort3 convert_ushort3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rte(double3 x)
 {
   x = rint(x);
@@ -34476,7 +34476,7 @@ ushort3 convert_ushort3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -34485,7 +34485,7 @@ ushort3 convert_ushort3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtp(double3 x)
 {
   x = ceil(x);
@@ -34494,7 +34494,7 @@ ushort3 convert_ushort3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -34503,7 +34503,7 @@ ushort3 convert_ushort3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_rtn(double3 x)
 {
   x = floor(x);
@@ -34512,7 +34512,7 @@ ushort3 convert_ushort3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort3 convert_ushort3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -34521,7 +34521,7 @@ ushort3 convert_ushort3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtz(double4 x)
 {
   return convert_ushort4(x);
@@ -34529,7 +34529,7 @@ ushort4 convert_ushort4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtz(double4 x)
 {
   return convert_ushort4_sat(x);
@@ -34537,7 +34537,7 @@ ushort4 convert_ushort4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rte(double4 x)
 {
   x = rint(x);
@@ -34546,7 +34546,7 @@ ushort4 convert_ushort4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -34555,7 +34555,7 @@ ushort4 convert_ushort4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtp(double4 x)
 {
   x = ceil(x);
@@ -34564,7 +34564,7 @@ ushort4 convert_ushort4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -34573,7 +34573,7 @@ ushort4 convert_ushort4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_rtn(double4 x)
 {
   x = floor(x);
@@ -34582,7 +34582,7 @@ ushort4 convert_ushort4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort4 convert_ushort4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -34591,7 +34591,7 @@ ushort4 convert_ushort4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtz(double8 x)
 {
   return convert_ushort8(x);
@@ -34599,7 +34599,7 @@ ushort8 convert_ushort8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtz(double8 x)
 {
   return convert_ushort8_sat(x);
@@ -34607,7 +34607,7 @@ ushort8 convert_ushort8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rte(double8 x)
 {
   x = rint(x);
@@ -34616,7 +34616,7 @@ ushort8 convert_ushort8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -34625,7 +34625,7 @@ ushort8 convert_ushort8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtp(double8 x)
 {
   x = ceil(x);
@@ -34634,7 +34634,7 @@ ushort8 convert_ushort8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -34643,7 +34643,7 @@ ushort8 convert_ushort8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_rtn(double8 x)
 {
   x = floor(x);
@@ -34652,7 +34652,7 @@ ushort8 convert_ushort8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort8 convert_ushort8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -34661,7 +34661,7 @@ ushort8 convert_ushort8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtz(double16 x)
 {
   return convert_ushort16(x);
@@ -34669,7 +34669,7 @@ ushort16 convert_ushort16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtz(double16 x)
 {
   return convert_ushort16_sat(x);
@@ -34677,7 +34677,7 @@ ushort16 convert_ushort16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rte(double16 x)
 {
   x = rint(x);
@@ -34686,7 +34686,7 @@ ushort16 convert_ushort16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -34695,7 +34695,7 @@ ushort16 convert_ushort16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtp(double16 x)
 {
   x = ceil(x);
@@ -34704,7 +34704,7 @@ ushort16 convert_ushort16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -34713,7 +34713,7 @@ ushort16 convert_ushort16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_rtn(double16 x)
 {
   x = floor(x);
@@ -34722,7 +34722,7 @@ ushort16 convert_ushort16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ushort16 convert_ushort16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -34731,7 +34731,7 @@ ushort16 convert_ushort16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtz(double x)
 {
   return convert_int(x);
@@ -34739,7 +34739,7 @@ int convert_int_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtz(double x)
 {
   return convert_int_sat(x);
@@ -34747,7 +34747,7 @@ int convert_int_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rte(double x)
 {
   x = rint(x);
@@ -34756,7 +34756,7 @@ int convert_int_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rte(double x)
 {
   x = rint(x);
@@ -34765,7 +34765,7 @@ int convert_int_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtp(double x)
 {
   x = ceil(x);
@@ -34774,7 +34774,7 @@ int convert_int_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtp(double x)
 {
   x = ceil(x);
@@ -34783,7 +34783,7 @@ int convert_int_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_rtn(double x)
 {
   x = floor(x);
@@ -34792,7 +34792,7 @@ int convert_int_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int convert_int_sat_rtn(double x)
 {
   x = floor(x);
@@ -34801,7 +34801,7 @@ int convert_int_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtz(double2 x)
 {
   return convert_int2(x);
@@ -34809,7 +34809,7 @@ int2 convert_int2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtz(double2 x)
 {
   return convert_int2_sat(x);
@@ -34817,7 +34817,7 @@ int2 convert_int2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rte(double2 x)
 {
   x = rint(x);
@@ -34826,7 +34826,7 @@ int2 convert_int2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -34835,7 +34835,7 @@ int2 convert_int2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtp(double2 x)
 {
   x = ceil(x);
@@ -34844,7 +34844,7 @@ int2 convert_int2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -34853,7 +34853,7 @@ int2 convert_int2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_rtn(double2 x)
 {
   x = floor(x);
@@ -34862,7 +34862,7 @@ int2 convert_int2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int2 convert_int2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -34871,7 +34871,7 @@ int2 convert_int2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtz(double3 x)
 {
   return convert_int3(x);
@@ -34879,7 +34879,7 @@ int3 convert_int3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtz(double3 x)
 {
   return convert_int3_sat(x);
@@ -34887,7 +34887,7 @@ int3 convert_int3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rte(double3 x)
 {
   x = rint(x);
@@ -34896,7 +34896,7 @@ int3 convert_int3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -34905,7 +34905,7 @@ int3 convert_int3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtp(double3 x)
 {
   x = ceil(x);
@@ -34914,7 +34914,7 @@ int3 convert_int3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -34923,7 +34923,7 @@ int3 convert_int3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_rtn(double3 x)
 {
   x = floor(x);
@@ -34932,7 +34932,7 @@ int3 convert_int3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int3 convert_int3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -34941,7 +34941,7 @@ int3 convert_int3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtz(double4 x)
 {
   return convert_int4(x);
@@ -34949,7 +34949,7 @@ int4 convert_int4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtz(double4 x)
 {
   return convert_int4_sat(x);
@@ -34957,7 +34957,7 @@ int4 convert_int4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rte(double4 x)
 {
   x = rint(x);
@@ -34966,7 +34966,7 @@ int4 convert_int4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -34975,7 +34975,7 @@ int4 convert_int4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtp(double4 x)
 {
   x = ceil(x);
@@ -34984,7 +34984,7 @@ int4 convert_int4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -34993,7 +34993,7 @@ int4 convert_int4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_rtn(double4 x)
 {
   x = floor(x);
@@ -35002,7 +35002,7 @@ int4 convert_int4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int4 convert_int4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -35011,7 +35011,7 @@ int4 convert_int4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtz(double8 x)
 {
   return convert_int8(x);
@@ -35019,7 +35019,7 @@ int8 convert_int8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtz(double8 x)
 {
   return convert_int8_sat(x);
@@ -35027,7 +35027,7 @@ int8 convert_int8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rte(double8 x)
 {
   x = rint(x);
@@ -35036,7 +35036,7 @@ int8 convert_int8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -35045,7 +35045,7 @@ int8 convert_int8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtp(double8 x)
 {
   x = ceil(x);
@@ -35054,7 +35054,7 @@ int8 convert_int8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -35063,7 +35063,7 @@ int8 convert_int8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_rtn(double8 x)
 {
   x = floor(x);
@@ -35072,7 +35072,7 @@ int8 convert_int8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int8 convert_int8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -35081,7 +35081,7 @@ int8 convert_int8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtz(double16 x)
 {
   return convert_int16(x);
@@ -35089,7 +35089,7 @@ int16 convert_int16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtz(double16 x)
 {
   return convert_int16_sat(x);
@@ -35097,7 +35097,7 @@ int16 convert_int16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rte(double16 x)
 {
   x = rint(x);
@@ -35106,7 +35106,7 @@ int16 convert_int16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -35115,7 +35115,7 @@ int16 convert_int16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtp(double16 x)
 {
   x = ceil(x);
@@ -35124,7 +35124,7 @@ int16 convert_int16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -35133,7 +35133,7 @@ int16 convert_int16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_rtn(double16 x)
 {
   x = floor(x);
@@ -35142,7 +35142,7 @@ int16 convert_int16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 int16 convert_int16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -35151,7 +35151,7 @@ int16 convert_int16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtz(double x)
 {
   return convert_uint(x);
@@ -35159,7 +35159,7 @@ uint convert_uint_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtz(double x)
 {
   return convert_uint_sat(x);
@@ -35167,7 +35167,7 @@ uint convert_uint_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rte(double x)
 {
   x = rint(x);
@@ -35176,7 +35176,7 @@ uint convert_uint_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rte(double x)
 {
   x = rint(x);
@@ -35185,7 +35185,7 @@ uint convert_uint_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtp(double x)
 {
   x = ceil(x);
@@ -35194,7 +35194,7 @@ uint convert_uint_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtp(double x)
 {
   x = ceil(x);
@@ -35203,7 +35203,7 @@ uint convert_uint_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_rtn(double x)
 {
   x = floor(x);
@@ -35212,7 +35212,7 @@ uint convert_uint_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint convert_uint_sat_rtn(double x)
 {
   x = floor(x);
@@ -35221,7 +35221,7 @@ uint convert_uint_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtz(double2 x)
 {
   return convert_uint2(x);
@@ -35229,7 +35229,7 @@ uint2 convert_uint2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtz(double2 x)
 {
   return convert_uint2_sat(x);
@@ -35237,7 +35237,7 @@ uint2 convert_uint2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rte(double2 x)
 {
   x = rint(x);
@@ -35246,7 +35246,7 @@ uint2 convert_uint2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -35255,7 +35255,7 @@ uint2 convert_uint2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtp(double2 x)
 {
   x = ceil(x);
@@ -35264,7 +35264,7 @@ uint2 convert_uint2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -35273,7 +35273,7 @@ uint2 convert_uint2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_rtn(double2 x)
 {
   x = floor(x);
@@ -35282,7 +35282,7 @@ uint2 convert_uint2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint2 convert_uint2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -35291,7 +35291,7 @@ uint2 convert_uint2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtz(double3 x)
 {
   return convert_uint3(x);
@@ -35299,7 +35299,7 @@ uint3 convert_uint3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtz(double3 x)
 {
   return convert_uint3_sat(x);
@@ -35307,7 +35307,7 @@ uint3 convert_uint3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rte(double3 x)
 {
   x = rint(x);
@@ -35316,7 +35316,7 @@ uint3 convert_uint3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -35325,7 +35325,7 @@ uint3 convert_uint3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtp(double3 x)
 {
   x = ceil(x);
@@ -35334,7 +35334,7 @@ uint3 convert_uint3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -35343,7 +35343,7 @@ uint3 convert_uint3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_rtn(double3 x)
 {
   x = floor(x);
@@ -35352,7 +35352,7 @@ uint3 convert_uint3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint3 convert_uint3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -35361,7 +35361,7 @@ uint3 convert_uint3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtz(double4 x)
 {
   return convert_uint4(x);
@@ -35369,7 +35369,7 @@ uint4 convert_uint4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtz(double4 x)
 {
   return convert_uint4_sat(x);
@@ -35377,7 +35377,7 @@ uint4 convert_uint4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rte(double4 x)
 {
   x = rint(x);
@@ -35386,7 +35386,7 @@ uint4 convert_uint4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -35395,7 +35395,7 @@ uint4 convert_uint4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtp(double4 x)
 {
   x = ceil(x);
@@ -35404,7 +35404,7 @@ uint4 convert_uint4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -35413,7 +35413,7 @@ uint4 convert_uint4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_rtn(double4 x)
 {
   x = floor(x);
@@ -35422,7 +35422,7 @@ uint4 convert_uint4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint4 convert_uint4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -35431,7 +35431,7 @@ uint4 convert_uint4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtz(double8 x)
 {
   return convert_uint8(x);
@@ -35439,7 +35439,7 @@ uint8 convert_uint8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtz(double8 x)
 {
   return convert_uint8_sat(x);
@@ -35447,7 +35447,7 @@ uint8 convert_uint8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rte(double8 x)
 {
   x = rint(x);
@@ -35456,7 +35456,7 @@ uint8 convert_uint8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -35465,7 +35465,7 @@ uint8 convert_uint8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtp(double8 x)
 {
   x = ceil(x);
@@ -35474,7 +35474,7 @@ uint8 convert_uint8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -35483,7 +35483,7 @@ uint8 convert_uint8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_rtn(double8 x)
 {
   x = floor(x);
@@ -35492,7 +35492,7 @@ uint8 convert_uint8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint8 convert_uint8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -35501,7 +35501,7 @@ uint8 convert_uint8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtz(double16 x)
 {
   return convert_uint16(x);
@@ -35509,7 +35509,7 @@ uint16 convert_uint16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtz(double16 x)
 {
   return convert_uint16_sat(x);
@@ -35517,7 +35517,7 @@ uint16 convert_uint16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rte(double16 x)
 {
   x = rint(x);
@@ -35526,7 +35526,7 @@ uint16 convert_uint16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -35535,7 +35535,7 @@ uint16 convert_uint16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtp(double16 x)
 {
   x = ceil(x);
@@ -35544,7 +35544,7 @@ uint16 convert_uint16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -35553,7 +35553,7 @@ uint16 convert_uint16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_rtn(double16 x)
 {
   x = floor(x);
@@ -35562,7 +35562,7 @@ uint16 convert_uint16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 uint16 convert_uint16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -35571,7 +35571,7 @@ uint16 convert_uint16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtz(double x)
 {
   return convert_long(x);
@@ -35579,7 +35579,7 @@ long convert_long_rtz(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtz(double x)
 {
   return convert_long_sat(x);
@@ -35587,7 +35587,7 @@ long convert_long_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rte(double x)
 {
   x = rint(x);
@@ -35596,7 +35596,7 @@ long convert_long_rte(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rte(double x)
 {
   x = rint(x);
@@ -35605,7 +35605,7 @@ long convert_long_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtp(double x)
 {
   x = ceil(x);
@@ -35614,7 +35614,7 @@ long convert_long_rtp(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtp(double x)
 {
   x = ceil(x);
@@ -35623,7 +35623,7 @@ long convert_long_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_rtn(double x)
 {
   x = floor(x);
@@ -35632,7 +35632,7 @@ long convert_long_rtn(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long convert_long_sat_rtn(double x)
 {
   x = floor(x);
@@ -35641,7 +35641,7 @@ long convert_long_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtz(double2 x)
 {
   return convert_long2(x);
@@ -35649,7 +35649,7 @@ long2 convert_long2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtz(double2 x)
 {
   return convert_long2_sat(x);
@@ -35657,7 +35657,7 @@ long2 convert_long2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rte(double2 x)
 {
   x = rint(x);
@@ -35666,7 +35666,7 @@ long2 convert_long2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -35675,7 +35675,7 @@ long2 convert_long2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtp(double2 x)
 {
   x = ceil(x);
@@ -35684,7 +35684,7 @@ long2 convert_long2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -35693,7 +35693,7 @@ long2 convert_long2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_rtn(double2 x)
 {
   x = floor(x);
@@ -35702,7 +35702,7 @@ long2 convert_long2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long2 convert_long2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -35711,7 +35711,7 @@ long2 convert_long2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtz(double3 x)
 {
   return convert_long3(x);
@@ -35719,7 +35719,7 @@ long3 convert_long3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtz(double3 x)
 {
   return convert_long3_sat(x);
@@ -35727,7 +35727,7 @@ long3 convert_long3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rte(double3 x)
 {
   x = rint(x);
@@ -35736,7 +35736,7 @@ long3 convert_long3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -35745,7 +35745,7 @@ long3 convert_long3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtp(double3 x)
 {
   x = ceil(x);
@@ -35754,7 +35754,7 @@ long3 convert_long3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -35763,7 +35763,7 @@ long3 convert_long3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_rtn(double3 x)
 {
   x = floor(x);
@@ -35772,7 +35772,7 @@ long3 convert_long3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long3 convert_long3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -35781,7 +35781,7 @@ long3 convert_long3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtz(double4 x)
 {
   return convert_long4(x);
@@ -35789,7 +35789,7 @@ long4 convert_long4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtz(double4 x)
 {
   return convert_long4_sat(x);
@@ -35797,7 +35797,7 @@ long4 convert_long4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rte(double4 x)
 {
   x = rint(x);
@@ -35806,7 +35806,7 @@ long4 convert_long4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -35815,7 +35815,7 @@ long4 convert_long4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtp(double4 x)
 {
   x = ceil(x);
@@ -35824,7 +35824,7 @@ long4 convert_long4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -35833,7 +35833,7 @@ long4 convert_long4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_rtn(double4 x)
 {
   x = floor(x);
@@ -35842,7 +35842,7 @@ long4 convert_long4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long4 convert_long4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -35851,7 +35851,7 @@ long4 convert_long4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtz(double8 x)
 {
   return convert_long8(x);
@@ -35859,7 +35859,7 @@ long8 convert_long8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtz(double8 x)
 {
   return convert_long8_sat(x);
@@ -35867,7 +35867,7 @@ long8 convert_long8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rte(double8 x)
 {
   x = rint(x);
@@ -35876,7 +35876,7 @@ long8 convert_long8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -35885,7 +35885,7 @@ long8 convert_long8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtp(double8 x)
 {
   x = ceil(x);
@@ -35894,7 +35894,7 @@ long8 convert_long8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -35903,7 +35903,7 @@ long8 convert_long8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_rtn(double8 x)
 {
   x = floor(x);
@@ -35912,7 +35912,7 @@ long8 convert_long8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long8 convert_long8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -35921,7 +35921,7 @@ long8 convert_long8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtz(double16 x)
 {
   return convert_long16(x);
@@ -35929,7 +35929,7 @@ long16 convert_long16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtz(double16 x)
 {
   return convert_long16_sat(x);
@@ -35937,7 +35937,7 @@ long16 convert_long16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rte(double16 x)
 {
   x = rint(x);
@@ -35946,7 +35946,7 @@ long16 convert_long16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -35955,7 +35955,7 @@ long16 convert_long16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtp(double16 x)
 {
   x = ceil(x);
@@ -35964,7 +35964,7 @@ long16 convert_long16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -35973,7 +35973,7 @@ long16 convert_long16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_rtn(double16 x)
 {
   x = floor(x);
@@ -35982,7 +35982,7 @@ long16 convert_long16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 long16 convert_long16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -35991,7 +35991,7 @@ long16 convert_long16_sat_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtz(double x)
 {
   return convert_ulong(x);
@@ -35999,7 +35999,7 @@ ulong convert_ulong_rtz(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtz(double x)
 {
   return convert_ulong_sat(x);
@@ -36007,7 +36007,7 @@ ulong convert_ulong_sat_rtz(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rte(double x)
 {
   x = rint(x);
@@ -36016,7 +36016,7 @@ ulong convert_ulong_rte(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rte(double x)
 {
   x = rint(x);
@@ -36025,7 +36025,7 @@ ulong convert_ulong_sat_rte(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtp(double x)
 {
   x = ceil(x);
@@ -36034,7 +36034,7 @@ ulong convert_ulong_rtp(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtp(double x)
 {
   x = ceil(x);
@@ -36043,7 +36043,7 @@ ulong convert_ulong_sat_rtp(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_rtn(double x)
 {
   x = floor(x);
@@ -36052,7 +36052,7 @@ ulong convert_ulong_rtn(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong convert_ulong_sat_rtn(double x)
 {
   x = floor(x);
@@ -36061,7 +36061,7 @@ ulong convert_ulong_sat_rtn(double x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtz(double2 x)
 {
   return convert_ulong2(x);
@@ -36069,7 +36069,7 @@ ulong2 convert_ulong2_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtz(double2 x)
 {
   return convert_ulong2_sat(x);
@@ -36077,7 +36077,7 @@ ulong2 convert_ulong2_sat_rtz(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rte(double2 x)
 {
   x = rint(x);
@@ -36086,7 +36086,7 @@ ulong2 convert_ulong2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rte(double2 x)
 {
   x = rint(x);
@@ -36095,7 +36095,7 @@ ulong2 convert_ulong2_sat_rte(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtp(double2 x)
 {
   x = ceil(x);
@@ -36104,7 +36104,7 @@ ulong2 convert_ulong2_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtp(double2 x)
 {
   x = ceil(x);
@@ -36113,7 +36113,7 @@ ulong2 convert_ulong2_sat_rtp(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_rtn(double2 x)
 {
   x = floor(x);
@@ -36122,7 +36122,7 @@ ulong2 convert_ulong2_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong2 convert_ulong2_sat_rtn(double2 x)
 {
   x = floor(x);
@@ -36131,7 +36131,7 @@ ulong2 convert_ulong2_sat_rtn(double2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtz(double3 x)
 {
   return convert_ulong3(x);
@@ -36139,7 +36139,7 @@ ulong3 convert_ulong3_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtz(double3 x)
 {
   return convert_ulong3_sat(x);
@@ -36147,7 +36147,7 @@ ulong3 convert_ulong3_sat_rtz(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rte(double3 x)
 {
   x = rint(x);
@@ -36156,7 +36156,7 @@ ulong3 convert_ulong3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rte(double3 x)
 {
   x = rint(x);
@@ -36165,7 +36165,7 @@ ulong3 convert_ulong3_sat_rte(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtp(double3 x)
 {
   x = ceil(x);
@@ -36174,7 +36174,7 @@ ulong3 convert_ulong3_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtp(double3 x)
 {
   x = ceil(x);
@@ -36183,7 +36183,7 @@ ulong3 convert_ulong3_sat_rtp(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_rtn(double3 x)
 {
   x = floor(x);
@@ -36192,7 +36192,7 @@ ulong3 convert_ulong3_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong3 convert_ulong3_sat_rtn(double3 x)
 {
   x = floor(x);
@@ -36201,7 +36201,7 @@ ulong3 convert_ulong3_sat_rtn(double3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtz(double4 x)
 {
   return convert_ulong4(x);
@@ -36209,7 +36209,7 @@ ulong4 convert_ulong4_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtz(double4 x)
 {
   return convert_ulong4_sat(x);
@@ -36217,7 +36217,7 @@ ulong4 convert_ulong4_sat_rtz(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rte(double4 x)
 {
   x = rint(x);
@@ -36226,7 +36226,7 @@ ulong4 convert_ulong4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rte(double4 x)
 {
   x = rint(x);
@@ -36235,7 +36235,7 @@ ulong4 convert_ulong4_sat_rte(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtp(double4 x)
 {
   x = ceil(x);
@@ -36244,7 +36244,7 @@ ulong4 convert_ulong4_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtp(double4 x)
 {
   x = ceil(x);
@@ -36253,7 +36253,7 @@ ulong4 convert_ulong4_sat_rtp(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_rtn(double4 x)
 {
   x = floor(x);
@@ -36262,7 +36262,7 @@ ulong4 convert_ulong4_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong4 convert_ulong4_sat_rtn(double4 x)
 {
   x = floor(x);
@@ -36271,7 +36271,7 @@ ulong4 convert_ulong4_sat_rtn(double4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtz(double8 x)
 {
   return convert_ulong8(x);
@@ -36279,7 +36279,7 @@ ulong8 convert_ulong8_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtz(double8 x)
 {
   return convert_ulong8_sat(x);
@@ -36287,7 +36287,7 @@ ulong8 convert_ulong8_sat_rtz(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rte(double8 x)
 {
   x = rint(x);
@@ -36296,7 +36296,7 @@ ulong8 convert_ulong8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rte(double8 x)
 {
   x = rint(x);
@@ -36305,7 +36305,7 @@ ulong8 convert_ulong8_sat_rte(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtp(double8 x)
 {
   x = ceil(x);
@@ -36314,7 +36314,7 @@ ulong8 convert_ulong8_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtp(double8 x)
 {
   x = ceil(x);
@@ -36323,7 +36323,7 @@ ulong8 convert_ulong8_sat_rtp(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_rtn(double8 x)
 {
   x = floor(x);
@@ -36332,7 +36332,7 @@ ulong8 convert_ulong8_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong8 convert_ulong8_sat_rtn(double8 x)
 {
   x = floor(x);
@@ -36341,7 +36341,7 @@ ulong8 convert_ulong8_sat_rtn(double8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtz(double16 x)
 {
   return convert_ulong16(x);
@@ -36349,7 +36349,7 @@ ulong16 convert_ulong16_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtz(double16 x)
 {
   return convert_ulong16_sat(x);
@@ -36357,7 +36357,7 @@ ulong16 convert_ulong16_sat_rtz(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rte(double16 x)
 {
   x = rint(x);
@@ -36366,7 +36366,7 @@ ulong16 convert_ulong16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rte(double16 x)
 {
   x = rint(x);
@@ -36375,7 +36375,7 @@ ulong16 convert_ulong16_sat_rte(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtp(double16 x)
 {
   x = ceil(x);
@@ -36384,7 +36384,7 @@ ulong16 convert_ulong16_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtp(double16 x)
 {
   x = ceil(x);
@@ -36393,7 +36393,7 @@ ulong16 convert_ulong16_sat_rtp(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_rtn(double16 x)
 {
   x = floor(x);
@@ -36402,7 +36402,7 @@ ulong16 convert_ulong16_rtn(double16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 ulong16 convert_ulong16_sat_rtn(double16 x)
 {
   x = floor(x);
@@ -36410,212 +36410,160 @@ ulong16 convert_ulong16_sat_rtn(double16 x)
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(char x)
 {
-  float r = convert_float(x);
-  char y = convert_char(r);
-  uchar abs_x = abs(x);
-  uchar abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(char x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(char x)
 {
-  float r = convert_float(x);
-  char y = convert_char(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(char x)
 {
-  float r = convert_float(x);
-  char y = convert_char(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(char2 x)
 {
-  float2 r = convert_float2(x);
-  char2 y = convert_char2(r);
-  uchar2 abs_x = abs(x);
-  uchar2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(char2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(char2 x)
 {
-  float2 r = convert_float2(x);
-  char2 y = convert_char2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(char2 x)
 {
-  float2 r = convert_float2(x);
-  char2 y = convert_char2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(char3 x)
 {
-  float3 r = convert_float3(x);
-  char3 y = convert_char3(r);
-  uchar3 abs_x = abs(x);
-  uchar3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(char3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(char3 x)
 {
-  float3 r = convert_float3(x);
-  char3 y = convert_char3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(char3 x)
 {
-  float3 r = convert_float3(x);
-  char3 y = convert_char3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(char4 x)
 {
-  float4 r = convert_float4(x);
-  char4 y = convert_char4(r);
-  uchar4 abs_x = abs(x);
-  uchar4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(char4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(char4 x)
 {
-  float4 r = convert_float4(x);
-  char4 y = convert_char4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(char4 x)
 {
-  float4 r = convert_float4(x);
-  char4 y = convert_char4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(char8 x)
 {
-  float8 r = convert_float8(x);
-  char8 y = convert_char8(r);
-  uchar8 abs_x = abs(x);
-  uchar8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(char8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(char8 x)
 {
-  float8 r = convert_float8(x);
-  char8 y = convert_char8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(char8 x)
 {
-  float8 r = convert_float8(x);
-  char8 y = convert_char8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(char16 x)
 {
-  float16 r = convert_float16(x);
-  char16 y = convert_char16(r);
-  uchar16 abs_x = abs(x);
-  uchar16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(char16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(char16 x)
 {
-  float16 r = convert_float16(x);
-  char16 y = convert_char16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(char16 x)
 {
-  float16 r = convert_float16(x);
-  char16 y = convert_char16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return convert_float16(x);
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(char x)
 {
-  double r = convert_double(x);
-  char y = convert_char(r);
-  uchar abs_x = abs(x);
-  uchar abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(char x)
 {
   return convert_double(x);
@@ -36623,39 +36571,31 @@ double convert_double_rte(char x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(char x)
 {
-  double r = convert_double(x);
-  char y = convert_char(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(char x)
 {
-  double r = convert_double(x);
-  char y = convert_char(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(char2 x)
 {
-  double2 r = convert_double2(x);
-  char2 y = convert_char2(r);
-  uchar2 abs_x = abs(x);
-  uchar2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(char2 x)
 {
   return convert_double2(x);
@@ -36663,39 +36603,31 @@ double2 convert_double2_rte(char2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(char2 x)
 {
-  double2 r = convert_double2(x);
-  char2 y = convert_char2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(char2 x)
 {
-  double2 r = convert_double2(x);
-  char2 y = convert_char2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(char3 x)
 {
-  double3 r = convert_double3(x);
-  char3 y = convert_char3(r);
-  uchar3 abs_x = abs(x);
-  uchar3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(char3 x)
 {
   return convert_double3(x);
@@ -36703,39 +36635,31 @@ double3 convert_double3_rte(char3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(char3 x)
 {
-  double3 r = convert_double3(x);
-  char3 y = convert_char3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(char3 x)
 {
-  double3 r = convert_double3(x);
-  char3 y = convert_char3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(char4 x)
 {
-  double4 r = convert_double4(x);
-  char4 y = convert_char4(r);
-  uchar4 abs_x = abs(x);
-  uchar4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(char4 x)
 {
   return convert_double4(x);
@@ -36743,39 +36667,31 @@ double4 convert_double4_rte(char4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(char4 x)
 {
-  double4 r = convert_double4(x);
-  char4 y = convert_char4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(char4 x)
 {
-  double4 r = convert_double4(x);
-  char4 y = convert_char4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(char8 x)
 {
-  double8 r = convert_double8(x);
-  char8 y = convert_char8(r);
-  uchar8 abs_x = abs(x);
-  uchar8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(char8 x)
 {
   return convert_double8(x);
@@ -36783,39 +36699,31 @@ double8 convert_double8_rte(char8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(char8 x)
 {
-  double8 r = convert_double8(x);
-  char8 y = convert_char8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(char8 x)
 {
-  double8 r = convert_double8(x);
-  char8 y = convert_char8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(char16 x)
 {
-  double16 r = convert_double16(x);
-  char16 y = convert_char16(r);
-  uchar16 abs_x = abs(x);
-  uchar16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(char16 x)
 {
   return convert_double16(x);
@@ -36823,231 +36731,175 @@ double16 convert_double16_rte(char16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(char16 x)
 {
-  double16 r = convert_double16(x);
-  char16 y = convert_char16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(char16 x)
 {
-  double16 r = convert_double16(x);
-  char16 y = convert_char16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(uchar x)
 {
-  float r = convert_float(x);
-  uchar y = convert_uchar(r);
-  uchar abs_x = abs(x);
-  uchar abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(uchar x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(uchar x)
 {
-  float r = convert_float(x);
-  uchar y = convert_uchar(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(uchar x)
 {
-  float r = convert_float(x);
-  uchar y = convert_uchar(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(uchar2 x)
 {
-  float2 r = convert_float2(x);
-  uchar2 y = convert_uchar2(r);
-  uchar2 abs_x = abs(x);
-  uchar2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(uchar2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(uchar2 x)
 {
-  float2 r = convert_float2(x);
-  uchar2 y = convert_uchar2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(uchar2 x)
 {
-  float2 r = convert_float2(x);
-  uchar2 y = convert_uchar2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(uchar3 x)
 {
-  float3 r = convert_float3(x);
-  uchar3 y = convert_uchar3(r);
-  uchar3 abs_x = abs(x);
-  uchar3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(uchar3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(uchar3 x)
 {
-  float3 r = convert_float3(x);
-  uchar3 y = convert_uchar3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(uchar3 x)
 {
-  float3 r = convert_float3(x);
-  uchar3 y = convert_uchar3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(uchar4 x)
 {
-  float4 r = convert_float4(x);
-  uchar4 y = convert_uchar4(r);
-  uchar4 abs_x = abs(x);
-  uchar4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(uchar4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(uchar4 x)
 {
-  float4 r = convert_float4(x);
-  uchar4 y = convert_uchar4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(uchar4 x)
 {
-  float4 r = convert_float4(x);
-  uchar4 y = convert_uchar4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(uchar8 x)
 {
-  float8 r = convert_float8(x);
-  uchar8 y = convert_uchar8(r);
-  uchar8 abs_x = abs(x);
-  uchar8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(uchar8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(uchar8 x)
 {
-  float8 r = convert_float8(x);
-  uchar8 y = convert_uchar8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(uchar8 x)
 {
-  float8 r = convert_float8(x);
-  uchar8 y = convert_uchar8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(uchar16 x)
 {
-  float16 r = convert_float16(x);
-  uchar16 y = convert_uchar16(r);
-  uchar16 abs_x = abs(x);
-  uchar16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(uchar16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(uchar16 x)
 {
-  float16 r = convert_float16(x);
-  uchar16 y = convert_uchar16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(uchar16 x)
 {
-  float16 r = convert_float16(x);
-  uchar16 y = convert_uchar16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return convert_float16(x);
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(uchar x)
 {
-  double r = convert_double(x);
-  uchar y = convert_uchar(r);
-  uchar abs_x = abs(x);
-  uchar abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(uchar x)
 {
   return convert_double(x);
@@ -37055,39 +36907,31 @@ double convert_double_rte(uchar x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(uchar x)
 {
-  double r = convert_double(x);
-  uchar y = convert_uchar(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(uchar x)
 {
-  double r = convert_double(x);
-  uchar y = convert_uchar(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(uchar2 x)
 {
-  double2 r = convert_double2(x);
-  uchar2 y = convert_uchar2(r);
-  uchar2 abs_x = abs(x);
-  uchar2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(uchar2 x)
 {
   return convert_double2(x);
@@ -37095,39 +36939,31 @@ double2 convert_double2_rte(uchar2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(uchar2 x)
 {
-  double2 r = convert_double2(x);
-  uchar2 y = convert_uchar2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(uchar2 x)
 {
-  double2 r = convert_double2(x);
-  uchar2 y = convert_uchar2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(uchar3 x)
 {
-  double3 r = convert_double3(x);
-  uchar3 y = convert_uchar3(r);
-  uchar3 abs_x = abs(x);
-  uchar3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(uchar3 x)
 {
   return convert_double3(x);
@@ -37135,39 +36971,31 @@ double3 convert_double3_rte(uchar3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(uchar3 x)
 {
-  double3 r = convert_double3(x);
-  uchar3 y = convert_uchar3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(uchar3 x)
 {
-  double3 r = convert_double3(x);
-  uchar3 y = convert_uchar3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(uchar4 x)
 {
-  double4 r = convert_double4(x);
-  uchar4 y = convert_uchar4(r);
-  uchar4 abs_x = abs(x);
-  uchar4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(uchar4 x)
 {
   return convert_double4(x);
@@ -37175,39 +37003,31 @@ double4 convert_double4_rte(uchar4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(uchar4 x)
 {
-  double4 r = convert_double4(x);
-  uchar4 y = convert_uchar4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(uchar4 x)
 {
-  double4 r = convert_double4(x);
-  uchar4 y = convert_uchar4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(uchar8 x)
 {
-  double8 r = convert_double8(x);
-  uchar8 y = convert_uchar8(r);
-  uchar8 abs_x = abs(x);
-  uchar8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(uchar8 x)
 {
   return convert_double8(x);
@@ -37215,39 +37035,31 @@ double8 convert_double8_rte(uchar8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(uchar8 x)
 {
-  double8 r = convert_double8(x);
-  uchar8 y = convert_uchar8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(uchar8 x)
 {
-  double8 r = convert_double8(x);
-  uchar8 y = convert_uchar8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(uchar16 x)
 {
-  double16 r = convert_double16(x);
-  uchar16 y = convert_uchar16(r);
-  uchar16 abs_x = abs(x);
-  uchar16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(uchar16 x)
 {
   return convert_double16(x);
@@ -37255,231 +37067,175 @@ double16 convert_double16_rte(uchar16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(uchar16 x)
 {
-  double16 r = convert_double16(x);
-  uchar16 y = convert_uchar16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(uchar16 x)
 {
-  double16 r = convert_double16(x);
-  uchar16 y = convert_uchar16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(short x)
 {
-  float r = convert_float(x);
-  short y = convert_short(r);
-  ushort abs_x = abs(x);
-  ushort abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(short x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(short x)
 {
-  float r = convert_float(x);
-  short y = convert_short(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(short x)
 {
-  float r = convert_float(x);
-  short y = convert_short(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(short2 x)
 {
-  float2 r = convert_float2(x);
-  short2 y = convert_short2(r);
-  ushort2 abs_x = abs(x);
-  ushort2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(short2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(short2 x)
 {
-  float2 r = convert_float2(x);
-  short2 y = convert_short2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(short2 x)
 {
-  float2 r = convert_float2(x);
-  short2 y = convert_short2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(short3 x)
 {
-  float3 r = convert_float3(x);
-  short3 y = convert_short3(r);
-  ushort3 abs_x = abs(x);
-  ushort3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(short3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(short3 x)
 {
-  float3 r = convert_float3(x);
-  short3 y = convert_short3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(short3 x)
 {
-  float3 r = convert_float3(x);
-  short3 y = convert_short3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(short4 x)
 {
-  float4 r = convert_float4(x);
-  short4 y = convert_short4(r);
-  ushort4 abs_x = abs(x);
-  ushort4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(short4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(short4 x)
 {
-  float4 r = convert_float4(x);
-  short4 y = convert_short4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(short4 x)
 {
-  float4 r = convert_float4(x);
-  short4 y = convert_short4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(short8 x)
 {
-  float8 r = convert_float8(x);
-  short8 y = convert_short8(r);
-  ushort8 abs_x = abs(x);
-  ushort8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(short8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(short8 x)
 {
-  float8 r = convert_float8(x);
-  short8 y = convert_short8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(short8 x)
 {
-  float8 r = convert_float8(x);
-  short8 y = convert_short8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(short16 x)
 {
-  float16 r = convert_float16(x);
-  short16 y = convert_short16(r);
-  ushort16 abs_x = abs(x);
-  ushort16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(short16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(short16 x)
 {
-  float16 r = convert_float16(x);
-  short16 y = convert_short16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(short16 x)
 {
-  float16 r = convert_float16(x);
-  short16 y = convert_short16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return convert_float16(x);
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(short x)
 {
-  double r = convert_double(x);
-  short y = convert_short(r);
-  ushort abs_x = abs(x);
-  ushort abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(short x)
 {
   return convert_double(x);
@@ -37487,39 +37243,31 @@ double convert_double_rte(short x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(short x)
 {
-  double r = convert_double(x);
-  short y = convert_short(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(short x)
 {
-  double r = convert_double(x);
-  short y = convert_short(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(short2 x)
 {
-  double2 r = convert_double2(x);
-  short2 y = convert_short2(r);
-  ushort2 abs_x = abs(x);
-  ushort2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(short2 x)
 {
   return convert_double2(x);
@@ -37527,39 +37275,31 @@ double2 convert_double2_rte(short2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(short2 x)
 {
-  double2 r = convert_double2(x);
-  short2 y = convert_short2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(short2 x)
 {
-  double2 r = convert_double2(x);
-  short2 y = convert_short2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(short3 x)
 {
-  double3 r = convert_double3(x);
-  short3 y = convert_short3(r);
-  ushort3 abs_x = abs(x);
-  ushort3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(short3 x)
 {
   return convert_double3(x);
@@ -37567,39 +37307,31 @@ double3 convert_double3_rte(short3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(short3 x)
 {
-  double3 r = convert_double3(x);
-  short3 y = convert_short3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(short3 x)
 {
-  double3 r = convert_double3(x);
-  short3 y = convert_short3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(short4 x)
 {
-  double4 r = convert_double4(x);
-  short4 y = convert_short4(r);
-  ushort4 abs_x = abs(x);
-  ushort4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(short4 x)
 {
   return convert_double4(x);
@@ -37607,39 +37339,31 @@ double4 convert_double4_rte(short4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(short4 x)
 {
-  double4 r = convert_double4(x);
-  short4 y = convert_short4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(short4 x)
 {
-  double4 r = convert_double4(x);
-  short4 y = convert_short4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(short8 x)
 {
-  double8 r = convert_double8(x);
-  short8 y = convert_short8(r);
-  ushort8 abs_x = abs(x);
-  ushort8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(short8 x)
 {
   return convert_double8(x);
@@ -37647,39 +37371,31 @@ double8 convert_double8_rte(short8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(short8 x)
 {
-  double8 r = convert_double8(x);
-  short8 y = convert_short8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(short8 x)
 {
-  double8 r = convert_double8(x);
-  short8 y = convert_short8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(short16 x)
 {
-  double16 r = convert_double16(x);
-  short16 y = convert_short16(r);
-  ushort16 abs_x = abs(x);
-  ushort16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(short16 x)
 {
   return convert_double16(x);
@@ -37687,231 +37403,175 @@ double16 convert_double16_rte(short16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(short16 x)
 {
-  double16 r = convert_double16(x);
-  short16 y = convert_short16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(short16 x)
 {
-  double16 r = convert_double16(x);
-  short16 y = convert_short16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(ushort x)
 {
-  float r = convert_float(x);
-  ushort y = convert_ushort(r);
-  ushort abs_x = abs(x);
-  ushort abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(ushort x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(ushort x)
 {
-  float r = convert_float(x);
-  ushort y = convert_ushort(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(ushort x)
 {
-  float r = convert_float(x);
-  ushort y = convert_ushort(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(ushort2 x)
 {
-  float2 r = convert_float2(x);
-  ushort2 y = convert_ushort2(r);
-  ushort2 abs_x = abs(x);
-  ushort2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(ushort2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(ushort2 x)
 {
-  float2 r = convert_float2(x);
-  ushort2 y = convert_ushort2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(ushort2 x)
 {
-  float2 r = convert_float2(x);
-  ushort2 y = convert_ushort2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(ushort3 x)
 {
-  float3 r = convert_float3(x);
-  ushort3 y = convert_ushort3(r);
-  ushort3 abs_x = abs(x);
-  ushort3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(ushort3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(ushort3 x)
 {
-  float3 r = convert_float3(x);
-  ushort3 y = convert_ushort3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(ushort3 x)
 {
-  float3 r = convert_float3(x);
-  ushort3 y = convert_ushort3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(ushort4 x)
 {
-  float4 r = convert_float4(x);
-  ushort4 y = convert_ushort4(r);
-  ushort4 abs_x = abs(x);
-  ushort4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(ushort4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(ushort4 x)
 {
-  float4 r = convert_float4(x);
-  ushort4 y = convert_ushort4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(ushort4 x)
 {
-  float4 r = convert_float4(x);
-  ushort4 y = convert_ushort4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(ushort8 x)
 {
-  float8 r = convert_float8(x);
-  ushort8 y = convert_ushort8(r);
-  ushort8 abs_x = abs(x);
-  ushort8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(ushort8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(ushort8 x)
 {
-  float8 r = convert_float8(x);
-  ushort8 y = convert_ushort8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(ushort8 x)
 {
-  float8 r = convert_float8(x);
-  ushort8 y = convert_ushort8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(ushort16 x)
 {
-  float16 r = convert_float16(x);
-  ushort16 y = convert_ushort16(r);
-  ushort16 abs_x = abs(x);
-  ushort16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(ushort16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(ushort16 x)
 {
-  float16 r = convert_float16(x);
-  ushort16 y = convert_ushort16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(ushort16 x)
 {
-  float16 r = convert_float16(x);
-  ushort16 y = convert_ushort16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return convert_float16(x);
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(ushort x)
 {
-  double r = convert_double(x);
-  ushort y = convert_ushort(r);
-  ushort abs_x = abs(x);
-  ushort abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(ushort x)
 {
   return convert_double(x);
@@ -37919,39 +37579,31 @@ double convert_double_rte(ushort x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(ushort x)
 {
-  double r = convert_double(x);
-  ushort y = convert_ushort(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(ushort x)
 {
-  double r = convert_double(x);
-  ushort y = convert_ushort(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(ushort2 x)
 {
-  double2 r = convert_double2(x);
-  ushort2 y = convert_ushort2(r);
-  ushort2 abs_x = abs(x);
-  ushort2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(ushort2 x)
 {
   return convert_double2(x);
@@ -37959,39 +37611,31 @@ double2 convert_double2_rte(ushort2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(ushort2 x)
 {
-  double2 r = convert_double2(x);
-  ushort2 y = convert_ushort2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(ushort2 x)
 {
-  double2 r = convert_double2(x);
-  ushort2 y = convert_ushort2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(ushort3 x)
 {
-  double3 r = convert_double3(x);
-  ushort3 y = convert_ushort3(r);
-  ushort3 abs_x = abs(x);
-  ushort3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(ushort3 x)
 {
   return convert_double3(x);
@@ -37999,39 +37643,31 @@ double3 convert_double3_rte(ushort3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(ushort3 x)
 {
-  double3 r = convert_double3(x);
-  ushort3 y = convert_ushort3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(ushort3 x)
 {
-  double3 r = convert_double3(x);
-  ushort3 y = convert_ushort3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(ushort4 x)
 {
-  double4 r = convert_double4(x);
-  ushort4 y = convert_ushort4(r);
-  ushort4 abs_x = abs(x);
-  ushort4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(ushort4 x)
 {
   return convert_double4(x);
@@ -38039,39 +37675,31 @@ double4 convert_double4_rte(ushort4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(ushort4 x)
 {
-  double4 r = convert_double4(x);
-  ushort4 y = convert_ushort4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(ushort4 x)
 {
-  double4 r = convert_double4(x);
-  ushort4 y = convert_ushort4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(ushort8 x)
 {
-  double8 r = convert_double8(x);
-  ushort8 y = convert_ushort8(r);
-  ushort8 abs_x = abs(x);
-  ushort8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(ushort8 x)
 {
   return convert_double8(x);
@@ -38079,39 +37707,31 @@ double8 convert_double8_rte(ushort8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(ushort8 x)
 {
-  double8 r = convert_double8(x);
-  ushort8 y = convert_ushort8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(ushort8 x)
 {
-  double8 r = convert_double8(x);
-  ushort8 y = convert_ushort8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(ushort16 x)
 {
-  double16 r = convert_double16(x);
-  ushort16 y = convert_ushort16(r);
-  ushort16 abs_x = abs(x);
-  ushort16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(ushort16 x)
 {
   return convert_double16(x);
@@ -38119,231 +37739,241 @@ double16 convert_double16_rte(ushort16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(ushort16 x)
 {
-  double16 r = convert_double16(x);
-  ushort16 y = convert_ushort16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(ushort16 x)
 {
-  double16 r = convert_double16(x);
-  ushort16 y = convert_ushort16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(int x)
 {
   float r = convert_float(x);
-  int y = convert_int(r);
+  int y = convert_int_sat(r);
   uint abs_x = abs(x);
   uint abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return select(res, (float)(0x1.fffffep+30f), convert_int(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(int x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(int x)
 {
   float r = convert_float(x);
-  int y = convert_int(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  int y = convert_int_sat(r);
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(int x)
 {
   float r = convert_float(x);
-  int y = convert_int(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  int y = convert_int_sat(r);
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return select(res, (float)(0x1.fffffep+30f), convert_int(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(int2 x)
 {
   float2 r = convert_float2(x);
-  int2 y = convert_int2(r);
+  int2 y = convert_int2_sat(r);
   uint2 abs_x = abs(x);
   uint2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return select(res, (float2)(0x1.fffffep+30f), convert_int2(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(int2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(int2 x)
 {
   float2 r = convert_float2(x);
-  int2 y = convert_int2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  int2 y = convert_int2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(int2 x)
 {
   float2 r = convert_float2(x);
-  int2 y = convert_int2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  int2 y = convert_int2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return select(res, (float2)(0x1.fffffep+30f), convert_int2(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(int3 x)
 {
   float3 r = convert_float3(x);
-  int3 y = convert_int3(r);
+  int3 y = convert_int3_sat(r);
   uint3 abs_x = abs(x);
   uint3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return select(res, (float3)(0x1.fffffep+30f), convert_int3(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(int3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(int3 x)
 {
   float3 r = convert_float3(x);
-  int3 y = convert_int3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  int3 y = convert_int3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(int3 x)
 {
   float3 r = convert_float3(x);
-  int3 y = convert_int3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  int3 y = convert_int3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return select(res, (float3)(0x1.fffffep+30f), convert_int3(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(int4 x)
 {
   float4 r = convert_float4(x);
-  int4 y = convert_int4(r);
+  int4 y = convert_int4_sat(r);
   uint4 abs_x = abs(x);
   uint4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return select(res, (float4)(0x1.fffffep+30f), convert_int4(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(int4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(int4 x)
 {
   float4 r = convert_float4(x);
-  int4 y = convert_int4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  int4 y = convert_int4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(int4 x)
 {
   float4 r = convert_float4(x);
-  int4 y = convert_int4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  int4 y = convert_int4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return select(res, (float4)(0x1.fffffep+30f), convert_int4(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(int8 x)
 {
   float8 r = convert_float8(x);
-  int8 y = convert_int8(r);
+  int8 y = convert_int8_sat(r);
   uint8 abs_x = abs(x);
   uint8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return select(res, (float8)(0x1.fffffep+30f), convert_int8(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(int8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(int8 x)
 {
   float8 r = convert_float8(x);
-  int8 y = convert_int8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  int8 y = convert_int8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(int8 x)
 {
   float8 r = convert_float8(x);
-  int8 y = convert_int8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  int8 y = convert_int8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return select(res, (float8)(0x1.fffffep+30f), convert_int8(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(int16 x)
 {
   float16 r = convert_float16(x);
-  int16 y = convert_int16(r);
+  int16 y = convert_int16_sat(r);
   uint16 abs_x = abs(x);
   uint16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return select(res, (float16)(0x1.fffffep+30f), convert_int16(x >= 0x7ffffffc));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(int16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(int16 x)
 {
   float16 r = convert_float16(x);
-  int16 y = convert_int16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  int16 y = convert_int16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(int16 x)
 {
   float16 r = convert_float16(x);
-  int16 y = convert_int16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  int16 y = convert_int16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return select(res, (float16)(0x1.fffffep+30f), convert_int16(x >= 0x7ffffffc));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(int x)
 {
-  double r = convert_double(x);
-  int y = convert_int(r);
-  uint abs_x = abs(x);
-  uint abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(int x)
 {
   return convert_double(x);
@@ -38351,39 +37981,31 @@ double convert_double_rte(int x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(int x)
 {
-  double r = convert_double(x);
-  int y = convert_int(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(int x)
 {
-  double r = convert_double(x);
-  int y = convert_int(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(int2 x)
 {
-  double2 r = convert_double2(x);
-  int2 y = convert_int2(r);
-  uint2 abs_x = abs(x);
-  uint2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(int2 x)
 {
   return convert_double2(x);
@@ -38391,39 +38013,31 @@ double2 convert_double2_rte(int2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(int2 x)
 {
-  double2 r = convert_double2(x);
-  int2 y = convert_int2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(int2 x)
 {
-  double2 r = convert_double2(x);
-  int2 y = convert_int2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(int3 x)
 {
-  double3 r = convert_double3(x);
-  int3 y = convert_int3(r);
-  uint3 abs_x = abs(x);
-  uint3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(int3 x)
 {
   return convert_double3(x);
@@ -38431,39 +38045,31 @@ double3 convert_double3_rte(int3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(int3 x)
 {
-  double3 r = convert_double3(x);
-  int3 y = convert_int3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(int3 x)
 {
-  double3 r = convert_double3(x);
-  int3 y = convert_int3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(int4 x)
 {
-  double4 r = convert_double4(x);
-  int4 y = convert_int4(r);
-  uint4 abs_x = abs(x);
-  uint4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(int4 x)
 {
   return convert_double4(x);
@@ -38471,39 +38077,31 @@ double4 convert_double4_rte(int4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(int4 x)
 {
-  double4 r = convert_double4(x);
-  int4 y = convert_int4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(int4 x)
 {
-  double4 r = convert_double4(x);
-  int4 y = convert_int4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(int8 x)
 {
-  double8 r = convert_double8(x);
-  int8 y = convert_int8(r);
-  uint8 abs_x = abs(x);
-  uint8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(int8 x)
 {
   return convert_double8(x);
@@ -38511,39 +38109,31 @@ double8 convert_double8_rte(int8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(int8 x)
 {
-  double8 r = convert_double8(x);
-  int8 y = convert_int8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(int8 x)
 {
-  double8 r = convert_double8(x);
-  int8 y = convert_int8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(int16 x)
 {
-  double16 r = convert_double16(x);
-  int16 y = convert_int16(r);
-  uint16 abs_x = abs(x);
-  uint16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(int16 x)
 {
   return convert_double16(x);
@@ -38551,231 +38141,241 @@ double16 convert_double16_rte(int16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(int16 x)
 {
-  double16 r = convert_double16(x);
-  int16 y = convert_int16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(int16 x)
 {
-  double16 r = convert_double16(x);
-  int16 y = convert_int16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(uint x)
 {
   float r = convert_float(x);
-  uint y = convert_uint(r);
+  uint y = convert_uint_sat(r);
   uint abs_x = abs(x);
   uint abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return select(res, (float)(0x1.fffffep+31f), convert_int(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(uint x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(uint x)
 {
   float r = convert_float(x);
-  uint y = convert_uint(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  uint y = convert_uint_sat(r);
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(uint x)
 {
   float r = convert_float(x);
-  uint y = convert_uint(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  uint y = convert_uint_sat(r);
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return select(res, (float)(0x1.fffffep+31f), convert_int(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(uint2 x)
 {
   float2 r = convert_float2(x);
-  uint2 y = convert_uint2(r);
+  uint2 y = convert_uint2_sat(r);
   uint2 abs_x = abs(x);
   uint2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return select(res, (float2)(0x1.fffffep+31f), convert_int2(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(uint2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(uint2 x)
 {
   float2 r = convert_float2(x);
-  uint2 y = convert_uint2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  uint2 y = convert_uint2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(uint2 x)
 {
   float2 r = convert_float2(x);
-  uint2 y = convert_uint2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  uint2 y = convert_uint2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return select(res, (float2)(0x1.fffffep+31f), convert_int2(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(uint3 x)
 {
   float3 r = convert_float3(x);
-  uint3 y = convert_uint3(r);
+  uint3 y = convert_uint3_sat(r);
   uint3 abs_x = abs(x);
   uint3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return select(res, (float3)(0x1.fffffep+31f), convert_int3(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(uint3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(uint3 x)
 {
   float3 r = convert_float3(x);
-  uint3 y = convert_uint3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  uint3 y = convert_uint3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(uint3 x)
 {
   float3 r = convert_float3(x);
-  uint3 y = convert_uint3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  uint3 y = convert_uint3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return select(res, (float3)(0x1.fffffep+31f), convert_int3(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(uint4 x)
 {
   float4 r = convert_float4(x);
-  uint4 y = convert_uint4(r);
+  uint4 y = convert_uint4_sat(r);
   uint4 abs_x = abs(x);
   uint4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return select(res, (float4)(0x1.fffffep+31f), convert_int4(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(uint4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(uint4 x)
 {
   float4 r = convert_float4(x);
-  uint4 y = convert_uint4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  uint4 y = convert_uint4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(uint4 x)
 {
   float4 r = convert_float4(x);
-  uint4 y = convert_uint4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  uint4 y = convert_uint4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return select(res, (float4)(0x1.fffffep+31f), convert_int4(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(uint8 x)
 {
   float8 r = convert_float8(x);
-  uint8 y = convert_uint8(r);
+  uint8 y = convert_uint8_sat(r);
   uint8 abs_x = abs(x);
   uint8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return select(res, (float8)(0x1.fffffep+31f), convert_int8(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(uint8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(uint8 x)
 {
   float8 r = convert_float8(x);
-  uint8 y = convert_uint8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  uint8 y = convert_uint8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(uint8 x)
 {
   float8 r = convert_float8(x);
-  uint8 y = convert_uint8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  uint8 y = convert_uint8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return select(res, (float8)(0x1.fffffep+31f), convert_int8(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(uint16 x)
 {
   float16 r = convert_float16(x);
-  uint16 y = convert_uint16(r);
+  uint16 y = convert_uint16_sat(r);
   uint16 abs_x = abs(x);
   uint16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return select(res, (float16)(0x1.fffffep+31f), convert_int16(x >= 0xffffff80));
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(uint16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(uint16 x)
 {
   float16 r = convert_float16(x);
-  uint16 y = convert_uint16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  uint16 y = convert_uint16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(uint16 x)
 {
   float16 r = convert_float16(x);
-  uint16 y = convert_uint16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  uint16 y = convert_uint16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return select(res, (float16)(0x1.fffffep+31f), convert_int16(x >= 0xffffff80));
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(uint x)
 {
-  double r = convert_double(x);
-  uint y = convert_uint(r);
-  uint abs_x = abs(x);
-  uint abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(uint x)
 {
   return convert_double(x);
@@ -38783,39 +38383,31 @@ double convert_double_rte(uint x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(uint x)
 {
-  double r = convert_double(x);
-  uint y = convert_uint(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(uint x)
 {
-  double r = convert_double(x);
-  uint y = convert_uint(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(uint2 x)
 {
-  double2 r = convert_double2(x);
-  uint2 y = convert_uint2(r);
-  uint2 abs_x = abs(x);
-  uint2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(uint2 x)
 {
   return convert_double2(x);
@@ -38823,39 +38415,31 @@ double2 convert_double2_rte(uint2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(uint2 x)
 {
-  double2 r = convert_double2(x);
-  uint2 y = convert_uint2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(uint2 x)
 {
-  double2 r = convert_double2(x);
-  uint2 y = convert_uint2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(uint3 x)
 {
-  double3 r = convert_double3(x);
-  uint3 y = convert_uint3(r);
-  uint3 abs_x = abs(x);
-  uint3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(uint3 x)
 {
   return convert_double3(x);
@@ -38863,39 +38447,31 @@ double3 convert_double3_rte(uint3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(uint3 x)
 {
-  double3 r = convert_double3(x);
-  uint3 y = convert_uint3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(uint3 x)
 {
-  double3 r = convert_double3(x);
-  uint3 y = convert_uint3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(uint4 x)
 {
-  double4 r = convert_double4(x);
-  uint4 y = convert_uint4(r);
-  uint4 abs_x = abs(x);
-  uint4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(uint4 x)
 {
   return convert_double4(x);
@@ -38903,39 +38479,31 @@ double4 convert_double4_rte(uint4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(uint4 x)
 {
-  double4 r = convert_double4(x);
-  uint4 y = convert_uint4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(uint4 x)
 {
-  double4 r = convert_double4(x);
-  uint4 y = convert_uint4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(uint8 x)
 {
-  double8 r = convert_double8(x);
-  uint8 y = convert_uint8(r);
-  uint8 abs_x = abs(x);
-  uint8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(uint8 x)
 {
   return convert_double8(x);
@@ -38943,39 +38511,31 @@ double8 convert_double8_rte(uint8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(uint8 x)
 {
-  double8 r = convert_double8(x);
-  uint8 y = convert_uint8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(uint8 x)
 {
-  double8 r = convert_double8(x);
-  uint8 y = convert_uint8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(uint16 x)
 {
-  double16 r = convert_double16(x);
-  uint16 y = convert_uint16(r);
-  uint16 abs_x = abs(x);
-  uint16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(uint16 x)
 {
   return convert_double16(x);
@@ -38983,39 +38543,36 @@ double16 convert_double16_rte(uint16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(uint16 x)
 {
-  double16 r = convert_double16(x);
-  uint16 y = convert_uint16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(uint16 x)
 {
-  double16 r = convert_double16(x);
-  uint16 y = convert_uint16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(long x)
 {
   float r = convert_float(x);
-  long y = convert_long(r);
+  long y = convert_long_sat(r);
   ulong abs_x = abs(x);
   ulong abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return select(res, (float)(0x1.fffffep+62), convert_int(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(long x)
 {
   return convert_float(x);
@@ -39023,39 +38580,42 @@ float convert_float_rte(long x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(long x)
 {
   float r = convert_float(x);
-  long y = convert_long(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  long y = convert_long_sat(r);
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(long x)
 {
   float r = convert_float(x);
-  long y = convert_long(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  long y = convert_long_sat(r);
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return select(res, (float)(0x1.fffffep+62), convert_int(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(long2 x)
 {
   float2 r = convert_float2(x);
-  long2 y = convert_long2(r);
+  long2 y = convert_long2_sat(r);
   ulong2 abs_x = abs(x);
   ulong2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return select(res, (float2)(0x1.fffffep+62), convert_int2(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(long2 x)
 {
   return convert_float2(x);
@@ -39063,39 +38623,42 @@ float2 convert_float2_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(long2 x)
 {
   float2 r = convert_float2(x);
-  long2 y = convert_long2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  long2 y = convert_long2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(long2 x)
 {
   float2 r = convert_float2(x);
-  long2 y = convert_long2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  long2 y = convert_long2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return select(res, (float2)(0x1.fffffep+62), convert_int2(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(long3 x)
 {
   float3 r = convert_float3(x);
-  long3 y = convert_long3(r);
+  long3 y = convert_long3_sat(r);
   ulong3 abs_x = abs(x);
   ulong3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return select(res, (float3)(0x1.fffffep+62), convert_int3(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(long3 x)
 {
   return convert_float3(x);
@@ -39103,39 +38666,42 @@ float3 convert_float3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(long3 x)
 {
   float3 r = convert_float3(x);
-  long3 y = convert_long3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  long3 y = convert_long3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(long3 x)
 {
   float3 r = convert_float3(x);
-  long3 y = convert_long3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  long3 y = convert_long3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return select(res, (float3)(0x1.fffffep+62), convert_int3(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(long4 x)
 {
   float4 r = convert_float4(x);
-  long4 y = convert_long4(r);
+  long4 y = convert_long4_sat(r);
   ulong4 abs_x = abs(x);
   ulong4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return select(res, (float4)(0x1.fffffep+62), convert_int4(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(long4 x)
 {
   return convert_float4(x);
@@ -39143,39 +38709,42 @@ float4 convert_float4_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(long4 x)
 {
   float4 r = convert_float4(x);
-  long4 y = convert_long4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  long4 y = convert_long4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(long4 x)
 {
   float4 r = convert_float4(x);
-  long4 y = convert_long4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  long4 y = convert_long4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return select(res, (float4)(0x1.fffffep+62), convert_int4(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(long8 x)
 {
   float8 r = convert_float8(x);
-  long8 y = convert_long8(r);
+  long8 y = convert_long8_sat(r);
   ulong8 abs_x = abs(x);
   ulong8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return select(res, (float8)(0x1.fffffep+62), convert_int8(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(long8 x)
 {
   return convert_float8(x);
@@ -39183,39 +38752,42 @@ float8 convert_float8_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(long8 x)
 {
   float8 r = convert_float8(x);
-  long8 y = convert_long8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  long8 y = convert_long8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(long8 x)
 {
   float8 r = convert_float8(x);
-  long8 y = convert_long8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  long8 y = convert_long8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return select(res, (float8)(0x1.fffffep+62), convert_int8(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(long16 x)
 {
   float16 r = convert_float16(x);
-  long16 y = convert_long16(r);
+  long16 y = convert_long16_sat(r);
   ulong16 abs_x = abs(x);
   ulong16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return select(res, (float16)(0x1.fffffep+62), convert_int16(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(long16 x)
 {
   return convert_float16(x);
@@ -39223,39 +38795,42 @@ float16 convert_float16_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(long16 x)
 {
   float16 r = convert_float16(x);
-  long16 y = convert_long16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  long16 y = convert_long16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(long16 x)
 {
   float16 r = convert_float16(x);
-  long16 y = convert_long16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  long16 y = convert_long16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return select(res, (float16)(0x1.fffffep+62), convert_int16(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(long x)
 {
   double r = convert_double(x);
-  long y = convert_long(r);
+  long y = convert_long_sat(r);
   ulong abs_x = abs(x);
   ulong abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  double res = select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return select(res, (double)(0x1.fffffffffffffp+62), convert_long(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(long x)
 {
   return convert_double(x);
@@ -39263,39 +38838,42 @@ double convert_double_rte(long x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(long x)
 {
   double r = convert_double(x);
-  long y = convert_long(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  long y = convert_long_sat(r);
+  double res = select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(long x)
 {
   double r = convert_double(x);
-  long y = convert_long(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  long y = convert_long_sat(r);
+  double res = select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return select(res, (double)(0x1.fffffffffffffp+62), convert_long(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(long2 x)
 {
   double2 r = convert_double2(x);
-  long2 y = convert_long2(r);
+  long2 y = convert_long2_sat(r);
   ulong2 abs_x = abs(x);
   ulong2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  double2 res = select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return select(res, (double2)(0x1.fffffffffffffp+62), convert_long2(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(long2 x)
 {
   return convert_double2(x);
@@ -39303,39 +38881,42 @@ double2 convert_double2_rte(long2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(long2 x)
 {
   double2 r = convert_double2(x);
-  long2 y = convert_long2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  long2 y = convert_long2_sat(r);
+  double2 res = select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(long2 x)
 {
   double2 r = convert_double2(x);
-  long2 y = convert_long2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  long2 y = convert_long2_sat(r);
+  double2 res = select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return select(res, (double2)(0x1.fffffffffffffp+62), convert_long2(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(long3 x)
 {
   double3 r = convert_double3(x);
-  long3 y = convert_long3(r);
+  long3 y = convert_long3_sat(r);
   ulong3 abs_x = abs(x);
   ulong3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  double3 res = select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return select(res, (double3)(0x1.fffffffffffffp+62), convert_long3(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(long3 x)
 {
   return convert_double3(x);
@@ -39343,39 +38924,42 @@ double3 convert_double3_rte(long3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(long3 x)
 {
   double3 r = convert_double3(x);
-  long3 y = convert_long3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  long3 y = convert_long3_sat(r);
+  double3 res = select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(long3 x)
 {
   double3 r = convert_double3(x);
-  long3 y = convert_long3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  long3 y = convert_long3_sat(r);
+  double3 res = select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return select(res, (double3)(0x1.fffffffffffffp+62), convert_long3(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(long4 x)
 {
   double4 r = convert_double4(x);
-  long4 y = convert_long4(r);
+  long4 y = convert_long4_sat(r);
   ulong4 abs_x = abs(x);
   ulong4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  double4 res = select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return select(res, (double4)(0x1.fffffffffffffp+62), convert_long4(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(long4 x)
 {
   return convert_double4(x);
@@ -39383,39 +38967,42 @@ double4 convert_double4_rte(long4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(long4 x)
 {
   double4 r = convert_double4(x);
-  long4 y = convert_long4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  long4 y = convert_long4_sat(r);
+  double4 res = select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(long4 x)
 {
   double4 r = convert_double4(x);
-  long4 y = convert_long4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  long4 y = convert_long4_sat(r);
+  double4 res = select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return select(res, (double4)(0x1.fffffffffffffp+62), convert_long4(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(long8 x)
 {
   double8 r = convert_double8(x);
-  long8 y = convert_long8(r);
+  long8 y = convert_long8_sat(r);
   ulong8 abs_x = abs(x);
   ulong8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  double8 res = select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return select(res, (double8)(0x1.fffffffffffffp+62), convert_long8(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(long8 x)
 {
   return convert_double8(x);
@@ -39423,39 +39010,42 @@ double8 convert_double8_rte(long8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(long8 x)
 {
   double8 r = convert_double8(x);
-  long8 y = convert_long8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  long8 y = convert_long8_sat(r);
+  double8 res = select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(long8 x)
 {
   double8 r = convert_double8(x);
-  long8 y = convert_long8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  long8 y = convert_long8_sat(r);
+  double8 res = select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return select(res, (double8)(0x1.fffffffffffffp+62), convert_long8(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(long16 x)
 {
   double16 r = convert_double16(x);
-  long16 y = convert_long16(r);
+  long16 y = convert_long16_sat(r);
   ulong16 abs_x = abs(x);
   ulong16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  double16 res = select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return select(res, (double16)(0x1.fffffffffffffp+62), convert_long16(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(long16 x)
 {
   return convert_double16(x);
@@ -39463,39 +39053,42 @@ double16 convert_double16_rte(long16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(long16 x)
 {
   double16 r = convert_double16(x);
-  long16 y = convert_long16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  long16 y = convert_long16_sat(r);
+  double16 res = select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(long16 x)
 {
   double16 r = convert_double16(x);
-  long16 y = convert_long16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  long16 y = convert_long16_sat(r);
+  double16 res = select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return select(res, (double16)(0x1.fffffffffffffp+62), convert_long16(x >= 0x7fffffffffffffffL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(ulong x)
 {
   float r = convert_float(x);
-  ulong y = convert_ulong(r);
+  ulong y = convert_ulong_sat(r);
   ulong abs_x = abs(x);
   ulong abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return select(res, (float)(0x1.fffffep+63), convert_int(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(ulong x)
 {
   return convert_float(x);
@@ -39503,39 +39096,42 @@ float convert_float_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(ulong x)
 {
   float r = convert_float(x);
-  ulong y = convert_ulong(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  ulong y = convert_ulong_sat(r);
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(ulong x)
 {
   float r = convert_float(x);
-  ulong y = convert_ulong(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  ulong y = convert_ulong_sat(r);
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return select(res, (float)(0x1.fffffep+63), convert_int(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(ulong2 x)
 {
   float2 r = convert_float2(x);
-  ulong2 y = convert_ulong2(r);
+  ulong2 y = convert_ulong2_sat(r);
   ulong2 abs_x = abs(x);
   ulong2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return select(res, (float2)(0x1.fffffep+63), convert_int2(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(ulong2 x)
 {
   return convert_float2(x);
@@ -39543,39 +39139,42 @@ float2 convert_float2_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(ulong2 x)
 {
   float2 r = convert_float2(x);
-  ulong2 y = convert_ulong2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  ulong2 y = convert_ulong2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(ulong2 x)
 {
   float2 r = convert_float2(x);
-  ulong2 y = convert_ulong2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  ulong2 y = convert_ulong2_sat(r);
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return select(res, (float2)(0x1.fffffep+63), convert_int2(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(ulong3 x)
 {
   float3 r = convert_float3(x);
-  ulong3 y = convert_ulong3(r);
+  ulong3 y = convert_ulong3_sat(r);
   ulong3 abs_x = abs(x);
   ulong3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return select(res, (float3)(0x1.fffffep+63), convert_int3(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(ulong3 x)
 {
   return convert_float3(x);
@@ -39583,39 +39182,42 @@ float3 convert_float3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(ulong3 x)
 {
   float3 r = convert_float3(x);
-  ulong3 y = convert_ulong3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  ulong3 y = convert_ulong3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(ulong3 x)
 {
   float3 r = convert_float3(x);
-  ulong3 y = convert_ulong3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  ulong3 y = convert_ulong3_sat(r);
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return select(res, (float3)(0x1.fffffep+63), convert_int3(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(ulong4 x)
 {
   float4 r = convert_float4(x);
-  ulong4 y = convert_ulong4(r);
+  ulong4 y = convert_ulong4_sat(r);
   ulong4 abs_x = abs(x);
   ulong4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return select(res, (float4)(0x1.fffffep+63), convert_int4(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(ulong4 x)
 {
   return convert_float4(x);
@@ -39623,39 +39225,42 @@ float4 convert_float4_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(ulong4 x)
 {
   float4 r = convert_float4(x);
-  ulong4 y = convert_ulong4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  ulong4 y = convert_ulong4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(ulong4 x)
 {
   float4 r = convert_float4(x);
-  ulong4 y = convert_ulong4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  ulong4 y = convert_ulong4_sat(r);
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return select(res, (float4)(0x1.fffffep+63), convert_int4(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(ulong8 x)
 {
   float8 r = convert_float8(x);
-  ulong8 y = convert_ulong8(r);
+  ulong8 y = convert_ulong8_sat(r);
   ulong8 abs_x = abs(x);
   ulong8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return select(res, (float8)(0x1.fffffep+63), convert_int8(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(ulong8 x)
 {
   return convert_float8(x);
@@ -39663,39 +39268,42 @@ float8 convert_float8_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(ulong8 x)
 {
   float8 r = convert_float8(x);
-  ulong8 y = convert_ulong8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  ulong8 y = convert_ulong8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(ulong8 x)
 {
   float8 r = convert_float8(x);
-  ulong8 y = convert_ulong8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  ulong8 y = convert_ulong8_sat(r);
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return select(res, (float8)(0x1.fffffep+63), convert_int8(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(ulong16 x)
 {
   float16 r = convert_float16(x);
-  ulong16 y = convert_ulong16(r);
+  ulong16 y = convert_ulong16_sat(r);
   ulong16 abs_x = abs(x);
   ulong16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return select(res, (float16)(0x1.fffffep+63), convert_int16(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(ulong16 x)
 {
   return convert_float16(x);
@@ -39703,39 +39311,42 @@ float16 convert_float16_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(ulong16 x)
 {
   float16 r = convert_float16(x);
-  ulong16 y = convert_ulong16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  ulong16 y = convert_ulong16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(ulong16 x)
 {
   float16 r = convert_float16(x);
-  ulong16 y = convert_ulong16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  ulong16 y = convert_ulong16_sat(r);
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return select(res, (float16)(0x1.fffffep+63), convert_int16(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(ulong x)
 {
   double r = convert_double(x);
-  ulong y = convert_ulong(r);
+  ulong y = convert_ulong_sat(r);
   ulong abs_x = abs(x);
   ulong abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  double res = select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return select(res, (double)(0x1.fffffffffffffp+63), convert_long(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(ulong x)
 {
   return convert_double(x);
@@ -39743,39 +39354,42 @@ double convert_double_rte(ulong x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(ulong x)
 {
   double r = convert_double(x);
-  ulong y = convert_ulong(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  ulong y = convert_ulong_sat(r);
+  double res = select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(ulong x)
 {
   double r = convert_double(x);
-  ulong y = convert_ulong(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  ulong y = convert_ulong_sat(r);
+  double res = select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return select(res, (double)(0x1.fffffffffffffp+63), convert_long(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(ulong2 x)
 {
   double2 r = convert_double2(x);
-  ulong2 y = convert_ulong2(r);
+  ulong2 y = convert_ulong2_sat(r);
   ulong2 abs_x = abs(x);
   ulong2 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  double2 res = select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return select(res, (double2)(0x1.fffffffffffffp+63), convert_long2(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(ulong2 x)
 {
   return convert_double2(x);
@@ -39783,39 +39397,42 @@ double2 convert_double2_rte(ulong2 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(ulong2 x)
 {
   double2 r = convert_double2(x);
-  ulong2 y = convert_ulong2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  ulong2 y = convert_ulong2_sat(r);
+  double2 res = select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(ulong2 x)
 {
   double2 r = convert_double2(x);
-  ulong2 y = convert_ulong2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  ulong2 y = convert_ulong2_sat(r);
+  double2 res = select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return select(res, (double2)(0x1.fffffffffffffp+63), convert_long2(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(ulong3 x)
 {
   double3 r = convert_double3(x);
-  ulong3 y = convert_ulong3(r);
+  ulong3 y = convert_ulong3_sat(r);
   ulong3 abs_x = abs(x);
   ulong3 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  double3 res = select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return select(res, (double3)(0x1.fffffffffffffp+63), convert_long3(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(ulong3 x)
 {
   return convert_double3(x);
@@ -39823,39 +39440,42 @@ double3 convert_double3_rte(ulong3 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(ulong3 x)
 {
   double3 r = convert_double3(x);
-  ulong3 y = convert_ulong3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  ulong3 y = convert_ulong3_sat(r);
+  double3 res = select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(ulong3 x)
 {
   double3 r = convert_double3(x);
-  ulong3 y = convert_ulong3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  ulong3 y = convert_ulong3_sat(r);
+  double3 res = select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return select(res, (double3)(0x1.fffffffffffffp+63), convert_long3(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(ulong4 x)
 {
   double4 r = convert_double4(x);
-  ulong4 y = convert_ulong4(r);
+  ulong4 y = convert_ulong4_sat(r);
   ulong4 abs_x = abs(x);
   ulong4 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  double4 res = select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return select(res, (double4)(0x1.fffffffffffffp+63), convert_long4(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(ulong4 x)
 {
   return convert_double4(x);
@@ -39863,39 +39483,42 @@ double4 convert_double4_rte(ulong4 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(ulong4 x)
 {
   double4 r = convert_double4(x);
-  ulong4 y = convert_ulong4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  ulong4 y = convert_ulong4_sat(r);
+  double4 res = select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(ulong4 x)
 {
   double4 r = convert_double4(x);
-  ulong4 y = convert_ulong4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  ulong4 y = convert_ulong4_sat(r);
+  double4 res = select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return select(res, (double4)(0x1.fffffffffffffp+63), convert_long4(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(ulong8 x)
 {
   double8 r = convert_double8(x);
-  ulong8 y = convert_ulong8(r);
+  ulong8 y = convert_ulong8_sat(r);
   ulong8 abs_x = abs(x);
   ulong8 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  double8 res = select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return select(res, (double8)(0x1.fffffffffffffp+63), convert_long8(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(ulong8 x)
 {
   return convert_double8(x);
@@ -39903,39 +39526,42 @@ double8 convert_double8_rte(ulong8 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(ulong8 x)
 {
   double8 r = convert_double8(x);
-  ulong8 y = convert_ulong8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  ulong8 y = convert_ulong8_sat(r);
+  double8 res = select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(ulong8 x)
 {
   double8 r = convert_double8(x);
-  ulong8 y = convert_ulong8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  ulong8 y = convert_ulong8_sat(r);
+  double8 res = select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return select(res, (double8)(0x1.fffffffffffffp+63), convert_long8(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(ulong16 x)
 {
   double16 r = convert_double16(x);
-  ulong16 y = convert_ulong16(r);
+  ulong16 y = convert_ulong16_sat(r);
   ulong16 abs_x = abs(x);
   ulong16 abs_y = abs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  double16 res = select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return select(res, (double16)(0x1.fffffffffffffp+63), convert_long16(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(ulong16 x)
 {
   return convert_double16(x);
@@ -39943,39 +39569,42 @@ double16 convert_double16_rte(ulong16 x)
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(ulong16 x)
 {
   double16 r = convert_double16(x);
-  ulong16 y = convert_ulong16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  ulong16 y = convert_ulong16_sat(r);
+  double16 res = select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_int64) && defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(ulong16 x)
 {
   double16 r = convert_double16(x);
-  ulong16 y = convert_ulong16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  ulong16 y = convert_ulong16_sat(r);
+  double16 res = select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return select(res, (double16)(0x1.fffffffffffffp+63), convert_long16(x >= 0xfffffffffffffffeUL));
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(half x)
 {
   float r = convert_float(x);
   half y = convert_half(r);
   half abs_x = fabs(x);
   half abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(half x)
 {
   return convert_float(x);
@@ -39983,39 +39612,42 @@ float convert_float_rte(half x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(half x)
 {
   float r = convert_float(x);
   half y = convert_half(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(half x)
 {
   float r = convert_float(x);
   half y = convert_half(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(half2 x)
 {
   float2 r = convert_float2(x);
   half2 y = convert_half2(r);
   half2 abs_x = fabs(x);
   half2 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(half2 x)
 {
   return convert_float2(x);
@@ -40023,39 +39655,42 @@ float2 convert_float2_rte(half2 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(half2 x)
 {
   float2 r = convert_float2(x);
   half2 y = convert_half2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(half2 x)
 {
   float2 r = convert_float2(x);
   half2 y = convert_half2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(half3 x)
 {
   float3 r = convert_float3(x);
   half3 y = convert_half3(r);
   half3 abs_x = fabs(x);
   half3 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(half3 x)
 {
   return convert_float3(x);
@@ -40063,39 +39698,42 @@ float3 convert_float3_rte(half3 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(half3 x)
 {
   float3 r = convert_float3(x);
   half3 y = convert_half3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(half3 x)
 {
   float3 r = convert_float3(x);
   half3 y = convert_half3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(half4 x)
 {
   float4 r = convert_float4(x);
   half4 y = convert_half4(r);
   half4 abs_x = fabs(x);
   half4 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(half4 x)
 {
   return convert_float4(x);
@@ -40103,39 +39741,42 @@ float4 convert_float4_rte(half4 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(half4 x)
 {
   float4 r = convert_float4(x);
   half4 y = convert_half4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(half4 x)
 {
   float4 r = convert_float4(x);
   half4 y = convert_half4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(half8 x)
 {
   float8 r = convert_float8(x);
   half8 y = convert_half8(r);
   half8 abs_x = fabs(x);
   half8 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(half8 x)
 {
   return convert_float8(x);
@@ -40143,39 +39784,42 @@ float8 convert_float8_rte(half8 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(half8 x)
 {
   float8 r = convert_float8(x);
   half8 y = convert_half8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(half8 x)
 {
   float8 r = convert_float8(x);
   half8 y = convert_half8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(half16 x)
 {
   float16 r = convert_float16(x);
   half16 y = convert_half16(r);
   half16 abs_x = fabs(x);
   half16 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(half16 x)
 {
   return convert_float16(x);
@@ -40183,39 +39827,42 @@ float16 convert_float16_rte(half16 x)
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(half16 x)
 {
   float16 r = convert_float16(x);
   half16 y = convert_half16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(half16 x)
 {
   float16 r = convert_float16(x);
   half16 y = convert_half16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(half x)
 {
   double r = convert_double(x);
   half y = convert_half(r);
   half abs_x = fabs(x);
   half abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  double res = select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(half x)
 {
   return convert_double(x);
@@ -40223,39 +39870,42 @@ double convert_double_rte(half x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(half x)
 {
   double r = convert_double(x);
   half y = convert_half(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  double res = select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(half x)
 {
   double r = convert_double(x);
   half y = convert_half(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  double res = select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(half2 x)
 {
   double2 r = convert_double2(x);
   half2 y = convert_half2(r);
   half2 abs_x = fabs(x);
   half2 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  double2 res = select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(half2 x)
 {
   return convert_double2(x);
@@ -40263,39 +39913,42 @@ double2 convert_double2_rte(half2 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(half2 x)
 {
   double2 r = convert_double2(x);
   half2 y = convert_half2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  double2 res = select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(half2 x)
 {
   double2 r = convert_double2(x);
   half2 y = convert_half2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  double2 res = select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(half3 x)
 {
   double3 r = convert_double3(x);
   half3 y = convert_half3(r);
   half3 abs_x = fabs(x);
   half3 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  double3 res = select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(half3 x)
 {
   return convert_double3(x);
@@ -40303,39 +39956,42 @@ double3 convert_double3_rte(half3 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(half3 x)
 {
   double3 r = convert_double3(x);
   half3 y = convert_half3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  double3 res = select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(half3 x)
 {
   double3 r = convert_double3(x);
   half3 y = convert_half3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  double3 res = select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(half4 x)
 {
   double4 r = convert_double4(x);
   half4 y = convert_half4(r);
   half4 abs_x = fabs(x);
   half4 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  double4 res = select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(half4 x)
 {
   return convert_double4(x);
@@ -40343,39 +39999,42 @@ double4 convert_double4_rte(half4 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(half4 x)
 {
   double4 r = convert_double4(x);
   half4 y = convert_half4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  double4 res = select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(half4 x)
 {
   double4 r = convert_double4(x);
   half4 y = convert_half4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  double4 res = select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(half8 x)
 {
   double8 r = convert_double8(x);
   half8 y = convert_half8(r);
   half8 abs_x = fabs(x);
   half8 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  double8 res = select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(half8 x)
 {
   return convert_double8(x);
@@ -40383,39 +40042,42 @@ double8 convert_double8_rte(half8 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(half8 x)
 {
   double8 r = convert_double8(x);
   half8 y = convert_half8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  double8 res = select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(half8 x)
 {
   double8 r = convert_double8(x);
   half8 y = convert_half8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  double8 res = select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(half16 x)
 {
   double16 r = convert_double16(x);
   half16 y = convert_half16(r);
   half16 abs_x = fabs(x);
   half16 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  double16 res = select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(half16 x)
 {
   return convert_double16(x);
@@ -40423,231 +40085,247 @@ double16 convert_double16_rte(half16 x)
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(half16 x)
 {
   double16 r = convert_double16(x);
   half16 y = convert_half16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  double16 res = select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64) && defined(cl_khr_fp16)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(half16 x)
 {
   double16 r = convert_double16(x);
   half16 y = convert_half16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  double16 res = select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return res;
 }
 #endif
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(float x)
 {
   float r = convert_float(x);
   float y = convert_float(r);
   float abs_x = fabs(x);
   float abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(float x)
 {
   return convert_float(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(float x)
 {
   float r = convert_float(x);
   float y = convert_float(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(float x)
 {
   float r = convert_float(x);
   float y = convert_float(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(float2 x)
 {
   float2 r = convert_float2(x);
   float2 y = convert_float2(r);
   float2 abs_x = fabs(x);
   float2 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(float2 x)
 {
   return convert_float2(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(float2 x)
 {
   float2 r = convert_float2(x);
   float2 y = convert_float2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(float2 x)
 {
   float2 r = convert_float2(x);
   float2 y = convert_float2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(float3 x)
 {
   float3 r = convert_float3(x);
   float3 y = convert_float3(r);
   float3 abs_x = fabs(x);
   float3 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(float3 x)
 {
   return convert_float3(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(float3 x)
 {
   float3 r = convert_float3(x);
   float3 y = convert_float3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(float3 x)
 {
   float3 r = convert_float3(x);
   float3 y = convert_float3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(float4 x)
 {
   float4 r = convert_float4(x);
   float4 y = convert_float4(r);
   float4 abs_x = fabs(x);
   float4 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(float4 x)
 {
   return convert_float4(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(float4 x)
 {
   float4 r = convert_float4(x);
   float4 y = convert_float4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(float4 x)
 {
   float4 r = convert_float4(x);
   float4 y = convert_float4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(float8 x)
 {
   float8 r = convert_float8(x);
   float8 y = convert_float8(r);
   float8 abs_x = fabs(x);
   float8 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(float8 x)
 {
   return convert_float8(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(float8 x)
 {
   float8 r = convert_float8(x);
   float8 y = convert_float8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(float8 x)
 {
   float8 r = convert_float8(x);
   float8 y = convert_float8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(float16 x)
 {
   float16 r = convert_float16(x);
   float16 y = convert_float16(r);
   float16 abs_x = fabs(x);
   float16 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(float16 x)
 {
   return convert_float16(x);
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(float16 x)
 {
   float16 r = convert_float16(x);
   float16 y = convert_float16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(float16 x)
 {
   float16 r = convert_float16(x);
   float16 y = convert_float16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return res;
 }
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(float x)
 {
-  double r = convert_double(x);
-  float y = convert_float(r);
-  float abs_x = fabs(x);
-  float abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(float x)
 {
   return convert_double(x);
@@ -40655,39 +40333,31 @@ double convert_double_rte(float x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(float x)
 {
-  double r = convert_double(x);
-  float y = convert_float(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(float x)
 {
-  double r = convert_double(x);
-  float y = convert_float(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return convert_double(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(float2 x)
 {
-  double2 r = convert_double2(x);
-  float2 y = convert_float2(r);
-  float2 abs_x = fabs(x);
-  float2 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(float2 x)
 {
   return convert_double2(x);
@@ -40695,39 +40365,31 @@ double2 convert_double2_rte(float2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(float2 x)
 {
-  double2 r = convert_double2(x);
-  float2 y = convert_float2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(float2 x)
 {
-  double2 r = convert_double2(x);
-  float2 y = convert_float2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return convert_double2(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(float3 x)
 {
-  double3 r = convert_double3(x);
-  float3 y = convert_float3(r);
-  float3 abs_x = fabs(x);
-  float3 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(float3 x)
 {
   return convert_double3(x);
@@ -40735,39 +40397,31 @@ double3 convert_double3_rte(float3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(float3 x)
 {
-  double3 r = convert_double3(x);
-  float3 y = convert_float3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(float3 x)
 {
-  double3 r = convert_double3(x);
-  float3 y = convert_float3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return convert_double3(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(float4 x)
 {
-  double4 r = convert_double4(x);
-  float4 y = convert_float4(r);
-  float4 abs_x = fabs(x);
-  float4 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(float4 x)
 {
   return convert_double4(x);
@@ -40775,39 +40429,31 @@ double4 convert_double4_rte(float4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(float4 x)
 {
-  double4 r = convert_double4(x);
-  float4 y = convert_float4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(float4 x)
 {
-  double4 r = convert_double4(x);
-  float4 y = convert_float4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return convert_double4(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(float8 x)
 {
-  double8 r = convert_double8(x);
-  float8 y = convert_float8(r);
-  float8 abs_x = fabs(x);
-  float8 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(float8 x)
 {
   return convert_double8(x);
@@ -40815,39 +40461,31 @@ double8 convert_double8_rte(float8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(float8 x)
 {
-  double8 r = convert_double8(x);
-  float8 y = convert_float8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(float8 x)
 {
-  double8 r = convert_double8(x);
-  float8 y = convert_float8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return convert_double8(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(float16 x)
 {
-  double16 r = convert_double16(x);
-  float16 y = convert_float16(r);
-  float16 abs_x = fabs(x);
-  float16 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(float16 x)
 {
   return convert_double16(x);
@@ -40855,39 +40493,36 @@ double16 convert_double16_rte(float16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(float16 x)
 {
-  double16 r = convert_double16(x);
-  float16 y = convert_float16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(float16 x)
 {
-  double16 r = convert_double16(x);
-  float16 y = convert_float16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return convert_double16(x);
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtz(double x)
 {
   float r = convert_float(x);
   double y = convert_double(r);
   double abs_x = fabs(x);
   double abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  float res = select(r, nextafter(r, sign(r) * (float)-INFINITY), convert_int(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rte(double x)
 {
   return convert_float(x);
@@ -40895,39 +40530,42 @@ float convert_float_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtp(double x)
 {
   float r = convert_float(x);
   double y = convert_double(r);
-  return select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  float res = select(r, nextafter(r, (float)INFINITY), convert_int(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float convert_float_rtn(double x)
 {
   float r = convert_float(x);
   double y = convert_double(r);
-  return select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  float res = select(r, nextafter(r, (float)-INFINITY), convert_int(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtz(double2 x)
 {
   float2 r = convert_float2(x);
   double2 y = convert_double2(r);
   double2 abs_x = fabs(x);
   double2 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  float2 res = select(r, nextafter(r, sign(r) * (float2)-INFINITY), convert_int2(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rte(double2 x)
 {
   return convert_float2(x);
@@ -40935,39 +40573,42 @@ float2 convert_float2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtp(double2 x)
 {
   float2 r = convert_float2(x);
   double2 y = convert_double2(r);
-  return select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  float2 res = select(r, nextafter(r, (float2)INFINITY), convert_int2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float2 convert_float2_rtn(double2 x)
 {
   float2 r = convert_float2(x);
   double2 y = convert_double2(r);
-  return select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  float2 res = select(r, nextafter(r, (float2)-INFINITY), convert_int2(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtz(double3 x)
 {
   float3 r = convert_float3(x);
   double3 y = convert_double3(r);
   double3 abs_x = fabs(x);
   double3 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  float3 res = select(r, nextafter(r, sign(r) * (float3)-INFINITY), convert_int3(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rte(double3 x)
 {
   return convert_float3(x);
@@ -40975,39 +40616,42 @@ float3 convert_float3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtp(double3 x)
 {
   float3 r = convert_float3(x);
   double3 y = convert_double3(r);
-  return select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  float3 res = select(r, nextafter(r, (float3)INFINITY), convert_int3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float3 convert_float3_rtn(double3 x)
 {
   float3 r = convert_float3(x);
   double3 y = convert_double3(r);
-  return select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  float3 res = select(r, nextafter(r, (float3)-INFINITY), convert_int3(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtz(double4 x)
 {
   float4 r = convert_float4(x);
   double4 y = convert_double4(r);
   double4 abs_x = fabs(x);
   double4 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  float4 res = select(r, nextafter(r, sign(r) * (float4)-INFINITY), convert_int4(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rte(double4 x)
 {
   return convert_float4(x);
@@ -41015,39 +40659,42 @@ float4 convert_float4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtp(double4 x)
 {
   float4 r = convert_float4(x);
   double4 y = convert_double4(r);
-  return select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  float4 res = select(r, nextafter(r, (float4)INFINITY), convert_int4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float4 convert_float4_rtn(double4 x)
 {
   float4 r = convert_float4(x);
   double4 y = convert_double4(r);
-  return select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  float4 res = select(r, nextafter(r, (float4)-INFINITY), convert_int4(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtz(double8 x)
 {
   float8 r = convert_float8(x);
   double8 y = convert_double8(r);
   double8 abs_x = fabs(x);
   double8 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  float8 res = select(r, nextafter(r, sign(r) * (float8)-INFINITY), convert_int8(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rte(double8 x)
 {
   return convert_float8(x);
@@ -41055,39 +40702,42 @@ float8 convert_float8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtp(double8 x)
 {
   float8 r = convert_float8(x);
   double8 y = convert_double8(r);
-  return select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  float8 res = select(r, nextafter(r, (float8)INFINITY), convert_int8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float8 convert_float8_rtn(double8 x)
 {
   float8 r = convert_float8(x);
   double8 y = convert_double8(r);
-  return select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  float8 res = select(r, nextafter(r, (float8)-INFINITY), convert_int8(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtz(double16 x)
 {
   float16 r = convert_float16(x);
   double16 y = convert_double16(r);
   double16 abs_x = fabs(x);
   double16 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  float16 res = select(r, nextafter(r, sign(r) * (float16)-INFINITY), convert_int16(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rte(double16 x)
 {
   return convert_float16(x);
@@ -41095,39 +40745,42 @@ float16 convert_float16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtp(double16 x)
 {
   float16 r = convert_float16(x);
   double16 y = convert_double16(r);
-  return select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  float16 res = select(r, nextafter(r, (float16)INFINITY), convert_int16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 float16 convert_float16_rtn(double16 x)
 {
   float16 r = convert_float16(x);
   double16 y = convert_double16(r);
-  return select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  float16 res = select(r, nextafter(r, (float16)-INFINITY), convert_int16(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtz(double x)
 {
   double r = convert_double(x);
   double y = convert_double(r);
   double abs_x = fabs(x);
   double abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  double res = select(r, nextafter(r, sign(r) * (double)-INFINITY), convert_long(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rte(double x)
 {
   return convert_double(x);
@@ -41135,39 +40788,42 @@ double convert_double_rte(double x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtp(double x)
 {
   double r = convert_double(x);
   double y = convert_double(r);
-  return select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  double res = select(r, nextafter(r, (double)INFINITY), convert_long(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double convert_double_rtn(double x)
 {
   double r = convert_double(x);
   double y = convert_double(r);
-  return select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  double res = select(r, nextafter(r, (double)-INFINITY), convert_long(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtz(double2 x)
 {
   double2 r = convert_double2(x);
   double2 y = convert_double2(r);
   double2 abs_x = fabs(x);
   double2 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  double2 res = select(r, nextafter(r, sign(r) * (double2)-INFINITY), convert_long2(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rte(double2 x)
 {
   return convert_double2(x);
@@ -41175,39 +40831,42 @@ double2 convert_double2_rte(double2 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtp(double2 x)
 {
   double2 r = convert_double2(x);
   double2 y = convert_double2(r);
-  return select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  double2 res = select(r, nextafter(r, (double2)INFINITY), convert_long2(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double2 convert_double2_rtn(double2 x)
 {
   double2 r = convert_double2(x);
   double2 y = convert_double2(r);
-  return select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  double2 res = select(r, nextafter(r, (double2)-INFINITY), convert_long2(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtz(double3 x)
 {
   double3 r = convert_double3(x);
   double3 y = convert_double3(r);
   double3 abs_x = fabs(x);
   double3 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  double3 res = select(r, nextafter(r, sign(r) * (double3)-INFINITY), convert_long3(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rte(double3 x)
 {
   return convert_double3(x);
@@ -41215,39 +40874,42 @@ double3 convert_double3_rte(double3 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtp(double3 x)
 {
   double3 r = convert_double3(x);
   double3 y = convert_double3(r);
-  return select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  double3 res = select(r, nextafter(r, (double3)INFINITY), convert_long3(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double3 convert_double3_rtn(double3 x)
 {
   double3 r = convert_double3(x);
   double3 y = convert_double3(r);
-  return select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  double3 res = select(r, nextafter(r, (double3)-INFINITY), convert_long3(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtz(double4 x)
 {
   double4 r = convert_double4(x);
   double4 y = convert_double4(r);
   double4 abs_x = fabs(x);
   double4 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  double4 res = select(r, nextafter(r, sign(r) * (double4)-INFINITY), convert_long4(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rte(double4 x)
 {
   return convert_double4(x);
@@ -41255,39 +40917,42 @@ double4 convert_double4_rte(double4 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtp(double4 x)
 {
   double4 r = convert_double4(x);
   double4 y = convert_double4(r);
-  return select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  double4 res = select(r, nextafter(r, (double4)INFINITY), convert_long4(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double4 convert_double4_rtn(double4 x)
 {
   double4 r = convert_double4(x);
   double4 y = convert_double4(r);
-  return select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  double4 res = select(r, nextafter(r, (double4)-INFINITY), convert_long4(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtz(double8 x)
 {
   double8 r = convert_double8(x);
   double8 y = convert_double8(r);
   double8 abs_x = fabs(x);
   double8 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  double8 res = select(r, nextafter(r, sign(r) * (double8)-INFINITY), convert_long8(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rte(double8 x)
 {
   return convert_double8(x);
@@ -41295,39 +40960,42 @@ double8 convert_double8_rte(double8 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtp(double8 x)
 {
   double8 r = convert_double8(x);
   double8 y = convert_double8(r);
-  return select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  double8 res = select(r, nextafter(r, (double8)INFINITY), convert_long8(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double8 convert_double8_rtn(double8 x)
 {
   double8 r = convert_double8(x);
   double8 y = convert_double8(r);
-  return select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  double8 res = select(r, nextafter(r, (double8)-INFINITY), convert_long8(y > x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtz(double16 x)
 {
   double16 r = convert_double16(x);
   double16 y = convert_double16(r);
   double16 abs_x = fabs(x);
   double16 abs_y = fabs(y);
-  return select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  double16 res = select(r, nextafter(r, sign(r) * (double16)-INFINITY), convert_long16(abs_y > abs_x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rte(double16 x)
 {
   return convert_double16(x);
@@ -41335,21 +41003,23 @@ double16 convert_double16_rte(double16 x)
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtp(double16 x)
 {
   double16 r = convert_double16(x);
   double16 y = convert_double16(r);
-  return select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  double16 res = select(r, nextafter(r, (double16)INFINITY), convert_long16(y < x));
+  return res;
 }
 #endif
 
 #if defined(cl_khr_fp64)
-_CL_ALWAYSINLINE _CL_OVERLOADABLE
+_CL_ALWAYSINLINE _CL_OVERLOADABLE _CL_READNONE
 double16 convert_double16_rtn(double16 x)
 {
   double16 r = convert_double16(x);
   double16 y = convert_double16(r);
-  return select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  double16 res = select(r, nextafter(r, (double16)-INFINITY), convert_long16(y > x));
+  return res;
 }
 #endif
diff --git a/lib/kernel/hsail64/CMakeLists.txt b/lib/kernel/cuda/CMakeLists.txt
similarity index 52%
copy from lib/kernel/hsail64/CMakeLists.txt
copy to lib/kernel/cuda/CMakeLists.txt
index 15c51e6..a97979a 100644
--- a/lib/kernel/hsail64/CMakeLists.txt
+++ b/lib/kernel/cuda/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2015 pocl developers
+#   Copyright (c) 2016 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -27,61 +27,62 @@ include("bitcode_rules")
 
 set(KERNEL_SOURCES ${SOURCES_WITHOUT_VML})
 
-foreach(FILE printf.c barrier.ll get_image_depth.cl get_image_dim.cl
-  get_image_height.cl get_image_width.cl read_image.cl write_image.cl)
+foreach(FILE atomics.cl)
   list(REMOVE_ITEM KERNEL_SOURCES "${FILE}")
 endforeach()
 
-list(APPEND KERNEL_SOURCES svm_atomics.cl)
-
-foreach(FILE atomics.cl atomic_impl.ll barrier.c
-        get_global_id.c get_local_id.c get_group_id.c
-        get_global_size.c get_local_size.c get_global_offset.c
-        get_num_groups.c get_work_dim.c
-        native_cos.cl native_exp.cl native_exp10.cl native_exp2.cl
-        native_log.cl native_log10.cl native_log2.cl native_recip.cl
-        native_rsqrt.cl  native_sin.cl  native_sqrt.cl
-        fabs.cl floor.cl rint.cl trunc.cl remainder.cl
-        fma.cl mad.cl mad_hi.cl mul_hi.cl mul24.cl mad24.cl
-        sqrt.cl sqrt_default.ll cbrt.cl hypot.cl length.cl copysign.cl
-        exp.cl exp2.cl exp10.cl expm1.cl
-        log.cl log2.cl log10.cl log1p.cl
-        sin.cl tan.cl cos.cl sinh.cl tanh.cl cosh.cl
-        asin.cl acos.cl atan.cl asinh.cl acosh.cl atanh.cl
-        ilogb.cl ldexp.cl fract.cl frexp.cl atan2.cl pow.cl
-        lgamma.cl tgamma.cl erf.cl erfc.cl fast_normalize.cl fast_length.cl
-        svm_atomics_hsail.cl.ll)
-
+foreach(FILE
+  atomic_add.ll atomic_and.ll atomic_cmpxchg.ll atomic_dec.ll atomic_inc.ll
+  atomic_min.ll atomic_max.ll atomic_or.ll atomic_sub.ll atomic_xchg.ll
+  atomic_xor.ll barrier.ll
+  get_global_id.c get_global_size.c get_group_id.c
+  get_local_id.c get_local_size.c get_num_groups.c
+  get_global_offset.c
+  printf.c
+  )
   list(REMOVE_ITEM KERNEL_SOURCES "${FILE}")
-  list(APPEND KERNEL_SOURCES "hsail64/${FILE}")
+  list(APPEND KERNEL_SOURCES "cuda/${FILE}")
 endforeach()
 
-set(CLANG_FLAGS "-emit-llvm" "-target" "hsail64" "-D_CL_DISABLE_HALF")
+list(APPEND KERNEL_SOURCES "cuda/nvvm_functions.ll")
+
+# Select either NVPTX or NVPTX64
+if( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+  set(LLVM_TARGET nvptx64)
+else( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+  set(LLVM_TARGET nvptx)
+endif( CMAKE_SIZEOF_VOID_P EQUAL 8 )
+
+
+set(CLANG_FLAGS "-emit-llvm" "-target" "${LLVM_TARGET}" "-D_CL_DISABLE_HALF")
 
 if(POCL_USE_FAKE_ADDR_SPACE_IDS)
-list(APPEND CLANG_FLAGS "-Xclang" "-ffake-address-space-map")
+  list(APPEND CLANG_FLAGS "-Xclang" "-ffake-address-space-map" "-DPOCL_USE_FAKE_ADDR_SPACE_IDS")
 endif()
 
+# Enable all extensions
+set(KERNEL_CL_FLAGS "-Xclang" "-cl-std=CL${CUDA_DEVICE_CL_STD}" "-D__OPENCL_C_VERSION__=${CUDA_DEVICE_CL_VERSION}" "-Xclang" "-cl-ext=all" ${KERNEL_CL_FLAGS})
+
 set(LLC_FLAGS "")
-set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HSA_DEVICE_CL_VERSION}")
-separate_arguments(HSA_DEVICE_EXTENSIONS)
-foreach(EXT ${HSA_DEVICE_EXTENSIONS})
+set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${CUDA_DEVICE_CL_VERSION} -Dcl_khr_int64")
+separate_arguments(CUDA_DEVICE_EXTENSIONS)
+foreach(EXT ${CUDA_DEVICE_EXTENSIONS})
   set(DEVICE_CL_FLAGS "${DEVICE_CL_FLAGS} -D${EXT}")
 endforeach()
 separate_arguments(DEVICE_CL_FLAGS)
 
-make_kernel_bc(KERNEL_BC "hsail64" "BCs" ${KERNEL_SOURCES})
+make_kernel_bc(KERNEL_BC "${LLVM_TARGET}" "BCs" 0 0 0 ${KERNEL_SOURCES})
 
 # just debug
-message(STATUS "HSAIL64 Kernel BC: ${KERNEL_BC}")
+message(STATUS "${LLVM_TARGET} Kernel BC: ${KERNEL_BC}")
 
 list(APPEND KERNEL_BC_LIST "${KERNEL_BC}")
 set(KERNEL_BC_LIST "${KERNEL_BC_LIST}" PARENT_SCOPE)
 
 # a target is needed...
-add_custom_target("kernel_hsail64" DEPENDS ${KERNEL_BC})
+add_custom_target("kernel_${LLVM_TARGET}" DEPENDS ${KERNEL_BC})
 
-list(APPEND KERNEL_TARGET_LIST "kernel_hsail64")
+list(APPEND KERNEL_TARGET_LIST "kernel_${LLVM_TARGET}")
 set(KERNEL_TARGET_LIST "${KERNEL_TARGET_LIST}" PARENT_SCOPE)
 
 install(FILES "${KERNEL_BC}"
diff --git a/lib/kernel/cuda/atomic_add.ll b/lib/kernel/cuda/atomic_add.ll
new file mode 100644
index 0000000..c343198
--- /dev/null
+++ b/lib/kernel/cuda/atomic_add.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_addPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw add i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_addPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw add i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_addPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw add i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_addPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw add i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_addPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw add i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_addPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw add i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_addPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw add i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_addPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw add i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_and.ll b/lib/kernel/cuda/atomic_and.ll
new file mode 100644
index 0000000..20ecdef
--- /dev/null
+++ b/lib/kernel/cuda/atomic_and.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_andPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw and i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_andPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw and i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_andPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw and i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_andPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw and i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_andPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw and i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_andPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw and i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_andPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw and i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_andPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw and i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_cmpxchg.ll b/lib/kernel/cuda/atomic_cmpxchg.ll
new file mode 100644
index 0000000..963399d
--- /dev/null
+++ b/lib/kernel/cuda/atomic_cmpxchg.ll
@@ -0,0 +1,55 @@
+define i32 @_Z14atomic_cmpxchgPU3AS1Viii(i32 addrspace(1)* %ptr, i32 %cmp, i32 %val) {
+entry:
+  %0 = cmpxchg i32 addrspace(1)* %ptr, i32 %cmp, i32 %val monotonic monotonic
+  %1 = extractvalue {i32, i1} %0, 0
+  ret i32 %1
+}
+
+define i32 @_Z14atomic_cmpxchgPU3AS1Vjjj(i32 addrspace(1)* %ptr, i32 %cmp, i32 %val) {
+entry:
+  %0 = cmpxchg i32 addrspace(1)* %ptr, i32 %cmp, i32 %val monotonic monotonic
+  %1 = extractvalue {i32, i1} %0, 0
+  ret i32 %1
+}
+
+define i32 @_Z14atomic_cmpxchgPU3AS3Viii(i32 addrspace(3)* %ptr, i32 %cmp, i32 %val) {
+entry:
+  %0 = cmpxchg i32 addrspace(3)* %ptr, i32 %cmp, i32 %val monotonic monotonic
+  %1 = extractvalue {i32, i1} %0, 0
+  ret i32 %1
+}
+
+define i32 @_Z14atomic_cmpxchgPU3AS3Vjjj(i32 addrspace(3)* %ptr, i32 %cmp, i32 %val) {
+entry:
+  %0 = cmpxchg i32 addrspace(3)* %ptr, i32 %cmp, i32 %val monotonic monotonic
+  %1 = extractvalue {i32, i1} %0, 0
+  ret i32 %1
+}
+
+define i64 @_Z14atomic_cmpxchgPU3AS1Vlll(i64 addrspace(1)* %ptr, i64 %cmp, i64 %val) {
+entry:
+  %0 = cmpxchg i64 addrspace(1)* %ptr, i64 %cmp, i64 %val monotonic monotonic
+  %1 = extractvalue {i64, i1} %0, 0
+  ret i64 %1
+}
+
+define i64 @_Z14atomic_cmpxchgPU3AS1Vmmm(i64 addrspace(1)* %ptr, i64 %cmp, i64 %val) {
+entry:
+  %0 = cmpxchg i64 addrspace(1)* %ptr, i64 %cmp, i64 %val monotonic monotonic
+  %1 = extractvalue {i64, i1} %0, 0
+  ret i64 %1
+}
+
+define i64 @_Z14atomic_cmpxchgPU3AS3Vlll(i64 addrspace(3)* %ptr, i64 %cmp, i64 %val) {
+entry:
+  %0 = cmpxchg i64 addrspace(3)* %ptr, i64 %cmp, i64 %val monotonic monotonic
+  %1 = extractvalue {i64, i1} %0, 0
+  ret i64 %1
+}
+
+define i64 @_Z14atomic_cmpxchgPU3AS3Vmmm(i64 addrspace(3)* %ptr, i64 %cmp, i64 %val) {
+entry:
+  %0 = cmpxchg i64 addrspace(3)* %ptr, i64 %cmp, i64 %val monotonic monotonic
+  %1 = extractvalue {i64, i1} %0, 0
+  ret i64 %1
+}
diff --git a/lib/kernel/cuda/atomic_dec.ll b/lib/kernel/cuda/atomic_dec.ll
new file mode 100644
index 0000000..fd09842
--- /dev/null
+++ b/lib/kernel/cuda/atomic_dec.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_decPU3AS1Vi(i32 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(1)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_decPU3AS1Vj(i32 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(1)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_decPU3AS3Vi(i32 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_decPU3AS3Vj(i32 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_decPU3AS1Vl(i64 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(1)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_decPU3AS1Vm(i64 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(1)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_decPU3AS3Vl(i64 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_decPU3AS3Vm(i64 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_inc.ll b/lib/kernel/cuda/atomic_inc.ll
new file mode 100644
index 0000000..cbf9af1
--- /dev/null
+++ b/lib/kernel/cuda/atomic_inc.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_incPU3AS1Vi(i32 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw add i32 addrspace(1)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_incPU3AS1Vj(i32 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw add i32 addrspace(1)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_incPU3AS3Vi(i32 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw add i32 addrspace(3)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_incPU3AS3Vj(i32 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw add i32 addrspace(3)* %ptr, i32 1 monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_incPU3AS1Vl(i64 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw add i64 addrspace(1)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_incPU3AS1Vm(i64 addrspace(1)* %ptr) {
+entry:
+  %0 = atomicrmw add i64 addrspace(1)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_incPU3AS3Vl(i64 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw add i64 addrspace(3)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_incPU3AS3Vm(i64 addrspace(3)* %ptr) {
+entry:
+  %0 = atomicrmw add i64 addrspace(3)* %ptr, i64 1 monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_max.ll b/lib/kernel/cuda/atomic_max.ll
new file mode 100644
index 0000000..256a9b9
--- /dev/null
+++ b/lib/kernel/cuda/atomic_max.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_maxPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw max i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_maxPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw max i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_maxPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw max i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_maxPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw max i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_maxPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw max i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_maxPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw max i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_maxPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw max i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_maxPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw max i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_min.ll b/lib/kernel/cuda/atomic_min.ll
new file mode 100644
index 0000000..a387273
--- /dev/null
+++ b/lib/kernel/cuda/atomic_min.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_minPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw min i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_minPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw umin i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_minPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw min i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_minPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw umin i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_minPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw min i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_minPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw umin i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_minPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw min i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_minPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw umin i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_or.ll b/lib/kernel/cuda/atomic_or.ll
new file mode 100644
index 0000000..2a8d6f2
--- /dev/null
+++ b/lib/kernel/cuda/atomic_or.ll
@@ -0,0 +1,47 @@
+define i32 @_Z9atomic_orPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw or i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z9atomic_orPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw or i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z9atomic_orPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw or i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z9atomic_orPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw or i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z9atomic_orPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw or i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z9atomic_orPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw or i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z9atomic_orPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw or i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z9atomic_orPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw or i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_sub.ll b/lib/kernel/cuda/atomic_sub.ll
new file mode 100644
index 0000000..31d1f10
--- /dev/null
+++ b/lib/kernel/cuda/atomic_sub.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_subPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_subPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_subPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_subPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw sub i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_subPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_subPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_subPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_subPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw sub i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/atomic_xchg.ll b/lib/kernel/cuda/atomic_xchg.ll
new file mode 100644
index 0000000..b71a6a4
--- /dev/null
+++ b/lib/kernel/cuda/atomic_xchg.ll
@@ -0,0 +1,65 @@
+define i32 @_Z11atomic_xchgPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xchg i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z11atomic_xchgPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xchg i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z11atomic_xchgPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xchg i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z11atomic_xchgPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xchg i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z11atomic_xchgPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xchg i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z11atomic_xchgPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xchg i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z11atomic_xchgPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xchg i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z11atomic_xchgPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xchg i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define float @_Z11atomic_xchgPU3AS1Vff(float addrspace(1)* %ptr, float %val) {
+entry:
+  %iptr = bitcast float addrspace(1)* %ptr to i32 addrspace(1)*
+  %ival = bitcast float %val to i32
+  %ires = atomicrmw xchg i32 addrspace(1)* %iptr, i32 %ival monotonic
+  %fres = bitcast i32 %ires to float
+  ret float %fres
+}
+
+define float @_Z11atomic_xchgPU3AS3Vff(float addrspace(3)* %ptr, float %val) {
+entry:
+  %iptr = bitcast float addrspace(3)* %ptr to i32 addrspace(3)*
+  %ival = bitcast float %val to i32
+  %ires = atomicrmw xchg i32 addrspace(3)* %iptr, i32 %ival monotonic
+  %fres = bitcast i32 %ires to float
+  ret float %fres
+}
diff --git a/lib/kernel/cuda/atomic_xor.ll b/lib/kernel/cuda/atomic_xor.ll
new file mode 100644
index 0000000..795f5bc
--- /dev/null
+++ b/lib/kernel/cuda/atomic_xor.ll
@@ -0,0 +1,47 @@
+define i32 @_Z10atomic_xorPU3AS1Vii(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xor i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_xorPU3AS1Vjj(i32 addrspace(1)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xor i32 addrspace(1)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_xorPU3AS3Vii(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xor i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i32 @_Z10atomic_xorPU3AS3Vjj(i32 addrspace(3)* %ptr, i32 %val) {
+entry:
+  %0 = atomicrmw xor i32 addrspace(3)* %ptr, i32 %val monotonic
+  ret i32 %0
+}
+
+define i64 @_Z10atomic_xorPU3AS1Vll(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xor i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_xorPU3AS1Vmm(i64 addrspace(1)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xor i64 addrspace(1)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_xorPU3AS3Vll(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xor i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
+
+define i64 @_Z10atomic_xorPU3AS3Vmm(i64 addrspace(3)* %ptr, i64 %val) {
+entry:
+  %0 = atomicrmw xor i64 addrspace(3)* %ptr, i64 %val monotonic
+  ret i64 %0
+}
diff --git a/lib/kernel/cuda/barrier.ll b/lib/kernel/cuda/barrier.ll
new file mode 100644
index 0000000..beb96c0
--- /dev/null
+++ b/lib/kernel/cuda/barrier.ll
@@ -0,0 +1,7 @@
+declare void @llvm.nvvm.barrier0()
+
+define void @_Z7barrierj(i32 %flags) {
+entry:
+  call void @llvm.nvvm.barrier0()
+  ret void
+}
diff --git a/lib/CL/pocl_queue_util.h b/lib/kernel/cuda/get_global_id.c
similarity index 57%
rename from lib/CL/pocl_queue_util.h
rename to lib/kernel/cuda/get_global_id.c
index f8e905a..3684698 100644
--- a/lib/CL/pocl_queue_util.h
+++ b/lib/kernel/cuda/get_global_id.c
@@ -1,17 +1,17 @@
-/* Command queue management functions
+/* OpenCL built-in library: get_global_id() for CUDA
+
+   Copyright (c) 2016 James Price
 
-   Copyright (c) 2015 Giuseppe Bilotta
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,36 +21,30 @@
    THE SOFTWARE.
 */
 
-/* We keep a global list of all 'live' command queues in order to be able
- * to force a clFinish on all of them before this is triggered by the destructors
- * at program end, which happen in unspecified order and might cause all sorts
- * of issues. This header defines the signatures of the available functions
- */
-
-#ifndef POCL_QUEUE_H
-#define POCL_QUEUE_H
-
-#include "pocl_cl.h"
-
-#ifdef __GNUC__
-#pragma GCC visibility push(hidden)
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void pocl_init_queue_list();
-void pocl_queue_list_insert(cl_command_queue );
-void pocl_queue_list_delete(cl_command_queue );
-
-#ifdef __cplusplus
+uint get_nvvm_ntid_x();
+uint get_nvvm_ntid_y();
+uint get_nvvm_ntid_z();
+
+uint get_nvvm_ctaid_x();
+uint get_nvvm_ctaid_y();
+uint get_nvvm_ctaid_z();
+
+uint get_nvvm_tid_x();
+uint get_nvvm_tid_y();
+uint get_nvvm_tid_z();
+
+extern uint _global_offset_x;
+extern uint _global_offset_y;
+extern uint _global_offset_z;
+
+size_t _CL_OVERLOADABLE
+get_global_id(unsigned int dimindx)
+{
+  switch(dimindx)
+    {
+    case 0: return get_nvvm_ntid_x() * get_nvvm_ctaid_x() + get_nvvm_tid_x() + _global_offset_x;
+    case 1: return get_nvvm_ntid_y() * get_nvvm_ctaid_y() + get_nvvm_tid_y() + _global_offset_y;
+    case 2: return get_nvvm_ntid_z() * get_nvvm_ctaid_z() + get_nvvm_tid_z() + _global_offset_z;
+    default: return 0;
+    }
 }
-#endif
-
-#ifdef __GNUC__
-#pragma GCC visibility pop
-#endif
-
-
-#endif
diff --git a/lib/CL/clRetainContext.c b/lib/kernel/cuda/get_global_offset.c
similarity index 72%
copy from lib/CL/clRetainContext.c
copy to lib/kernel/cuda/get_global_offset.c
index e0b6a4b..31a7bbe 100644
--- a/lib/CL/clRetainContext.c
+++ b/lib/kernel/cuda/get_global_offset.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clRetainContext()
+/* OpenCL built-in library: get_global_offset() for CUDA
+
+   Copyright (c) 2017 James Price
 
-   Copyright (c) 2012 Tampere University of Technology
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,13 +21,18 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
+extern uint _global_offset_x;
+extern uint _global_offset_y;
+extern uint _global_offset_z;
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+size_t _CL_OVERLOADABLE
+get_global_offset(unsigned int dimindx)
 {
-  POCL_RETURN_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
-  POCL_RETAIN_OBJECT(context);
-  return CL_SUCCESS;
+  switch(dimindx)
+    {
+    case 0: return _global_offset_x;
+    case 1: return _global_offset_y;
+    case 2: return _global_offset_z;
+    default: return 0;
+    }
 }
-POsym(clRetainContext)
diff --git a/lib/CL/clReleaseDevice.c b/lib/kernel/cuda/get_global_size.c
similarity index 66%
copy from lib/CL/clReleaseDevice.c
copy to lib/kernel/cuda/get_global_size.c
index b48b29e..2d82cfb 100644
--- a/lib/CL/clReleaseDevice.c
+++ b/lib/kernel/cuda/get_global_size.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseDevice()
+/* OpenCL built-in library: get_global_size() for CUDA
+
+   Copyright (c) 2016 James Price
 
-   Copyright (c) 2011 Pekka Jääskeläinen / TUT
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,20 +21,22 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
+uint get_nvvm_ntid_x();
+uint get_nvvm_ntid_y();
+uint get_nvvm_ntid_z();
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 
-{
-  if (device->parent_device == NULL)
-    return CL_SUCCESS;
+uint get_nvvm_nctaid_x();
+uint get_nvvm_nctaid_y();
+uint get_nvvm_nctaid_z();
 
-  int new_refcount;
-  POCL_RELEASE_OBJECT (device, new_refcount);
-
-  if (new_refcount == 0)
-    POCL_MEM_FREE(device);
-
-  return CL_SUCCESS;
+size_t _CL_OVERLOADABLE
+get_global_size(unsigned int dimindx)
+{
+  switch(dimindx)
+    {
+    case 0: return get_nvvm_ntid_x() * get_nvvm_nctaid_x();
+    case 1: return get_nvvm_ntid_y() * get_nvvm_nctaid_y();
+    case 2: return get_nvvm_ntid_z() * get_nvvm_nctaid_z();
+    default: return 0;
+    }
 }
-POsym(clReleaseDevice)
diff --git a/lib/CL/clRetainContext.c b/lib/kernel/cuda/get_group_id.c
similarity index 73%
copy from lib/CL/clRetainContext.c
copy to lib/kernel/cuda/get_group_id.c
index e0b6a4b..2911b19 100644
--- a/lib/CL/clRetainContext.c
+++ b/lib/kernel/cuda/get_group_id.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clRetainContext()
+/* OpenCL built-in library: get_group_id() for CUDA
+
+   Copyright (c) 2016 James Price
 
-   Copyright (c) 2012 Tampere University of Technology
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,13 +21,18 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
+uint get_nvvm_ctaid_x();
+uint get_nvvm_ctaid_y();
+uint get_nvvm_ctaid_z();
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+size_t _CL_OVERLOADABLE
+get_group_id(unsigned int dimindx)
 {
-  POCL_RETURN_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
-  POCL_RETAIN_OBJECT(context);
-  return CL_SUCCESS;
-}
-POsym(clRetainContext)
+  switch(dimindx)
+    {
+    case 0: return get_nvvm_ctaid_x();
+    case 1: return get_nvvm_ctaid_y();
+    case 2: return get_nvvm_ctaid_z();
+    default: return 0;
+    }
+ }
diff --git a/lib/CL/clRetainContext.c b/lib/kernel/cuda/get_local_id.c
similarity index 73%
copy from lib/CL/clRetainContext.c
copy to lib/kernel/cuda/get_local_id.c
index e0b6a4b..8ce6015 100644
--- a/lib/CL/clRetainContext.c
+++ b/lib/kernel/cuda/get_local_id.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clRetainContext()
+/* OpenCL built-in library: get_local_id() for CUDA
+
+   Copyright (c) 2016 James Price
 
-   Copyright (c) 2012 Tampere University of Technology
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,13 +21,18 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
+uint get_nvvm_tid_x();
+uint get_nvvm_tid_y();
+uint get_nvvm_tid_z();
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+size_t _CL_OVERLOADABLE
+get_local_id(unsigned int dimindx)
 {
-  POCL_RETURN_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
-  POCL_RETAIN_OBJECT(context);
-  return CL_SUCCESS;
-}
-POsym(clRetainContext)
+  switch(dimindx)
+    {
+    case 0: return get_nvvm_tid_x();
+    case 1: return get_nvvm_tid_y();
+    case 2: return get_nvvm_tid_z();
+    default: return 0;
+    }
+ }
diff --git a/lib/CL/clRetainContext.c b/lib/kernel/cuda/get_local_size.c
similarity index 73%
copy from lib/CL/clRetainContext.c
copy to lib/kernel/cuda/get_local_size.c
index e0b6a4b..00a6b9b 100644
--- a/lib/CL/clRetainContext.c
+++ b/lib/kernel/cuda/get_local_size.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clRetainContext()
+/* OpenCL built-in library: get_local_size() for CUDA
+
+   Copyright (c) 2016 James Price
 
-   Copyright (c) 2012 Tampere University of Technology
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,13 +21,18 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
+uint get_nvvm_ntid_x();
+uint get_nvvm_ntid_y();
+uint get_nvvm_ntid_z();
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+size_t _CL_OVERLOADABLE
+get_local_size(unsigned int dimindx)
 {
-  POCL_RETURN_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
-  POCL_RETAIN_OBJECT(context);
-  return CL_SUCCESS;
+  switch(dimindx)
+    {
+    case 0: return get_nvvm_ntid_x();
+    case 1: return get_nvvm_ntid_y();
+    case 2: return get_nvvm_ntid_z();
+    default: return 0;
+    }
 }
-POsym(clRetainContext)
diff --git a/lib/CL/clRetainContext.c b/lib/kernel/cuda/get_num_groups.c
similarity index 72%
copy from lib/CL/clRetainContext.c
copy to lib/kernel/cuda/get_num_groups.c
index e0b6a4b..115da1e 100644
--- a/lib/CL/clRetainContext.c
+++ b/lib/kernel/cuda/get_num_groups.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clRetainContext()
+/* OpenCL built-in library: get_num_groups() for CUDA
+
+   Copyright (c) 2016 James Price
 
-   Copyright (c) 2012 Tampere University of Technology
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,13 +21,18 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
+uint get_nvvm_nctaid_x();
+uint get_nvvm_nctaid_y();
+uint get_nvvm_nctaid_z();
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainContext)(cl_context context) CL_API_SUFFIX__VERSION_1_0
+size_t _CL_OVERLOADABLE
+get_num_groups(unsigned int dimindx)
 {
-  POCL_RETURN_ERROR_COND((context == NULL), CL_INVALID_CONTEXT);
-  POCL_RETAIN_OBJECT(context);
-  return CL_SUCCESS;
+  switch(dimindx)
+    {
+    case 0: return get_nvvm_nctaid_x();
+    case 1: return get_nvvm_nctaid_y();
+    case 2: return get_nvvm_nctaid_z();
+    default: return 0;
+    }
 }
-POsym(clRetainContext)
diff --git a/lib/kernel/cuda/nvvm_functions.ll b/lib/kernel/cuda/nvvm_functions.ll
new file mode 100644
index 0000000..0d1f818
--- /dev/null
+++ b/lib/kernel/cuda/nvvm_functions.ll
@@ -0,0 +1,87 @@
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+declare i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+
+define i32 @get_nvvm_tid_x() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_tid_y() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.y()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_tid_z() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.tid.z()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_ntid_x() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_ntid_y() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.y()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_ntid_z() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ntid.z()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_ctaid_x() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_ctaid_y() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.y()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_ctaid_z() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.z()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_nctaid_x() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.x()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_nctaid_y() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.y()
+  ret i32 %0
+}
+
+define i32 @get_nvvm_nctaid_z() {
+entry:
+  %0 = call i32 @llvm.nvvm.read.ptx.sreg.nctaid.z()
+  ret i32 %0
+}
diff --git a/lib/kernel/cuda/printf.c b/lib/kernel/cuda/printf.c
new file mode 100644
index 0000000..df19501
--- /dev/null
+++ b/lib/kernel/cuda/printf.c
@@ -0,0 +1,91 @@
+/* OpenCL built-in library: printf() for CUDA
+
+   Copyright (c) 2016 James Price
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include <stdarg.h>
+
+void __cl_va_arg(va_list ap, char data[], int num_words);
+
+int vprintf(const char*, char*);
+
+int
+__cl_printf(__attribute__((address_space(4))) char* restrict format, ...)
+{
+  // TODO: Might need more than 2 words for (e.g.) vectors
+  char arg_data[8];
+
+  va_list ap;
+  va_start(ap, format);
+  char ch = *format;
+  while (ch) {
+    if (ch == '%') {
+      ch = *++format;
+
+      if (ch == '%') {
+        vprintf("%%", arg_data); // literal %
+        ch = *++format;
+      } else {
+        // TODO: other format specifiers
+        switch (ch) {
+          case 'c':
+          {
+            __cl_va_arg(ap, arg_data, 1);
+            vprintf("%c", arg_data);
+            break;
+          }
+          case 'd':
+          {
+            __cl_va_arg(ap, arg_data, 1);
+            vprintf("%d", arg_data);
+            break;
+          }
+          case 'f':
+          {
+            __cl_va_arg(ap, arg_data, 2);
+            vprintf("%lf", arg_data);
+            break;
+          }
+          case 's':
+          {
+            __cl_va_arg(ap, arg_data, 2);
+            vprintf("%s", arg_data);
+            break;
+          }
+          default: goto error;
+        }
+        ch = *++format;
+      }
+    }
+    else {
+      vprintf("%c", &ch);
+      ch = *++format;
+    }
+  }
+
+  va_end(ap);
+  return 0;
+
+  error:
+  va_end(ap);
+  vprintf("(printf format string error)", &ch);
+  return -1;
+}
diff --git a/lib/kernel/fract.cl b/lib/kernel/fract.cl
index f2c430a..e6874ac 100644
--- a/lib/kernel/fract.cl
+++ b/lib/kernel/fract.cl
@@ -26,7 +26,25 @@
 
 
 #ifdef cl_khr_fp64
-DEFINE_EXPR_V_VPV(fract, fmin(a - floor(a), (vtype)(stype)(sizeof(stype)==4 ? 0x1.fffffep-1f : 0x1.fffffffffffffp-1)))
+DEFINE_EXPR_V_VPV (fract, ({
+                     vtype fl = select ((vtype)floor (a), (vtype)NAN,
+                                        (itype)isnan (a));
+                     fl = select ((vtype)fl, (vtype)a, (itype)isinf (a));
+                     *b = fl;
+                     vtype ret = fmin (a - floor (a),
+                                       (vtype) (sizeof (stype) == 4
+                                                    ? 0x1.fffffep-1f
+                                                    : 0x1.fffffffffffffp-1));
+                     ret = select ((vtype)ret, (vtype)0.0, (itype)isinf (a));
+                     select ((vtype)ret, (vtype) (NAN), (itype)isnan (a));
+                   }))
 #else
-DEFINE_EXPR_V_VPV(fract, fmin(a - floor(a), (vtype)(stype)0x1.fffffep-1f))
+DEFINE_EXPR_V_VPV (fract, ({
+                     vtype fl = select ((vtype)floor (a), (vtype)NAN,
+                                        (itype)isnan (a));
+                     fl = select ((vtype)fl, (vtype)0.0f, (itype)isinf (a));
+                     *b = fl;
+                     vtype ret = fmin (a - floor (a), (vtype)0x1.fffffep-1f);
+                     select ((vtype)ret, (vtype)NAN, (itype)isnan (a));
+                   }))
 #endif
diff --git a/lib/kernel/get_global_id.c b/lib/kernel/get_global_id.c
index f531b34..33bed35 100644
--- a/lib/kernel/get_global_id.c
+++ b/lib/kernel/get_global_id.c
@@ -40,8 +40,19 @@ extern size_t _global_offset_z;
 size_t _CL_OVERLOADABLE
 get_local_id(unsigned int dimindx);
 
+
+/* attribute optnone disables all optimizations.
+ * This was necessary, because running opt on kernel library
+ * introduced global "switch tables" (@switch.table.XX)
+ * which referenced the global variables like @_global_offset*,
+ * and this was preventing these global vars from being optimized
+ * out after privatizeContext() in Workgroup pass. Leading to
+ * undefined references in final .so
+ */
+
+
 size_t _CL_OVERLOADABLE
-get_global_id(unsigned int dimindx)
+get_global_id(unsigned int dimindx) __attribute__ ((optnone))
 {
   switch(dimindx)
     {
diff --git a/lib/kernel/get_image_width.cl b/lib/kernel/get_image_array_size.cl
similarity index 53%
copy from lib/kernel/get_image_width.cl
copy to lib/kernel/get_image_array_size.cl
index 0a8b47c..d8510ce 100644
--- a/lib/kernel/get_image_width.cl
+++ b/lib/kernel/get_image_array_size.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: get_image_width()
+/* OpenCL built-in library: get_image_array_size()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2013-2014 Ville Korhonen, Pekka Jääskeläinen
-                           Tampere University of Technology
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,23 +23,28 @@
 
 #include "templates.h"
 
-#if (__clang_major__ == 3) && (__clang_minor__ >= 5)
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
-#else
-#define ADDRESS_SPACE
+#ifndef LLVM_OLDER_THAN_3_8
+
+#define IMPLEMENT_GET_IMAGE_ARRAY_SIZE(__IMGTYPE__)             \
+size_t _CL_OVERLOADABLE get_image_array_size(__IMGTYPE__ image) \
+{                                                               \
+  global dev_image_t* img =                                     \
+    __builtin_astype (image, global dev_image_t*);              \
+  return (size_t)(img->_image_array_size);                      \
+}
+
+IMPLEMENT_GET_IMAGE_ARRAY_SIZE (IMG_RO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_ARRAY_SIZE (IMG_RO_AQ image2d_array_t)
+
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_ARRAY_SIZE (IMG_WO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_ARRAY_SIZE (IMG_WO_AQ image2d_array_t)
 #endif
 
-#define IMPLEMENT_GET_IMAGE_WIDTH(__IMGTYPE__)                  \
-  int _CL_OVERLOADABLE get_image_width(__IMGTYPE__ image){      \
-    ADDRESS_SPACE dev_image_t* ptr =                            \
-      __builtin_astype(image, ADDRESS_SPACE dev_image_t*);      \
-    return ptr->_width;                                         \
-  }                                                             \
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_ARRAY_SIZE (IMG_RW_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_ARRAY_SIZE (IMG_RW_AQ image2d_array_t)
+#endif
 
-IMPLEMENT_GET_IMAGE_WIDTH(image1d_t)
-IMPLEMENT_GET_IMAGE_WIDTH(image2d_t)
-IMPLEMENT_GET_IMAGE_WIDTH(image3d_t)
 
+#endif
diff --git a/lib/kernel/get_image_channel_data_type.cl b/lib/kernel/get_image_channel_data_type.cl
new file mode 100644
index 0000000..31b4ec4
--- /dev/null
+++ b/lib/kernel/get_image_channel_data_type.cl
@@ -0,0 +1,53 @@
+/* OpenCL built-in library: get_image_channel_data_type()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "templates.h"
+
+#define IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE(__IMGTYPE__)                    \
+  int _CL_OVERLOADABLE get_image_channel_data_type (__IMGTYPE__ image)        \
+  {                                                                           \
+    global dev_image_t *ptr = __builtin_astype (image, global dev_image_t *); \
+    return ptr->_data_type;                                                   \
+  }
+
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RO_AQ image3d_t)
+
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_WO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_WO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_WO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_WO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_WO_AQ image3d_t)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RW_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RW_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RW_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RW_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_DATA_TYPE (IMG_RW_AQ image3d_t)
+#endif
diff --git a/lib/kernel/get_image_channel_order.cl b/lib/kernel/get_image_channel_order.cl
new file mode 100644
index 0000000..a00362f
--- /dev/null
+++ b/lib/kernel/get_image_channel_order.cl
@@ -0,0 +1,53 @@
+/* OpenCL built-in library: get_image_channel_order()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "templates.h"
+
+#define IMPLEMENT_GET_IMAGE_CHANNEL_ORDER(__IMGTYPE__)                        \
+  int _CL_OVERLOADABLE get_image_channel_order (__IMGTYPE__ image)            \
+  {                                                                           \
+    global dev_image_t *ptr = __builtin_astype (image, global dev_image_t *); \
+    return ptr->_order;                                                       \
+  }
+
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RO_AQ image3d_t)
+
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_WO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_WO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_WO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_WO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_WO_AQ image3d_t)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RW_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RW_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RW_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RW_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_CHANNEL_ORDER (IMG_RW_AQ image3d_t)
+#endif
diff --git a/lib/kernel/get_image_depth.cl b/lib/kernel/get_image_depth.cl
index 62f1977..92db4ee 100644
--- a/lib/kernel/get_image_depth.cl
+++ b/lib/kernel/get_image_depth.cl
@@ -24,22 +24,23 @@
 
 #include "templates.h"
 
-#if (__clang_major__ == 3) && (__clang_minor__ >= 5)
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
-#else
-#define ADDRESS_SPACE
-#endif
 
 #define IMPLEMENT_GET_IMAGE_DEPTH(__IMGTYPE__)                \
-  int _CL_OVERLOADABLE get_image_depth(__IMGTYPE__ image){    \
-    ADDRESS_SPACE dev_image_t* ptr =                          \
-      __builtin_astype(image, ADDRESS_SPACE dev_image_t*);    \
+  int _CL_OVERLOADABLE get_image_depth(__IMGTYPE__ image) {   \
+    global dev_image_t* ptr =                                 \
+      __builtin_astype(image, global dev_image_t*);           \
     return ptr->_depth;                                       \
-  }                                                           \
+  }
+
+
+IMPLEMENT_GET_IMAGE_DEPTH(IMG_RO_AQ image3d_t)
+
+
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_DEPTH(IMG_WO_AQ image3d_t)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_DEPTH(IMG_RW_AQ image3d_t)
+#endif
 
-IMPLEMENT_GET_IMAGE_DEPTH(image1d_t)
-IMPLEMENT_GET_IMAGE_DEPTH(image2d_t)
-IMPLEMENT_GET_IMAGE_DEPTH(image3d_t)
diff --git a/lib/kernel/get_image_dim.cl b/lib/kernel/get_image_dim.cl
index 9e5ae1e..085988e 100644
--- a/lib/kernel/get_image_dim.cl
+++ b/lib/kernel/get_image_dim.cl
@@ -25,32 +25,42 @@
 
 #include "templates.h"
 
-#if (__clang_major__ == 3) && (__clang_minor__ >= 5)
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
-#else
-#define ADDRESS_SPACE
-#endif
-
-int2 _CL_OVERLOADABLE get_image_dim(image2d_t image)
-{
-  ADDRESS_SPACE dev_image_t* img =
-    __builtin_astype(image, ADDRESS_SPACE dev_image_t*);
-  return (int2)(img->_width, img->_height);
+#define IMPLEMENT_GET_IMAGE_DIM_2D(__IMG_AQ__)                  \
+int2 _CL_OVERLOADABLE get_image_dim(__IMG_AQ__ image2d_t image) \
+{                                                               \
+  global dev_image_t* img =                                     \
+    __builtin_astype(image, global dev_image_t*);               \
+  return (int2)(img->_width, img->_height);                     \
 }
 
-int2 _CL_OVERLOADABLE get_image_dim(image2d_array_t image)
-{
-  ADDRESS_SPACE dev_image_t* img =
-    __builtin_astype (image, ADDRESS_SPACE dev_image_t*);
-  return (int2)(img->_width, img->_height);
+#define IMPLEMENT_GET_IMAGE_DIM_2DA(__IMG_AQ__)                       \
+int2 _CL_OVERLOADABLE get_image_dim(__IMG_AQ__ image2d_array_t image) \
+{                                                                     \
+  global dev_image_t* img =                                           \
+    __builtin_astype (image, global dev_image_t*);                    \
+  return (int2)(img->_width, img->_height);                           \
 }
 
-int4 _CL_OVERLOADABLE get_image_dim(image3d_t image)
-{
-  ADDRESS_SPACE dev_image_t* img =
-    __builtin_astype (image, ADDRESS_SPACE dev_image_t*);
-  return (int4)(img->_width, img->_height, img->_depth, 0);
+#define IMPLEMENT_GET_IMAGE_DIM_3D(__IMG_AQ__)                  \
+int4 _CL_OVERLOADABLE get_image_dim(__IMG_AQ__ image3d_t image) \
+{                                                               \
+  global dev_image_t* img =                                     \
+    __builtin_astype (image, global dev_image_t*);              \
+  return (int4)(img->_width, img->_height, img->_depth, 0);     \
 }
+
+IMPLEMENT_GET_IMAGE_DIM_2D(IMG_RO_AQ)
+IMPLEMENT_GET_IMAGE_DIM_2DA(IMG_RO_AQ)
+IMPLEMENT_GET_IMAGE_DIM_3D(IMG_RO_AQ)
+
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_DIM_2D(IMG_WO_AQ)
+IMPLEMENT_GET_IMAGE_DIM_2DA(IMG_WO_AQ)
+IMPLEMENT_GET_IMAGE_DIM_3D(IMG_WO_AQ)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_DIM_2D(IMG_RW_AQ)
+IMPLEMENT_GET_IMAGE_DIM_2DA(IMG_RW_AQ)
+IMPLEMENT_GET_IMAGE_DIM_3D(IMG_RW_AQ)
+#endif
diff --git a/lib/kernel/get_image_height.cl b/lib/kernel/get_image_height.cl
index 4d8e5a3..fd9802c 100644
--- a/lib/kernel/get_image_height.cl
+++ b/lib/kernel/get_image_height.cl
@@ -24,25 +24,33 @@
 
 #include "templates.h"
 
-#if (__clang_major__ == 3) && (__clang_minor__ >= 5)
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
-#else
-#define ADDRESS_SPACE
-#endif
-
 #define IMPLEMENT_GET_IMAGE_HEIGHT(__IMGTYPE__)                              \
   int _CL_OVERLOADABLE get_image_height(__IMGTYPE__ image){                  \
-    ADDRESS_SPACE dev_image_t* ptr =                                         \
-      __builtin_astype(image, ADDRESS_SPACE dev_image_t*);                   \
+    global dev_image_t* ptr =                                         \
+      __builtin_astype(image, global dev_image_t*);                   \
     return ptr->_height;                                                     \
   }                                                                          \
 
 
-IMPLEMENT_GET_IMAGE_HEIGHT(image1d_t)
-IMPLEMENT_GET_IMAGE_HEIGHT(image2d_t)
-IMPLEMENT_GET_IMAGE_HEIGHT(image3d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RO_AQ image3d_t)
 
 
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_WO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_WO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_WO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_WO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_WO_AQ image3d_t)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RW_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RW_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RW_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RW_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_HEIGHT(IMG_RW_AQ image3d_t)
+#endif
diff --git a/lib/kernel/get_image_width.cl b/lib/kernel/get_image_width.cl
index 0a8b47c..bd559df 100644
--- a/lib/kernel/get_image_width.cl
+++ b/lib/kernel/get_image_width.cl
@@ -24,23 +24,31 @@
 
 #include "templates.h"
 
-#if (__clang_major__ == 3) && (__clang_minor__ >= 5)
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
-#else
-#define ADDRESS_SPACE
-#endif
-
 #define IMPLEMENT_GET_IMAGE_WIDTH(__IMGTYPE__)                  \
   int _CL_OVERLOADABLE get_image_width(__IMGTYPE__ image){      \
-    ADDRESS_SPACE dev_image_t* ptr =                            \
-      __builtin_astype(image, ADDRESS_SPACE dev_image_t*);      \
+    global dev_image_t* ptr =                            \
+      __builtin_astype(image, global dev_image_t*);      \
     return ptr->_width;                                         \
   }                                                             \
 
-IMPLEMENT_GET_IMAGE_WIDTH(image1d_t)
-IMPLEMENT_GET_IMAGE_WIDTH(image2d_t)
-IMPLEMENT_GET_IMAGE_WIDTH(image3d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RO_AQ image3d_t)
 
+#ifdef CLANG_HAS_IMAGE_AS
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_WO_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_WO_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_WO_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_WO_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_WO_AQ image3d_t)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RW_AQ image1d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RW_AQ image1d_array_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RW_AQ image2d_array_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RW_AQ image2d_t)
+IMPLEMENT_GET_IMAGE_WIDTH(IMG_RW_AQ image3d_t)
+#endif
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_cos.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_cos.cl
index 3c75ca1..ca6771c 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_cos.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_cos()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_cos, cos(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_divide.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_divide.cl
index 3c75ca1..09828d4 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_divide.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_divide()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_FF(half_divide, a/b)
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_exp.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_exp.cl
index 3c75ca1..35aa767 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_exp.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_exp()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_exp, exp(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_exp10.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_exp10.cl
index 3c75ca1..6d4694d 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_exp10.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_exp10()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_exp10, exp10(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_exp2.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_exp2.cl
index 3c75ca1..dd98f38 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_exp2.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_exp2()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_exp2, exp2(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_log.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_log.cl
index 3c75ca1..cff381f 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_log.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_log()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_log, log(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_log10.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_log10.cl
index 3c75ca1..5aa3ffa 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_log10.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_log10()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_log10, log10(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_log2.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_log2.cl
index 3c75ca1..4ad86ec 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_log2.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_log2()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_log2, log2(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_powr.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_powr.cl
index 3c75ca1..4d57375 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_powr.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_powr()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_FF(half_powr, powr(a, b))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_recip.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_recip.cl
index 3c75ca1..adb646a 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_recip.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_recip()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_recip, (stype)1/a)
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_rsqrt.cl
similarity index 83%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_rsqrt.cl
index 3c75ca1..6295bf1 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_rsqrt.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_rsqrt()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
 DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_sin.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_sin.cl
index 3c75ca1..cf5ce89 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_sin.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_sin()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_sin, sin(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_sqrt.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_sqrt.cl
index 3c75ca1..b1dfb39 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_sqrt.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_sqrt()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_sqrt, sqrt(a))
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/half_tan.cl
similarity index 80%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/half_tan.cl
index 3c75ca1..7c9302c 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/half_tan.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: half_tan()
+
+   Copyright (c) 2011-2013 Erik Schnetter
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -24,6 +23,4 @@
 
 #include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_F_F(half_tan, tan(a))
diff --git a/lib/kernel/host/CMakeLists.txt b/lib/kernel/host/CMakeLists.txt
index 32a3311..c3657a0 100644
--- a/lib/kernel/host/CMakeLists.txt
+++ b/lib/kernel/host/CMakeLists.txt
@@ -25,12 +25,17 @@
 
 include("bitcode_rules")
 
-if(USE_VECMATHLIB)
+if(ENABLE_VECMATHLIB)
   set(KERNEL_SOURCES ${SOURCES_WITH_VML})
+elseif(ENABLE_SLEEF)
+  set(KERNEL_SOURCES ${SOURCES_WITH_SLEEF})
 else()
   set(KERNEL_SOURCES ${SOURCES_WITHOUT_VML})
 endif()
 
+list(APPEND KERNEL_SOURCES "mem_fence.c")
+
+if(HOST_DEVICE_CL_VERSION GREATER 199)
 if(X86_64 OR I386)
   if(LLVM_3_6)
     message(STATUS "OpenCL 2.0 atomics only works with LLVM >= 3.7")
@@ -47,13 +52,20 @@ else()
   message(STATUS "Using generic OpenCL 2.0 atomics. Might or might not break your build.")
   list(APPEND KERNEL_SOURCES svm_atomics_host.cl svm_atomics.cl)
 endif()
+endif()
+
+set(KERNEL_CL_FLAGS
+      "-Wall" "-Wno-unused-local-typedef" "-Xclang"
+      "-cl-std=CL${HOST_DEVICE_CL_STD}"
+      "-D__OPENCL_C_VERSION__=${HOST_DEVICE_CL_VERSION}"
+      ${KERNEL_CL_FLAGS})
 
 separate_arguments(HOST_CLANG_FLAGS)
 separate_arguments(HOST_LLC_FLAGS)
-set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HOST_DEVICE_CL_VERSION} ${HOST_DEVICE_EXTENSION_DEFINES}")
+set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HOST_DEVICE_CL_VERSION}")
+set(DEVICE_CL_FLAGS "${DEVICE_CL_FLAGS} ${HOST_DEVICE_EXTENSION_DEFINES}")
 separate_arguments(DEVICE_CL_FLAGS)
 
-
 function(x86_distro_variant_to_flags VARIANT OUT_LLC_FLAGS OUT_CLANG_FLAGS)
 
   if("${VARIANT}" STREQUAL "sse2")
@@ -72,6 +84,10 @@ function(x86_distro_variant_to_flags VARIANT OUT_LLC_FLAGS OUT_CLANG_FLAGS)
     set(CLANG_F "${CLANG_MARCH_FLAG}sandybridge")
     set(LLC_F "-mcpu=sandybridge")
 
+  elseif("${VARIANT}" STREQUAL "avx_f16c")
+    set(CLANG_F "${CLANG_MARCH_FLAG}ivybridge")
+    set(LLC_F "-mcpu=ivybridge")
+
   elseif("${VARIANT}" STREQUAL "avx_fma4")
     set(CLANG_F "${CLANG_MARCH_FLAG}bdver1")
     set(LLC_F "-mcpu=bdver1")
@@ -81,8 +97,8 @@ function(x86_distro_variant_to_flags VARIANT OUT_LLC_FLAGS OUT_CLANG_FLAGS)
     set(LLC_F "-mcpu=haswell")
 
   elseif("${VARIANT}" STREQUAL "avx512")
-    set(CLANG_F "${CLANG_MARCH_FLAG}skylake")
-    set(LLC_F "-mcpu=skylake")
+    set(CLANG_F "${CLANG_MARCH_FLAG}skylake-avx512")
+    set(LLC_F "-mcpu=skylake-avx512")
 
   else()
     set(CLANG_F "${CLANG_MARCH_FLAG}${VARIANT}")
@@ -96,6 +112,89 @@ endfunction()
 
 ###############################################################################
 
+function(compile_sleef VARIANT SLEEF_CONFIG SLEEF_CONFIG_NEW SLEEF_BC)
+
+  unset(BC_FILE_LIST)
+
+  set(EXTRA_FLAGS "-DDORENAME;-DPURE_C;-I${CMAKE_SOURCE_DIR}/lib/kernel/sleef/include")
+
+  if(ENABLE_CONFORMANCE)
+#    compile_sleef_c_to_bc("c" "sleef/libm/sleef_builtin.c"
+#                          "${VARIANT}" BC_FILE_LIST ${EXTRA_FLAGS})
+#  else()
+    # these may be faster than using libm, but not always precise
+    compile_sleef_c_to_bc("c" "sleef/libm/sleefsp.c"
+                          "${VARIANT}" BC_FILE_LIST ${EXTRA_FLAGS})
+    compile_sleef_c_to_bc("c" "sleef/libm/sleefdp.c"
+                          "${VARIANT}" BC_FILE_LIST ${EXTRA_FLAGS})
+  endif()
+
+  compile_sleef_c_to_bc("c" "sleef/libm/sleef_glue_auto.c"
+                        "${VARIANT}" BC_FILE_LIST "-include" "${SLEEF_CONFIG}")
+
+  file(READ "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/test.c" TEST_SRC)
+  file(READ "${CMAKE_SOURCE_DIR}/lib/kernel/sleef/fma_test.c" FMA_TEST_SRC)
+
+  # current SLEEF code does not have code for
+  # ARM32 NEON double vectors (if they even exist)
+  if(NOT ARM32)
+    set(STR "#define SLEEF_DOUBLE_VEC_AVAILABLE\n")
+    file(APPEND "${SLEEF_CONFIG_NEW}" "${STR}")
+  endif()
+
+  foreach(VECSIZE "128" "256" "512")
+
+    set(EXTRA_FLAGS "-DDORENAME;-DVEC${VECSIZE}")
+    custom_try_compile_any(1 "${CLANG}" "c" "${TEST_SRC}" RES
+      ${CLANG_FLAGS} ${EXTRA_FLAGS} "-c")
+
+    if(${RES} EQUAL "0")
+      compile_sleef_c_to_bc("v${VECSIZE}" "sleef/libm/sleefsimdsp.c"
+                            "${VARIANT}" BC_FILE_LIST ${EXTRA_FLAGS})
+      if(NOT ARM32)
+      compile_sleef_c_to_bc("v${VECSIZE}" "sleef/libm/sleefsimddp.c"
+                            "${VARIANT}" BC_FILE_LIST ${EXTRA_FLAGS})
+      endif()
+      message(STATUS "${VARIANT} SLEEF: ${VECSIZE}bit vectors available.")
+
+      set(STR "#define SLEEF_VEC_${VECSIZE}_AVAILABLE\n")
+      file(APPEND "${SLEEF_CONFIG_NEW}" "${STR}")
+
+    else()
+      message(STATUS "${VARIANT} SLEEF: ${VECSIZE}bit vectors NOT available.")
+    endif()
+
+    custom_try_compile_any(1 "${CLANG}" "c" "${FMA_TEST_SRC}" RES
+      ${CLANG_FLAGS} ${EXTRA_FLAGS} "-c")
+    if(${RES} EQUAL "0")
+      unset(STR)
+      set(STR "#define HAVE_FMA32_${VECSIZE}\n")
+      set(STR "${STR}#define HAVE_FMA64_${VECSIZE}\n")
+      file(APPEND "${SLEEF_CONFIG_NEW}" "${STR}")
+      message(STATUS "${VARIANT} SLEEF: ${VECSIZE}bit hardware FMA available.")
+    endif()
+
+  endforeach()
+
+  file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT}")
+  set(SLEEF_BC "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT}/sleef.bc")
+  set(SLEEF_BC "${CMAKE_CURRENT_BINARY_DIR}/${VARIANT}/sleef.bc" PARENT_SCOPE)
+
+  message(STATUS "${VARIANT} SLEEF bc list: ${BC_FILE_LIST}")
+  message(STATUS "${VARIANT} SLEEF bc: ${SLEEF_BC}")
+
+  set(LINK_OPT_COMMAND COMMAND "${LLVM_LINK}" "-o" "${SLEEF_BC}" ${BC_FILE_LIST})
+
+  add_custom_command( OUTPUT "${SLEEF_BC}"
+    DEPENDS ${BC_FILE_LIST}
+    COMMAND ${LINK_OPT_COMMAND}
+    COMMENT "Linking & optimizing SLEEF for ${VARIANT}: ${SLEEF_BC}"
+    VERBATIM)
+
+endfunction()
+
+###############################################################################
+
 foreach(CPU_VARIANT IN LISTS KERNELLIB_HOST_CPU_VARIANTS)
 
 if(CPU_VARIANT MATCHES "native")
@@ -113,16 +212,38 @@ endif()
 
 separate_arguments(CLANG_CPUFLAGS)
 separate_arguments(LLC_CPUFLAGS)
-set(CLANG_FLAGS ${HOST_CLANG_FLAGS} ${CLANG_CPUFLAGS} "-emit-llvm" "-ffp-contract=off")
+set(CLANG_FLAGS ${HOST_CLANG_FLAGS} ${CLANG_CPUFLAGS}
+                "-emit-llvm" "-ffp-contract=off")
 
 if(POCL_USE_FAKE_ADDR_SPACE_IDS)
-list(APPEND CLANG_FLAGS "-Xclang" "-ffake-address-space-map" "-DPOCL_USE_FAKE_ADDR_SPACE_IDS")
+list(APPEND CLANG_FLAGS "-Xclang" "-ffake-address-space-map"
+                        "-DPOCL_USE_FAKE_ADDR_SPACE_IDS")
 endif()
 
 set(LLC_FLAGS ${HOST_LLC_FLAGS} ${LLC_CPUFLAGS})
 
-# KERNEL_TARGET = @OCL_KERNEL_TARGET@
-make_kernel_bc(KERNEL_BC "${OCL_KERNEL_TARGET}-${VARIANT}" "${VARIANT}" ${KERNEL_SOURCES})
+if(ENABLE_SLEEF)
+
+  # write SLEEF config for this CPU
+  set(SLEEF_CONFIG "${CMAKE_BINARY_DIR}/sleef_config_temp_${VARIANT}.h")
+  set(SLEEF_CONFIG_NEW "${SLEEF_CONFIG}.new")
+  set(STR "/* SLEEF library configuration for ${VARIANT} CPU */ \n")
+  file(WRITE "${SLEEF_CONFIG_NEW}" "${STR}")
+
+  # compile SLEEF library for the cpu variant
+  unset(SLEEF_BC)
+  compile_sleef("${VARIANT}" "${SLEEF_CONFIG}" "${SLEEF_CONFIG_NEW}" SLEEF_BC)
+
+  rename_if_different("${SLEEF_CONFIG_NEW}" "${SLEEF_CONFIG}")
+
+  # compile kernel
+  make_kernel_bc(KERNEL_BC "${OCL_KERNEL_TARGET}-${VARIANT}" "${VARIANT}"
+                 1 "${SLEEF_BC}" "${SLEEF_CONFIG}" ${KERNEL_SOURCES})
+
+else()
+  make_kernel_bc(KERNEL_BC "${OCL_KERNEL_TARGET}-${VARIANT}" "${VARIANT}"
+                 0 0 0 ${KERNEL_SOURCES})
+endif()
 
 # just debug
 message(STATUS "Host Kernel BC for \"${VARIANT}\": ${KERNEL_BC}")
diff --git a/lib/kernel/hsail64/CMakeLists.txt b/lib/kernel/hsail64/CMakeLists.txt
index 15c51e6..172ab07 100644
--- a/lib/kernel/hsail64/CMakeLists.txt
+++ b/lib/kernel/hsail64/CMakeLists.txt
@@ -28,7 +28,8 @@ include("bitcode_rules")
 set(KERNEL_SOURCES ${SOURCES_WITHOUT_VML})
 
 foreach(FILE printf.c barrier.ll get_image_depth.cl get_image_dim.cl
-  get_image_height.cl get_image_width.cl read_image.cl write_image.cl)
+  get_image_height.cl get_image_width.cl read_image.cl write_image.cl
+  get_image_channel_data_type.cl get_image_channel_order.cl)
   list(REMOVE_ITEM KERNEL_SOURCES "${FILE}")
 endforeach()
 
@@ -62,15 +63,17 @@ if(POCL_USE_FAKE_ADDR_SPACE_IDS)
 list(APPEND CLANG_FLAGS "-Xclang" "-ffake-address-space-map")
 endif()
 
+set(KERNEL_CL_FLAGS "-Xclang" "-cl-std=CL${HSA_DEVICE_CL_STD}" "-D__OPENCL_C_VERSION__=${HSA_DEVICE_CL_VERSION}" ${KERNEL_CL_FLAGS})
+
 set(LLC_FLAGS "")
-set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HSA_DEVICE_CL_VERSION}")
+set(DEVICE_CL_FLAGS "-D__OPENCL_VERSION__=${HSA_DEVICE_CL_VERSION} -Dcl_khr_int64")
 separate_arguments(HSA_DEVICE_EXTENSIONS)
 foreach(EXT ${HSA_DEVICE_EXTENSIONS})
   set(DEVICE_CL_FLAGS "${DEVICE_CL_FLAGS} -D${EXT}")
 endforeach()
 separate_arguments(DEVICE_CL_FLAGS)
 
-make_kernel_bc(KERNEL_BC "hsail64" "BCs" ${KERNEL_SOURCES})
+make_kernel_bc(KERNEL_BC "hsail64" "BCs" 0 0 0 ${KERNEL_SOURCES})
 
 # just debug
 message(STATUS "HSAIL64 Kernel BC: ${KERNEL_BC}")
diff --git a/lib/kernel/libclc-pocl/acosh.cl b/lib/kernel/libclc-pocl/acosh.cl
new file mode 100644
index 0000000..86f221c
--- /dev/null
+++ b/lib/kernel/libclc-pocl/acosh.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/acospi.cl b/lib/kernel/libclc-pocl/acospi.cl
new file mode 100644
index 0000000..a094b9a
--- /dev/null
+++ b/lib/kernel/libclc-pocl/acospi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "acospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "acospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/asinh.cl b/lib/kernel/libclc-pocl/asinh.cl
new file mode 100644
index 0000000..bdf9bc4
--- /dev/null
+++ b/lib/kernel/libclc-pocl/asinh.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/asinpi.cl b/lib/kernel/libclc-pocl/asinpi.cl
new file mode 100644
index 0000000..8c19e56
--- /dev/null
+++ b/lib/kernel/libclc-pocl/asinpi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "asinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "asinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/atan2pi.cl b/lib/kernel/libclc-pocl/atan2pi.cl
new file mode 100644
index 0000000..f3e47b2
--- /dev/null
+++ b/lib/kernel/libclc-pocl/atan2pi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atan2pi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atan2pi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atan2pi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atan2pi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atan2pi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atan2pi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atan2pi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atan2pi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atan2pi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atan2pi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atan2pi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atan2pi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/atanh.cl b/lib/kernel/libclc-pocl/atanh.cl
new file mode 100644
index 0000000..0c3a971
--- /dev/null
+++ b/lib/kernel/libclc-pocl/atanh.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/atanpi.cl b/lib/kernel/libclc-pocl/atanpi.cl
new file mode 100644
index 0000000..d526899
--- /dev/null
+++ b/lib/kernel/libclc-pocl/atanpi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "atanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "atanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/cos.cl b/lib/kernel/libclc-pocl/cos.cl
new file mode 100644
index 0000000..5fd224e
--- /dev/null
+++ b/lib/kernel/libclc-pocl/cos.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cos_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cos_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cos_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cos_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cos_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cos_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cos_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cos_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cos_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cos_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cos_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cos_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/cosh.cl b/lib/kernel/libclc-pocl/cosh.cl
new file mode 100644
index 0000000..e0cc3e1
--- /dev/null
+++ b/lib/kernel/libclc-pocl/cosh.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cosh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cosh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/cospi.cl b/lib/kernel/libclc-pocl/cospi.cl
new file mode 100644
index 0000000..a21bc72
--- /dev/null
+++ b/lib/kernel/libclc-pocl/cospi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "cospi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "cospi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/degrees.cl b/lib/kernel/libclc-pocl/degrees.cl
new file mode 100644
index 0000000..1754c29
--- /dev/null
+++ b/lib/kernel/libclc-pocl/degrees.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "degrees_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "degrees_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "degrees_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "degrees_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "degrees_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "degrees_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "degrees_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "degrees_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "degrees_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "degrees_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "degrees_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "degrees_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/ep_log.cl b/lib/kernel/libclc-pocl/ep_log.cl
new file mode 100644
index 0000000..515e08f
--- /dev/null
+++ b/lib/kernel/libclc-pocl/ep_log.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ep_log_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ep_log_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ep_log_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ep_log_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ep_log_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ep_log_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ep_log_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ep_log_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ep_log_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ep_log_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ep_log_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ep_log_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/expfrexp.cl b/lib/kernel/libclc-pocl/expfrexp.cl
new file mode 100644
index 0000000..9bebe0b
--- /dev/null
+++ b/lib/kernel/libclc-pocl/expfrexp.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "expfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "expfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "expfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "expfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "expfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "expfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "expfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "expfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "expfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "expfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "expfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "expfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/fmod.cl b/lib/kernel/libclc-pocl/fmod.cl
new file mode 100644
index 0000000..14d18aa
--- /dev/null
+++ b/lib/kernel/libclc-pocl/fmod.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "fmod_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "fmod_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "fmod_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "fmod_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "fmod_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "fmod_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "fmod_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "fmod_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "fmod_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "fmod_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "fmod_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "fmod_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/frexp.cl b/lib/kernel/libclc-pocl/frexp.cl
new file mode 100644
index 0000000..5d4d4f0
--- /dev/null
+++ b/lib/kernel/libclc-pocl/frexp.cl
@@ -0,0 +1,708 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "frexp_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/frfrexp.cl b/lib/kernel/libclc-pocl/frfrexp.cl
new file mode 100644
index 0000000..6ab5e86
--- /dev/null
+++ b/lib/kernel/libclc-pocl/frfrexp.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "frfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "frfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "frfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "frfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "frfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "frfrexp_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "frfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "frfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "frfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "frfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "frfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "frfrexp_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/isfinite.cl b/lib/kernel/libclc-pocl/isfinite.cl
new file mode 100644
index 0000000..eabe907
--- /dev/null
+++ b/lib/kernel/libclc-pocl/isfinite.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isfinite_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isfinite_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isfinite_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isfinite_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isfinite_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isfinite_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isfinite_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isfinite_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isfinite_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isfinite_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isfinite_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isfinite_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/isinf.cl b/lib/kernel/libclc-pocl/isinf.cl
new file mode 100644
index 0000000..254b4a5
--- /dev/null
+++ b/lib/kernel/libclc-pocl/isinf.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isinf_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isinf_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isinf_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isinf_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isinf_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isinf_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isinf_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isinf_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isinf_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isinf_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isinf_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isinf_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/isnan.cl b/lib/kernel/libclc-pocl/isnan.cl
new file mode 100644
index 0000000..7a41c67
--- /dev/null
+++ b/lib/kernel/libclc-pocl/isnan.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/isnormal.cl b/lib/kernel/libclc-pocl/isnormal.cl
new file mode 100644
index 0000000..6a36c8c
--- /dev/null
+++ b/lib/kernel/libclc-pocl/isnormal.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnormal_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnormal_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnormal_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnormal_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnormal_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "isnormal_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnormal_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnormal_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnormal_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnormal_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnormal_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "isnormal_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/log1p.cl b/lib/kernel/libclc-pocl/log1p.cl
new file mode 100644
index 0000000..1aefcbb
--- /dev/null
+++ b/lib/kernel/libclc-pocl/log1p.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log1p_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log1p_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log1p_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log1p_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log1p_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log1p_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log1p_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log1p_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log1p_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log1p_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log1p_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log1p_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/log2.cl b/lib/kernel/libclc-pocl/log2.cl
new file mode 100644
index 0000000..defd600
--- /dev/null
+++ b/lib/kernel/libclc-pocl/log2.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log2_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log2_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log2_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log2_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log2_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "log2_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log2_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log2_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log2_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log2_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log2_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "log2_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/logb.cl b/lib/kernel/libclc-pocl/logb.cl
new file mode 100644
index 0000000..c6db324
--- /dev/null
+++ b/lib/kernel/libclc-pocl/logb.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "logb_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "logb_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "logb_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "logb_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "logb_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "logb_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "logb_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "logb_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "logb_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "logb_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "logb_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "logb_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/ocml_helpers.cl b/lib/kernel/libclc-pocl/ocml_helpers.cl
new file mode 100644
index 0000000..bd64134
--- /dev/null
+++ b/lib/kernel/libclc-pocl/ocml_helpers.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ocml_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ocml_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ocml_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ocml_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ocml_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "ocml_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ocml_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ocml_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ocml_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ocml_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ocml_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "ocml_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/pocl_fma.cl b/lib/kernel/libclc-pocl/pocl_fma.cl
new file mode 100644
index 0000000..58cf5c8
--- /dev/null
+++ b/lib/kernel/libclc-pocl/pocl_fma.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pocl_fma_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pocl_fma_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pocl_fma_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pocl_fma_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pocl_fma_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pocl_fma_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pocl_fma_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pocl_fma_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pocl_fma_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pocl_fma_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pocl_fma_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pocl_fma_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/pow.cl b/lib/kernel/libclc-pocl/pow.cl
new file mode 100644
index 0000000..9983792
--- /dev/null
+++ b/lib/kernel/libclc-pocl/pow.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/pow_helpers.cl b/lib/kernel/libclc-pocl/pow_helpers.cl
new file mode 100644
index 0000000..1a240f0
--- /dev/null
+++ b/lib/kernel/libclc-pocl/pow_helpers.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pow_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pow_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/pown.cl b/lib/kernel/libclc-pocl/pown.cl
new file mode 100644
index 0000000..cb42a7e
--- /dev/null
+++ b/lib/kernel/libclc-pocl/pown.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pown_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pown_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pown_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pown_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pown_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "pown_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pown_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pown_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pown_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pown_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pown_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "pown_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/powr.cl b/lib/kernel/libclc-pocl/powr.cl
new file mode 100644
index 0000000..3b79ca5
--- /dev/null
+++ b/lib/kernel/libclc-pocl/powr.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "powr_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "powr_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "powr_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "powr_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "powr_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "powr_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "powr_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "powr_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "powr_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "powr_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "powr_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "powr_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/radians.cl b/lib/kernel/libclc-pocl/radians.cl
new file mode 100644
index 0000000..68c72bf
--- /dev/null
+++ b/lib/kernel/libclc-pocl/radians.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "radians_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "radians_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "radians_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "radians_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "radians_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "radians_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "radians_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "radians_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "radians_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "radians_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "radians_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "radians_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/remainder.cl b/lib/kernel/libclc-pocl/remainder.cl
new file mode 100644
index 0000000..2ad92c0
--- /dev/null
+++ b/lib/kernel/libclc-pocl/remainder.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "remainder_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "remainder_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "remainder_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "remainder_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "remainder_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "remainder_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "remainder_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "remainder_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "remainder_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "remainder_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "remainder_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "remainder_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/remquo.cl b/lib/kernel/libclc-pocl/remquo.cl
new file mode 100644
index 0000000..641cd7a
--- /dev/null
+++ b/lib/kernel/libclc-pocl/remquo.cl
@@ -0,0 +1,708 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "remquo_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/rootn.cl b/lib/kernel/libclc-pocl/rootn.cl
new file mode 100644
index 0000000..55cbb33
--- /dev/null
+++ b/lib/kernel/libclc-pocl/rootn.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "rootn_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "rootn_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "rootn_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "rootn_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "rootn_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "rootn_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "rootn_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "rootn_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "rootn_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "rootn_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "rootn_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "rootn_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/sin.cl b/lib/kernel/libclc-pocl/sin.cl
new file mode 100644
index 0000000..f9355fd
--- /dev/null
+++ b/lib/kernel/libclc-pocl/sin.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sin_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sin_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sin_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sin_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sin_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sin_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sin_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sin_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sin_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sin_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sin_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sin_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/sincos.cl b/lib/kernel/libclc-pocl/sincos.cl
new file mode 100644
index 0000000..dcfcd7a
--- /dev/null
+++ b/lib/kernel/libclc-pocl/sincos.cl
@@ -0,0 +1,708 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#define ADDRSPACE local
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp32.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#define ADDRSPACE local
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE global
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#define ADDRSPACE private
+#include "sincos_fp64.cl"
+#undef ADDRSPACE
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/sincos_helpers.cl b/lib/kernel/libclc-pocl/sincos_helpers.cl
new file mode 100644
index 0000000..4ff2fa8
--- /dev/null
+++ b/lib/kernel/libclc-pocl/sincos_helpers.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sincos_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sincos_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sincos_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sincos_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sincos_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sincos_helpers_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sincos_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sincos_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sincos_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sincos_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sincos_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sincos_helpers_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/sinh.cl b/lib/kernel/libclc-pocl/sinh.cl
new file mode 100644
index 0000000..1a9ce94
--- /dev/null
+++ b/lib/kernel/libclc-pocl/sinh.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/sinpi.cl b/lib/kernel/libclc-pocl/sinpi.cl
new file mode 100644
index 0000000..ea9eb27
--- /dev/null
+++ b/lib/kernel/libclc-pocl/sinpi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "sinpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "sinpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/tan.cl b/lib/kernel/libclc-pocl/tan.cl
new file mode 100644
index 0000000..47e8004
--- /dev/null
+++ b/lib/kernel/libclc-pocl/tan.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tan_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tan_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/tanh.cl b/lib/kernel/libclc-pocl/tanh.cl
new file mode 100644
index 0000000..75a6217
--- /dev/null
+++ b/lib/kernel/libclc-pocl/tanh.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanh_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanh_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc-pocl/tanpi.cl b/lib/kernel/libclc-pocl/tanpi.cl
new file mode 100644
index 0000000..0fe75c7
--- /dev/null
+++ b/lib/kernel/libclc-pocl/tanpi.cl
@@ -0,0 +1,588 @@
+#include "misc.h"
+
+
+
+
+#ifdef HAVE_FMA32_32
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define SINGLEVEC
+#define vtype float
+#define v2type v2float
+#define itype int
+#define utype uint
+#define inttype int
+#define as_vtype as_float
+#define as_itype as_int
+#define as_utype as_uint
+#define convert_vtype convert_float
+#define convert_itype convert_int
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_uint
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA32_64
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float2
+#define v2type v2float2
+#define itype int2
+#define utype uint2
+#define inttype int2
+#define as_vtype as_float2
+#define as_itype as_int2
+#define as_utype as_uint2
+#define convert_vtype convert_float2
+#define convert_itype convert_int2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_uint2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_96
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float3
+#define v2type v2float3
+#define itype int3
+#define utype uint3
+#define inttype int3
+#define as_vtype as_float3
+#define as_itype as_int3
+#define as_utype as_uint3
+#define convert_vtype convert_float3
+#define convert_itype convert_int3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_uint3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_128
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float4
+#define v2type v2float4
+#define itype int4
+#define utype uint4
+#define inttype int4
+#define as_vtype as_float4
+#define as_itype as_int4
+#define as_utype as_uint4
+#define convert_vtype convert_float4
+#define convert_itype convert_int4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_uint4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_256
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float8
+#define v2type v2float8
+#define itype int8
+#define utype uint8
+#define inttype int8
+#define as_vtype as_float8
+#define as_itype as_int8
+#define as_utype as_uint8
+#define convert_vtype convert_float8
+#define convert_itype convert_int8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_uint8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+
+#ifdef HAVE_FMA32_512
+#define HAVE_FMA32 1
+#else
+#define HAVE_FMA32 0
+#endif
+#define vtype float16
+#define v2type v2float16
+#define itype int16
+#define utype uint16
+#define inttype int16
+#define as_vtype as_float16
+#define as_itype as_int16
+#define as_utype as_uint16
+#define convert_vtype convert_float16
+#define convert_itype convert_int16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_uint16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp32.h"
+#include "tanpi_fp32.cl"
+
+#undef v2type
+#undef itype4
+#undef vtype
+#undef itype
+#undef inttype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA32
+
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+
+
+#ifdef HAVE_FMA64_64
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define SINGLEVEC
+#define vtype double
+#define v2type v2double
+#define itype long
+#define utype ulong
+#define uinttype uint
+#define inttype int
+#define utype4 v4uint
+#define itype4 v4int
+#define as_vtype as_double
+#define as_itype as_long
+#define as_utype as_ulong
+#define convert_vtype convert_double
+#define convert_itype convert_long
+#define convert_inttype convert_int
+#define convert_uinttype convert_uint
+#define convert_utype convert_ulong
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#undef SINGLEVEC
+
+
+
+#ifdef HAVE_FMA64_128
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double2
+#define v2type v2double2
+#define itype long2
+#define utype ulong2
+#define uinttype uint2
+#define inttype int2
+#define utype4 v4uint2
+#define itype4 v4int2
+#define as_vtype as_double2
+#define as_itype as_long2
+#define as_utype as_ulong2
+#define convert_vtype convert_double2
+#define convert_itype convert_long2
+#define convert_inttype convert_int2
+#define convert_uinttype convert_uint2
+#define convert_utype convert_ulong2
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_192
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double3
+#define v2type v2double3
+#define itype long3
+#define utype ulong3
+#define uinttype uint3
+#define inttype int3
+#define utype4 v4uint3
+#define itype4 v4int3
+#define as_vtype as_double3
+#define as_itype as_long3
+#define as_utype as_ulong3
+#define convert_vtype convert_double3
+#define convert_itype convert_long3
+#define convert_inttype convert_int3
+#define convert_uinttype convert_uint3
+#define convert_utype convert_ulong3
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_256
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double4
+#define v2type v2double4
+#define itype long4
+#define utype ulong4
+#define uinttype uint4
+#define inttype int4
+#define utype4 v4uint4
+#define itype4 v4int4
+#define as_vtype as_double4
+#define as_itype as_long4
+#define as_utype as_ulong4
+#define convert_vtype convert_double4
+#define convert_itype convert_long4
+#define convert_inttype convert_int4
+#define convert_uinttype convert_uint4
+#define convert_utype convert_ulong4
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_512
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double8
+#define v2type v2double8
+#define itype long8
+#define utype ulong8
+#define uinttype uint8
+#define inttype int8
+#define utype4 v4uint8
+#define itype4 v4int8
+#define as_vtype as_double8
+#define as_itype as_long8
+#define as_utype as_ulong8
+#define convert_vtype convert_double8
+#define convert_itype convert_long8
+#define convert_inttype convert_int8
+#define convert_uinttype convert_uint8
+#define convert_utype convert_ulong8
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+
+
+
+#ifdef HAVE_FMA64_1024
+#define HAVE_FMA64 1
+#else
+#define HAVE_FMA64 0
+#endif
+#define vtype double16
+#define v2type v2double16
+#define itype long16
+#define utype ulong16
+#define uinttype uint16
+#define inttype int16
+#define utype4 v4uint16
+#define itype4 v4int16
+#define as_vtype as_double16
+#define as_itype as_long16
+#define as_utype as_ulong16
+#define convert_vtype convert_double16
+#define convert_itype convert_long16
+#define convert_inttype convert_int16
+#define convert_uinttype convert_uint16
+#define convert_utype convert_ulong16
+
+#include "vtables.h"
+
+#include "singlevec.h"
+
+
+#include "sincos_helpers_fp64.h"
+#include "ep_log.h"
+#include "tanpi_fp64.cl"
+
+#undef v2type
+#undef itype4
+#undef utype4
+#undef uinttype
+#undef inttype
+#undef vtype
+#undef itype
+#undef utype
+#undef as_vtype
+#undef as_itype
+#undef as_utype
+#undef convert_vtype
+#undef convert_itype
+#undef convert_inttype
+#undef convert_uinttype
+#undef convert_utype
+#undef HAVE_FMA64
+#endif
diff --git a/lib/kernel/libclc/ROCM_LICENSE.txt b/lib/kernel/libclc/ROCM_LICENSE.txt
new file mode 100644
index 0000000..396a079
--- /dev/null
+++ b/lib/kernel/libclc/ROCM_LICENSE.txt
@@ -0,0 +1,44 @@
+/*
+ * University of Illinois/NCSA
+ * Open Source License
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Copyright (c) 2014-2016, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Developed by:
+ *
+ *     AMD Research and AMD HSA Software Development
+ *
+ *     Advanced Micro Devices, Inc.
+ *
+ *     www.amd.com
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
+ * this software and associated documentation files (the "Software"), to deal with
+ * the Software without restriction, including without limitation the rights to
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is furnished to do
+ * so, subject to the following conditions:
+ *
+ *     * Redistributions of source code must retain the above copyright notice,
+ *       this list of conditions and the following disclaimers.
+ *
+ *     * Redistributions in binary form must reproduce the above copyright notice,
+ *       this list of conditions and the following disclaimers in the
+ *       documentation and/or other materials provided with the distribution.
+ *
+ *     * Neither the names of the LLVM Team, University of Illinois at
+ *       Urbana-Champaign, nor the names of its contributors may be used to
+ *       endorse or promote products derived from this Software without specific
+ *       prior written permission.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+ * SOFTWARE.
+ */
\ No newline at end of file
diff --git a/lib/kernel/libclc/acosh_fp32.cl b/lib/kernel/libclc/acosh_fp32.cl
new file mode 100644
index 0000000..9c56047
--- /dev/null
+++ b/lib/kernel/libclc/acosh_fp32.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE  vtype acosh(vtype x) {
+    utype ux = as_utype(x);
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by acosh(x) = ln(2) + ln(x)
+    // For 2.0 <= x <= 1/sqrt(epsilon) the approximation is
+    // acosh(x) = ln(x + sqrt(x*x-1)) */
+    itype high = (ux > (utype)0x46000000);
+    itype med = (ux > (utype)0x40000000);
+
+    vtype w = x - (vtype)1.0f;
+    vtype s = w*w + 2.0f*w;
+    vtype t = x*x - (vtype)1.0f;
+    vtype r = sqrt(select(s, t, med)) + select(w, x, med);
+    vtype v = select(r, x, high) - select((vtype)0.0f, (vtype)1.0f, med);
+    vtype z = log1p(v) + select((vtype)0.0f, (vtype)0x1.62e430p-1f, high);
+
+    z = select(z, x, (ux >= (utype)PINFBITPATT_SP32));
+    z = select(z, as_vtype((utype)QNANBITPATT_SP32), (x < (vtype)1.0f));
+
+    return z;
+}
diff --git a/lib/kernel/libclc/acosh_fp64.cl b/lib/kernel/libclc/acosh_fp64.cl
new file mode 100644
index 0000000..422dff2
--- /dev/null
+++ b/lib/kernel/libclc/acosh_fp64.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype acosh(vtype x) {
+    const vtype recrteps = (vtype)0x1.6a09e667f3bcdp+26;  // 1/sqrt(eps) = (vtype)9.49062656242515593767e+07
+    //log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const vtype log2_lead = (vtype)0x1.62e42ep-1;
+    const vtype log2_tail = (vtype)0x1.efa39ef35793cp-25;
+
+    // Handle x >= 128 here
+    itype xlarge = (x > recrteps);
+    vtype r = x + sqrt(pocl_fma(x, x, (vtype)-1.0));
+    r = xlarge ? x : r;
+
+    itype xexp;
+    vtype r1, r2;
+    __pocl_ep_log(r, &xexp, &r1, &r2);
+
+    itype xlarge2 = (x > recrteps) ? (itype)1 : (itype)0;
+    vtype dxexp = convert_vtype(xexp + xlarge2);
+    r1 = pocl_fma(dxexp, log2_lead, r1);
+    r2 = pocl_fma(dxexp, log2_tail, r2);
+
+    vtype ret1 = r1 + r2;
+
+    // Handle 1 < x < 128 here
+    // We compute the value
+    // t = x - 1.0 + sqrt(2.0*(x - 1.0) + (x - 1.0)*(x - 1.0))
+    // using simulated quad precision.
+    vtype t = x - (vtype)1.0;
+    vtype u1 = t * 2.0;
+
+    // (t,0) * (t,0) -> (v1, v2)
+    vtype v1 = t * t;
+    vtype v2 = pocl_fma(t, t, -v1);
+
+    // (u1,0) + (v1,v2) -> (w1,w2)
+    r = u1 + v1;
+    vtype s = (((u1 - r) + v1) + v2);
+    vtype w1 = r + s;
+    vtype w2 = (r - w1) + s;
+
+    // sqrt(w1,w2) -> (u1,u2)
+    vtype p1 = sqrt(w1);
+    vtype a1 = p1*p1;
+    vtype a2 = pocl_fma(p1, p1, -a1);
+    vtype temp = (((w1 - a1) - a2) + w2);
+    vtype p2 = MATH_DIVIDE(temp * 0.5, p1);
+    u1 = p1 + p2;
+    vtype u2 = (p1 - u1) + p2;
+
+    // (u1,u2) + (t,0) -> (r1,r2)
+    r = u1 + t;
+    s = ((u1 - r) + t) + u2;
+    // r1 = r + s;
+    // r2 = (r - r1) + s;
+    // t = r1 + r2;
+    t = r + s;
+
+    // For arguments 1.13 <= x <= 1.5 the log1p function is good enough
+    vtype ret2 = log1p(t);
+
+    utype ux = as_utype(x);
+    vtype ret = (x >= (vtype)128.0) ? ret1 : ret2;
+
+    ret = (x == (vtype)1.0) ? (vtype)0.0 : ret;
+
+    ret = (ux >= (utype)EXPBITS_DP64) ? x : ret;
+
+    vtype nans = as_vtype((utype)QNANBITPATT_DP64);
+    itype retnans = ((ux & (utype)(SIGNBIT_DP64)) != 0);
+    retnans |= ((itype)(x < (vtype)1.0));
+    ret = retnans ? nans : ret;
+
+    return ret;
+}
diff --git a/lib/kernel/libclc/acospi_fp32.cl b/lib/kernel/libclc/acospi_fp32.cl
new file mode 100644
index 0000000..d1590e5
--- /dev/null
+++ b/lib/kernel/libclc/acospi_fp32.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype acospi(vtype x) {
+    // Computes arccos(x).
+    // The argument is first reduced by noting that arccos(x)
+    // is invalid for abs(x) > 1. For denormal and small
+    // arguments arccos(x) = pi/2 to machine accuracy.
+    // Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arccos(x) = pi/2 - arcsin(x)
+    // = pi/2 - (x + x^3*R(x^2))
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+
+    // Some constants and split constants.
+    const vtype pi = (vtype)3.1415926535897933e+00f;
+    const vtype piby2_head = (vtype)1.5707963267948965580e+00f;  /* 0x3ff921fb54442d18 */
+    const vtype piby2_tail = (vtype)6.12323399573676603587e-17f; /* 0x3c91a62633145c07 */
+
+    utype ux = as_utype(x);
+    utype aux = ux & (utype)~SIGNBIT_SP32;
+    itype xneg = (ux != aux);
+    itype xexp = as_itype(aux >> EXPSHIFTBITS_SP32) - (itype)EXPBIAS_SP32;
+
+    vtype y = as_vtype(aux);
+
+    // transform if |x| >= 0.5
+    itype transform = (xexp >= (itype)-1);
+
+    vtype y2 = y * y;
+    vtype yt = 0.5f * ((vtype)1.0f - y);
+    vtype r = transform ? yt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    vtype a = pocl_fma(r,
+                pocl_fma(r,
+                  pocl_fma(r,
+                    (vtype)-0.00396137437848476485201154797087F,
+                    (vtype)-0.0133819288943925804214011424456F),
+                  (vtype)-0.0565298683201845211985026327361F),
+                (vtype)0.184161606965100694821398249421F);
+    vtype b = pocl_fma(r,
+                (vtype)-0.836411276854206731913362287293F,
+                (vtype)1.10496961524520294485512696706F);
+    vtype u = r * MATH_DIVIDE(a, b);
+
+    vtype s = MATH_SQRT(r);
+    y = s;
+    vtype s1 = as_vtype(as_utype(s) & (utype)0xffff0000);
+    vtype c = MATH_DIVIDE(r - s1 * s1, s + s1);
+    vtype rettn = (vtype)1.0f - MATH_DIVIDE(2.0f * (s + pocl_fma(y, u, -piby2_tail)), pi);
+    vtype rettp = MATH_DIVIDE(2.0f*(s1 + pocl_fma(y, u, c)), pi);
+    vtype rett = xneg ? rettn : rettp;
+    vtype ret = MATH_DIVIDE(piby2_head - (x - pocl_fma(x, -u, piby2_tail)), pi);
+
+    ret = transform ? rett : ret;
+    ret = (aux > (utype)0x3f800000U) ? as_vtype((utype)QNANBITPATT_SP32) : ret;
+    ret = (ux == (utype)0x3f800000U) ? (vtype)0.0f : ret;
+    ret = (ux == (utype)0xbf800000U) ? (vtype)1.0f : ret;
+    ret = (xexp < (itype)-26) ? (vtype)0.5f : ret;
+    return ret;
+}
diff --git a/lib/kernel/libclc/acospi_fp64.cl b/lib/kernel/libclc/acospi_fp64.cl
new file mode 100644
index 0000000..b5bc70a
--- /dev/null
+++ b/lib/kernel/libclc/acospi_fp64.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype acospi(vtype x) {
+    // Computes arccos(x).
+    // The argument is first reduced by noting that arccos(x)
+    // is invalid for abs(x) > 1. For denormal and small
+    // arguments arccos(x) = pi/2 to machine accuracy.
+    // Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arccos(x) = pi/2 - arcsin(x)
+    // = pi/2 - (x + x^3*R(x^2))
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arccos(x) = pi - 2*arcsin(sqrt(1-x)/2)
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+    const vtype pi = (vtype)0x1.921fb54442d18p+1;
+    const vtype piby2_tail = (vtype)6.12323399573676603587e-17;        /* 0x3c91a62633145c07 */
+
+    vtype y = fabs(x);
+
+    itype xneg = (as_itype(x) < (itype)0);
+    itype xexp = (as_itype(y) >> 52) - (itype)EXPBIAS_DP64;
+
+    // abs(x) >= 0.5
+    itype transform = (xexp >= -1);
+
+    // Transform y into the range [0,0.5)
+    vtype r1 = (vtype)0.5 * ((vtype)1.0 - y);
+    vtype s = sqrt(r1);
+    vtype r = y * y;
+    r = transform ? r1 : r;
+    y = transform ? s : y;
+
+    // Use a rational approximation for [0.0, 0.5]
+    vtype un = pocl_fma(r,
+                    pocl_fma(r,
+                        pocl_fma(r,
+                            pocl_fma(r,
+                                pocl_fma(r, 0.0000482901920344786991880522822991,
+                                       0.00109242697235074662306043804220),
+                                -0.0549989809235685841612020091328),
+                            0.275558175256937652532686256258),
+                        -0.445017216867635649900123110649),
+                    0.227485835556935010735943483075);
+
+    vtype ud = pocl_fma(r,
+                    pocl_fma(r,
+                        pocl_fma(r,
+                            pocl_fma(r, 0.105869422087204370341222318533,
+                                   -0.943639137032492685763471240072),
+                            2.76568859157270989520376345954),
+                        -3.28431505720958658909889444194),
+                    1.36491501334161032038194214209);
+
+    vtype u = r * MATH_DIVIDE(un, ud);
+
+    // Reconstruct acos carefully in transformed region
+    vtype res1 = pocl_fma((vtype)-2.0,
+                   MATH_DIVIDE(s + pocl_fma(y, u, -piby2_tail), pi),
+                   (vtype)1.0);
+    vtype s1 = as_vtype(as_utype(s) & (utype)0xffffffff00000000UL);
+    vtype c = MATH_DIVIDE(pocl_fma(-s1, s1, r), s + s1);
+    vtype res2 = MATH_DIVIDE(pocl_fma((vtype)2.0, s1, pocl_fma((vtype)2.0, c, (vtype)2.0 * y * u)), pi);
+    res1 = xneg ? res1 : res2;
+    res2 = (vtype)0.5 - pocl_fma(x, u, x) / pi;
+    res1 = transform ? res1 : res2;
+
+    const vtype qnan = as_vtype((utype)QNANBITPATT_DP64);
+    res2 = (x == (vtype)1.0) ? (vtype)0.0 : qnan;
+    res2 = (x == (vtype)-1.0) ? (vtype)1.0 : res2;
+    res1 = (xexp >= (itype)0) ? res2 : res1;
+    res1 = (xexp < (itype)-56) ? (vtype)0.5 : res1;
+
+    return res1;
+}
diff --git a/lib/kernel/libclc/asinh_fp32.cl b/lib/kernel/libclc/asinh_fp32.cl
new file mode 100644
index 0000000..dcca673
--- /dev/null
+++ b/lib/kernel/libclc/asinh_fp32.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype asinh(vtype x) {
+    utype ux = as_utype(x);
+    utype ax = ux & (utype)EXSIGNBIT_SP32;
+    utype xsgn = ax ^ ux;
+
+    // |x| <= 2
+    vtype t = x * x;
+    vtype a = pocl_fma(t,
+                pocl_fma(t,
+                  pocl_fma(t,
+                    pocl_fma(t,
+                      (vtype)-1.177198915954942694e-4f,
+                      (vtype)-4.162727710583425360e-2f),
+                    (vtype)-5.063201055468483248e-1f),
+                  (vtype)-1.480204186473758321f),
+                (vtype)-1.152965835871758072f);
+    vtype b = pocl_fma(t,
+                pocl_fma(t,
+                  pocl_fma(t,
+                    pocl_fma(t,
+                      (vtype)6.284381367285534560e-2f,
+                      (vtype)1.260024978680227945f),
+                    (vtype)6.582362487198468066f),
+                  (vtype)11.99423176003939087f),
+                (vtype)6.917795026025976739f);
+
+    vtype q = MATH_DIVIDE(a, b);
+    vtype z1 = pocl_fma(x*t, q, x);
+
+    // |x| > 2
+
+    // Arguments greater than 1/sqrt(epsilon) in magnitude are
+    // approximated by asinh(x) = ln(2) + ln(abs(x)), with sign of x
+    // Arguments such that 4.0 <= abs(x) <= 1/sqrt(epsilon) are
+    // approximated by asinhf(x) = ln(abs(x) + sqrt(x*x+1))
+    // with the sign of x (see Abramowitz and Stegun 4.6.20)
+
+    vtype absx = as_vtype(ax);
+    itype hi = (ax > 0x46000000U);
+    vtype y = MATH_SQRT(absx * absx + (vtype)1.0f) + absx;
+    y = hi ? absx : y;
+    vtype r = log(y) + (hi ? (vtype)0x1.62e430p-1f : (vtype)0.0f);
+    vtype z2 = as_vtype(xsgn | as_utype(r));
+
+    vtype z = (ax <= (utype)0x40000000) ? z1 : z2;
+    z = ((ax < (utype)0x39800000U) | (ax >= (utype)PINFBITPATT_SP32)) ? x : z;
+
+    return z;
+}
diff --git a/lib/kernel/libclc/asinh_fp64.cl b/lib/kernel/libclc/asinh_fp64.cl
new file mode 100644
index 0000000..2288901
--- /dev/null
+++ b/lib/kernel/libclc/asinh_fp64.cl
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+#define NA0  (vtype)-0.12845379283524906084997e0
+#define NA1  (vtype)-0.21060688498409799700819e0
+#define NA2  (vtype)-0.10188951822578188309186e0
+#define NA3  (vtype)-0.13891765817243625541799e-1
+#define NA4  (vtype)-0.10324604871728082428024e-3
+
+#define DA0  (vtype)0.77072275701149440164511e0
+#define DA1  (vtype)0.16104665505597338100747e1
+#define DA2  (vtype)0.11296034614816689554875e1
+#define DA3  (vtype)0.30079351943799465092429e0
+#define DA4  (vtype)0.235224464765951442265117e-1
+
+#define NB0  (vtype)-0.12186605129448852495563e0
+#define NB1  (vtype)-0.19777978436593069928318e0
+#define NB2  (vtype)-0.94379072395062374824320e-1
+#define NB3  (vtype)-0.12620141363821680162036e-1
+#define NB4  (vtype)-0.903396794842691998748349e-4
+
+#define DB0  (vtype)0.73119630776696495279434e0
+#define DB1  (vtype)0.15157170446881616648338e1
+#define DB2  (vtype)0.10524909506981282725413e1
+#define DB3  (vtype)0.27663713103600182193817e0
+#define DB4  (vtype)0.21263492900663656707646e-1
+
+#define NC0  (vtype)-0.81210026327726247622500e-1
+#define NC1  (vtype)-0.12327355080668808750232e0
+#define NC2  (vtype)-0.53704925162784720405664e-1
+#define NC3  (vtype)-0.63106739048128554465450e-2
+#define NC4  (vtype)-0.35326896180771371053534e-4
+
+#define DC0  (vtype)0.48726015805581794231182e0
+#define DC1  (vtype)0.95890837357081041150936e0
+#define DC2  (vtype)0.62322223426940387752480e0
+#define DC3  (vtype)0.15028684818508081155141e0
+#define DC4  (vtype)0.10302171620320141529445e-1
+
+#define ND0  (vtype)-0.4638179204422665073e-1
+#define ND1  (vtype)-0.7162729496035415183e-1
+#define ND2  (vtype)-0.3247795155696775148e-1
+#define ND3  (vtype)-0.4225785421291932164e-2
+#define ND4  (vtype)-0.3808984717603160127e-4
+#define ND5  (vtype)0.8023464184964125826e-6
+
+#define DD0  (vtype)0.2782907534642231184e0
+#define DD1  (vtype)0.5549945896829343308e0
+#define DD2  (vtype)0.3700732511330698879e0
+#define DD3  (vtype)0.9395783438240780722e-1
+#define DD4  (vtype)0.7200057974217143034e-2
+
+#define NE0  (vtype)-0.121224194072430701e-4
+#define NE1  (vtype)-0.273145455834305218e-3
+#define NE2  (vtype)-0.152866982560895737e-2
+#define NE3  (vtype)-0.292231744584913045e-2
+#define NE4  (vtype)-0.174670900236060220e-2
+#define NE5  (vtype)-0.891754209521081538e-12
+
+#define DE0  (vtype)0.499426632161317606e-4
+#define DE1  (vtype)0.139591210395547054e-2
+#define DE2  (vtype)0.107665231109108629e-1
+#define DE3  (vtype)0.325809818749873406e-1
+#define DE4  (vtype)0.415222526655158363e-1
+#define DE5  (vtype)0.186315628774716763e-1
+
+#define NF0   (vtype)-0.195436610112717345e-4
+#define NF1   (vtype)-0.233315515113382977e-3
+#define NF2   (vtype)-0.645380957611087587e-3
+#define NF3   (vtype)-0.478948863920281252e-3
+#define NF4   (vtype)-0.805234112224091742e-12
+#define NF5   (vtype)0.246428598194879283e-13
+
+#define DF0   (vtype)0.822166621698664729e-4
+#define DF1   (vtype)0.135346265620413852e-2
+#define DF2   (vtype)0.602739242861830658e-2
+#define DF3   (vtype)0.972227795510722956e-2
+#define DF4   (vtype)0.510878800983771167e-2
+
+#define NG0   (vtype)-0.209689451648100728e-6
+#define NG1   (vtype)-0.219252358028695992e-5
+#define NG2   (vtype)-0.551641756327550939e-5
+#define NG3   (vtype)-0.382300259826830258e-5
+#define NG4   (vtype)-0.421182121910667329e-17
+#define NG5   (vtype)0.492236019998237684e-19
+
+#define DG0   (vtype)0.889178444424237735e-6
+#define DG1   (vtype)0.131152171690011152e-4
+#define DG2   (vtype)0.537955850185616847e-4
+#define DG3   (vtype)0.814966175170941864e-4
+#define DG4   (vtype)0.407786943832260752e-4
+
+#define NH0   (vtype)-0.178284193496441400e-6
+#define NH1   (vtype)-0.928734186616614974e-6
+#define NH2   (vtype)-0.923318925566302615e-6
+#define NH3   (vtype)-0.776417026702577552e-19
+#define NH4   (vtype)0.290845644810826014e-21
+
+#define DH0   (vtype)0.786694697277890964e-6
+#define DH1   (vtype)0.685435665630965488e-5
+#define DH2   (vtype)0.153780175436788329e-4
+#define DH3   (vtype)0.984873520613417917e-5
+
+#define NI0   (vtype)-0.538003743384069117e-10
+#define NI1   (vtype)-0.273698654196756169e-9
+#define NI2   (vtype)-0.268129826956403568e-9
+#define NI3   (vtype)-0.804163374628432850e-29
+
+#define DI0   (vtype)0.238083376363471960e-9
+#define DI1   (vtype)0.203579344621125934e-8
+#define DI2   (vtype)0.450836980450693209e-8
+#define DI3   (vtype)0.286005148753497156e-8
+
+_CL_OVERLOADABLE vtype asinh(vtype x) {
+    const vtype rteps = (vtype)0x1.6a09e667f3bcdp-27;
+    const vtype recrteps = (vtype)0x1.6a09e667f3bcdp+26;
+
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const vtype log2_lead = (vtype)0x1.62e42ep-1;
+    const vtype log2_tail = (vtype)0x1.efa39ef35793cp-25;
+
+    utype ux = as_utype(x);
+    utype ax = ux & (utype)~SIGNBIT_DP64;
+    vtype absx = as_vtype(ax);
+
+    vtype t = x * x;
+    vtype pn, tn, pd, td;
+
+    // XXX we are betting here that we can evaluate 8 pairs of
+    // polys faster than we can grab 12 coefficients from a table
+    // This also uses fewer registers
+
+    // |x| >= 8
+    pn = pocl_fma(t, pocl_fma(t, pocl_fma(t, NI3, NI2), NI1), NI0);
+    pd = pocl_fma(t, pocl_fma(t, pocl_fma(t, DI3, DI2), DI1), DI0);
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NH4, NH3), NH2), NH1), NH0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, DH3, DH2), DH1), DH0);
+    pn = (absx < (vtype)8.0) ? tn : pn;
+    pd = (absx < (vtype)8.0) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NG5, NG4), NG3), NG2), NG1), NG0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DG4, DG3), DG2), DG1), DG0);
+    pn = (absx < (vtype)4.0) ? tn : pn;
+    pd = (absx < (vtype)4.0) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NF5, NF4), NF3), NF2), NF1), NF0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DF4, DF3), DF2), DF1), DF0);
+    pn = (absx < (vtype)2.0) ? tn : pn;
+    pd = (absx < (vtype)2.0) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NE5, NE4), NE3), NE2), NE1), NE0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DE5, DE4), DE3), DE2), DE1), DE0);
+    pn = (absx < (vtype)1.5) ? tn : pn;
+    pd = (absx < (vtype)1.5) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, ND5, ND4), ND3), ND2), ND1), ND0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DD4, DD3), DD2), DD1), DD0);
+    pn = (absx <= (vtype)1.0) ? tn : pn;
+    pd = (absx <= (vtype)1.0) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NC4, NC3), NC2), NC1), NC0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DC4, DC3), DC2), DC1), DC0);
+    pn = (absx < (vtype)0.75) ? tn : pn;
+    pd = (absx < (vtype)0.75) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NB4, NB3), NB2), NB1), NB0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DB4, DB3), DB2), DB1), DB0);
+    pn = (absx < (vtype)0.5) ? tn : pn;
+    pd = (absx < (vtype)0.5) ? td : pd;
+
+    tn = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, NA4, NA3), NA2), NA1), NA0);
+    td = pocl_fma(t, pocl_fma(t, pocl_fma(t, pocl_fma(t, DA4, DA3), DA2), DA1), DA0);
+    pn = (absx < (vtype)0.25) ? tn : pn;
+    pd = (absx < (vtype)0.25) ? td : pd;
+
+    vtype pq = MATH_DIVIDE(pn, pd);
+
+    // |x| <= 1
+    vtype result1 = pocl_fma(absx*t, pq, absx);
+
+    // Other ranges
+    itype xout = (absx <= (vtype)32.0) | (absx > recrteps);
+    vtype y = absx + sqrt(pocl_fma(absx, absx, (vtype)1.0));
+    y = xout ? absx : y;
+
+    vtype r1, r2;
+    itype xexp;
+    __pocl_ep_log(y, &xexp, &r1, &r2);
+
+    itype xout2 = (xout ? (itype)1 : (itype)0);
+    vtype dxexp = convert_vtype(xexp + xout2);
+    r1 = pocl_fma(dxexp, log2_lead, r1);
+    r2 = pocl_fma(dxexp, log2_tail, r2);
+
+    // 1 < x <= 32
+    vtype v2 = (pq + (vtype)0.25) / t;
+    vtype r = v2 + r1;
+    vtype s = ((r1 - r) + v2) + r2;
+    vtype v1 = r + s;
+    v2 = (r - v1) + s;
+    vtype result2 = v1 + v2;
+
+    // x > 32
+    vtype result3 = r1 + r2;
+
+    vtype ret = (absx > (vtype)1.0) ? result2 : result1;
+    ret = (absx > (vtype)32.0) ? result3 : ret;
+    ret = (x < (vtype)0.0) ? -ret : ret;
+
+    // NaN, +-Inf, or x small enough that asinh(x) = x
+    ret = ((ax >= (utype)PINFBITPATT_DP64) | (absx < rteps)) ? x : ret;
+    return ret;
+}
diff --git a/lib/kernel/libclc/asinpi_fp32.cl b/lib/kernel/libclc/asinpi_fp32.cl
new file mode 100644
index 0000000..11790bb
--- /dev/null
+++ b/lib/kernel/libclc/asinpi_fp32.cl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype asinpi(vtype x) {
+    // Computes arcsin(x).
+    // The argument is first reduced by noting that arcsin(x)
+    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+    // For denormal and small arguments arcsin(x) = x to machine
+    // accuracy. Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arcsin(x) = x + x^3*R(x^2)
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+
+    const vtype pi = (vtype)3.1415926535897933e+00f;
+    const vtype piby2_tail = (vtype)7.5497894159e-08F;   /* 0x33a22168 */
+    const vtype hpiby2_head = (vtype)7.8539812565e-01F;  /* 0x3f490fda */
+
+    utype ux = as_utype(x);
+    utype aux = ux & (utype)EXSIGNBIT_SP32;
+    utype xs = ux ^ aux;
+    vtype shalf = as_vtype(xs | as_utype((vtype)0.5f));
+    itype xexp = as_itype(aux >> EXPSHIFTBITS_SP32) - (itype)EXPBIAS_SP32;
+    vtype y = as_vtype(aux);
+
+    // abs(x) >= 0.5
+    itype transform = (xexp >= (itype)-1);
+
+    vtype y2 = y * y;
+    vtype rt = (vtype)0.5f * ((vtype)1.0f - y);
+    vtype r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    vtype a = pocl_fma(r,
+                pocl_fma(r,
+                  pocl_fma(r,
+                    (vtype)-0.00396137437848476485201154797087F,
+                    (vtype)-0.0133819288943925804214011424456F),
+                  (vtype)-0.0565298683201845211985026327361F),
+                (vtype)0.184161606965100694821398249421F);
+    vtype b = pocl_fma(r,
+                (vtype)-0.836411276854206731913362287293F,
+                (vtype)1.10496961524520294485512696706F);
+    vtype u = r * MATH_DIVIDE(a, b);
+
+    vtype s = MATH_SQRT(r);
+    vtype s1 = as_vtype(as_utype(s) & (utype)0xffff0000);
+    vtype c = MATH_DIVIDE(pocl_fma(-s1, s1, r), s + s1);
+    vtype p = pocl_fma((vtype)2.0f * s, u, -pocl_fma(c, (vtype)-2.0f, piby2_tail));
+    vtype q = pocl_fma(s1, (vtype)-2.0f, hpiby2_head);
+    vtype vt = hpiby2_head - (p - q);
+    vtype v = pocl_fma(y, u, y);
+    v = transform ? vt : v;
+    v = MATH_DIVIDE(v, pi);
+    vtype xbypi = MATH_DIVIDE(x, pi);
+
+    vtype ret = as_vtype(xs | as_utype(v));
+    ret = (aux > (utype)0x3f800000U) ? as_vtype((utype)QNANBITPATT_SP32) : ret;
+    ret = (aux == (utype)0x3f800000U) ? shalf : ret;
+    ret = (xexp < (itype)-14) ? xbypi : ret;
+
+    return ret;
+}
diff --git a/lib/kernel/libclc/asinpi_fp64.cl b/lib/kernel/libclc/asinpi_fp64.cl
new file mode 100644
index 0000000..51b19ca
--- /dev/null
+++ b/lib/kernel/libclc/asinpi_fp64.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+
+_CL_OVERLOADABLE vtype asinpi(vtype x) {
+    // Computes arcsin(x).
+    // The argument is first reduced by noting that arcsin(x)
+    // is invalid for abs(x) > 1 and arcsin(-x) = -arcsin(x).
+    // For denormal and small arguments arcsin(x) = x to machine
+    // accuracy. Remaining argument ranges are handled as follows.
+    // For abs(x) <= 0.5 use
+    // arcsin(x) = x + x^3*R(x^2)
+    // where R(x^2) is a rational minimax approximation to
+    // (arcsin(x) - x)/x^3.
+    // For abs(x) > 0.5 exploit the identity:
+    // arcsin(x) = pi/2 - 2*arcsin(sqrt(1-x)/2)
+    // together with the above rational approximation, and
+    // reconstruct the terms carefully.
+
+    const vtype pi = (vtype)0x1.921fb54442d18p+1;
+    const vtype piby2_tail = (vtype)6.1232339957367660e-17; /* 0x3c91a62633145c07 */
+    const vtype hpiby2_head = (vtype)7.8539816339744831e-01;  /* 0x3fe921fb54442d18 */
+
+    vtype y = fabs(x);
+    itype xneg = (as_itype(x) < (itype)0);
+    itype xexp = (as_itype(y) >> 52) - (itype)EXPBIAS_DP64;
+
+    // abs(x) >= 0.5
+    itype transform = (xexp >= (itype)-1);
+
+    vtype rt = (vtype)0.5 * ((vtype)1.0 - y);
+    vtype y2 = y * y;
+    vtype r = transform ? rt : y2;
+
+    // Use a rational approximation for [0.0, 0.5]
+    vtype un = pocl_fma(r,
+                    pocl_fma(r,
+                        pocl_fma(r,
+                            pocl_fma(r,
+                                pocl_fma(r, (vtype)0.0000482901920344786991880522822991,
+                                       (vtype)0.00109242697235074662306043804220),
+                                (vtype)-0.0549989809235685841612020091328),
+                            (vtype)0.275558175256937652532686256258),
+                        (vtype)-0.445017216867635649900123110649),
+                    (vtype)0.227485835556935010735943483075);
+
+    vtype ud = pocl_fma(r,
+                    pocl_fma(r,
+                        pocl_fma(r,
+                            pocl_fma(r, (vtype)0.105869422087204370341222318533,
+                                   (vtype)-0.943639137032492685763471240072),
+                            (vtype)2.76568859157270989520376345954),
+                        (vtype)-3.28431505720958658909889444194),
+                    (vtype)1.36491501334161032038194214209);
+
+    vtype u = r * MATH_DIVIDE(un, ud);
+
+
+    // Reconstruct asin carefully in transformed region
+    vtype s = sqrt(r);
+    vtype sh = as_vtype(as_utype(s) & (utype)0xffffffff00000000UL);
+    vtype c = MATH_DIVIDE(pocl_fma(-sh, sh, r), s + sh);
+    vtype p = pocl_fma((2.0 * s), u, -pocl_fma((vtype)-2.0, c, piby2_tail));
+    vtype q = pocl_fma((vtype)-2.0, sh, hpiby2_head);
+    vtype vt = hpiby2_head - (p - q);
+    vtype v = pocl_fma(y, u, y);
+    v = transform ? vt : v;
+
+    v = (xexp < (itype)-28) ? y : v;
+    v = MATH_DIVIDE(v, pi);
+    v = (xexp >= (itype)0) ? as_vtype((utype)QNANBITPATT_DP64) : v;
+    v = (y == (vtype)1.0) ? (vtype)0.5 : v;
+    return xneg ? -v : v;
+}
diff --git a/lib/kernel/libclc/atan2pi_fp32.cl b/lib/kernel/libclc/atan2pi_fp32.cl
new file mode 100644
index 0000000..434f1d5
--- /dev/null
+++ b/lib/kernel/libclc/atan2pi_fp32.cl
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+
+_CL_OVERLOADABLE  vtype atan2pi(vtype y, vtype x) {
+    const vtype pi = (vtype)0x1.921fb6p+1f;
+
+    vtype ax = fabs(x);
+    vtype ay = fabs(y);
+    vtype v = min(ax, ay);
+    vtype u = max(ax, ay);
+
+    // Scale since u could be large, as in "regular" divide
+    vtype s = (u > (vtype)0x1.0p+96f) ? (vtype)0x1.0p-32f : (vtype)1.0f;
+    vtype vbyu = s * MATH_DIVIDE(v, s*u);
+
+    vtype vbyu2 = vbyu * vbyu;
+
+    vtype p = pocl_fma(vbyu2,
+                pocl_fma(vbyu2,
+                    (vtype)-0x1.7e1f78p-9f,
+                    (vtype)-0x1.7d1b98p-3f),
+                (vtype)-0x1.5554d0p-2f)
+                * vbyu2 * vbyu;
+    vtype q = pocl_fma(vbyu2,
+                pocl_fma(vbyu2,
+                  (vtype)0x1.1a714cp-2f,
+                  (vtype)0x1.287c56p+0f),
+                  (vtype)1.0f);
+
+    // Octant 0 result
+    vtype a = MATH_DIVIDE(pocl_fma(p, MATH_RECIP(q), vbyu), pi);
+
+    // Fix up 3 other octants
+    vtype at = (vtype)0.5f - a;
+    a = (ay > ax) ? at : a;
+    at = (vtype)1.0f - a;
+    a = (x < (vtype)0.0f) ? at : a;
+
+    // y == 0 => 0 for x >= 0, pi for x < 0
+    at = (as_itype(x) & (itype)SIGNBIT_SP32) ? (vtype)1.0f : (vtype)0.0f;
+    a = (y == (vtype)0.0f) ? at : a;
+
+    // x and y are +- Inf
+    at = (x > (vtype)0.0f) ? (vtype)0.25f : (vtype)0.75f;
+    a = ((ax == (vtype)INFINITY) & (ay == (vtype)INFINITY)) ? at : a;
+
+    // x or y is NaN
+    a = (isnan(x) | isnan(y)) ? as_vtype((utype)QNANBITPATT_SP32) : a;
+
+    // Fixup sign and return
+    return copysign(a, y);
+}
diff --git a/lib/kernel/libclc/atan2pi_fp64.cl b/lib/kernel/libclc/atan2pi_fp64.cl
new file mode 100644
index 0000000..b342668
--- /dev/null
+++ b/lib/kernel/libclc/atan2pi_fp64.cl
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype atan2pi(vtype y, vtype x) {
+    const vtype pi = (vtype)3.1415926535897932e+00;          /* 0x400921fb54442d18 */
+    const vtype pi_head = (vtype)3.1415926218032836e+00;     /* 0x400921fb50000000 */
+    const vtype pi_tail = (vtype)3.1786509547056392e-08;     /* 0x3e6110b4611a6263 */
+    const vtype piby2_head = (vtype)1.5707963267948965e+00;  /* 0x3ff921fb54442d18 */
+    const vtype piby2_tail = (vtype)6.1232339957367660e-17;  /* 0x3c91a62633145c07 */
+
+    vtype x2 = x;
+    itype xneg = (as_itype(x) & (itype)SIGNBIT_DP64);
+    itype xexp = ((as_itype(x) & (itype)EXPBITS_DP64) >> 52);
+    vtype y2 = y;
+    itype yneg = (as_itype(y) & (itype)SIGNBIT_DP64);
+    itype yexp = ((as_itype(y) & (itype)EXPBITS_DP64) >> 52);
+    itype diffexp = yexp - xexp;
+
+    // Scale up both x and y if they are both below 1/4
+    vtype x1 = ldexp(x, 1024);
+    itype xexp1 = ((as_itype(x1) & (itype)EXPBITS_DP64) >> 52);
+    vtype y1 = ldexp(y, 1024);
+    itype yexp1 = ((as_itype(y1) & (itype)EXPBITS_DP64) >> 52);
+    itype diffexp1 = yexp1 - xexp1;
+
+    itype cond2 = (xexp < 1021) & (yexp < 1021);
+    diffexp = cond2 ? diffexp1 : diffexp;
+    x = cond2 ? x1 : x;
+    y = cond2 ? y1 : y;
+
+    // General case: take absolute values of arguments
+    vtype u = fabs(x);
+    vtype v = fabs(y);
+
+    // Swap u and v if necessary to obtain 0 < v < u. Compute v/u.
+    itype swap_vu = u < v;
+    vtype uu = u;
+    u = swap_vu ? v : u;
+    v = swap_vu ? uu : v;
+
+    vtype vbyu = v / u;
+    vtype q1, q2;
+
+    // General values of v/u. Use a look-up table and series expansion.
+    {
+        vtype val = (vbyu > (vtype)0.0625) ? vbyu : (vtype)0.063;
+        itype index = convert_itype(pocl_fma((vtype)256.0, val, (vtype)0.5));
+        v2type tv = USE_VTABLE(atan_jby256_tbl, convert_uinttype(index - 16));
+        q1 = tv.lo; // s0
+        q2 = tv.hi; // s1
+        vtype c = convert_vtype(index) * 0x1.0p-8;
+
+        // We're going to scale u and v by 2^(-u_exponent) to bring them close to 1
+        // u_exponent could be EMAX so we have to do it in 2 steps
+        itype m = -(as_itype(as_utype(u) >> EXPSHIFTBITS_DP64) - (itype)EXPBIAS_DP64);
+        vtype um = ldexp(u, convert_inttype(m));
+        vtype vm = ldexp(v, convert_inttype(m));
+
+        // 26 leading bits of u
+        vtype u1 = as_vtype(as_utype(um) & (utype)0xfffffffff8000000UL);
+        vtype u2 = um - u1;
+
+        vtype r = MATH_DIVIDE(pocl_fma(-c, u2, pocl_fma(-c, u1, vm)), pocl_fma(c, vm, um));
+
+        // Polynomial approximation to atan(r)
+        vtype s = r * r;
+        q2 = q2 + pocl_fma((s * pocl_fma(-s, (vtype)0.19999918038989143496, (vtype)0.33333333333224095522)), -r, r);
+    }
+
+
+    vtype q3, q4;
+    {
+        q3 = 0.0;
+        q4 = vbyu;
+    }
+
+    vtype q5, q6;
+    {
+        vtype u1 = as_vtype(as_utype(u) & (utype)0xffffffff00000000UL);
+        vtype u2 = u - u1;
+        vtype vu1 = as_vtype(as_utype(vbyu) & (utype)0xffffffff00000000UL);
+        vtype vu2 = vbyu - vu1;
+
+        q5 = 0.0;
+        vtype s = vbyu * vbyu;
+        q6 = vbyu + pocl_fma(-vbyu * s,
+                        pocl_fma(-s,
+                            pocl_fma(-s,
+                                pocl_fma(-s,
+                                    pocl_fma(-s, (vtype)0.90029810285449784439E-01,
+                                        (vtype)0.11110736283514525407),
+                                    (vtype)0.14285713561807169030),
+                                (vtype)0.19999999999393223405),
+                            (vtype)0.33333333333333170500),
+       MATH_DIVIDE(pocl_fma(-u, vu2, pocl_fma(-u2, vu1, pocl_fma(-u1, vu1, v))), u));
+    }
+
+
+    q3 = vbyu < (vtype)0x1.d12ed0af1a27fp-27 ? q3 : q5;
+    q4 = vbyu < (vtype)0x1.d12ed0af1a27fp-27 ? q4 : q6;
+
+    q1 = vbyu > (vtype)0.0625 ? q1 : q3;
+    q2 = vbyu > (vtype)0.0625 ? q2 : q4;
+
+    // Tidy-up according to which quadrant the arguments lie in
+    vtype res1, res2, res3, res4;
+    q1 = swap_vu ? piby2_head - q1 : q1;
+    q2 = swap_vu ? piby2_tail - q2 : q2;
+    q1 = xneg ? pi_head - q1 : q1;
+    q2 = xneg ? pi_tail - q2 : q2;
+    q1 = MATH_DIVIDE(q1 + q2, pi);
+    res4 = yneg ? -q1 : q1;
+
+    res1 = yneg ? (vtype)-0.75 : (vtype)0.75;
+    res2 = yneg ? (vtype)-0.25 : (vtype)0.25;
+    res3 = xneg ? res1 : res2;
+
+    res3 = isinf(y2) & isinf(x2) ? res3 : res4;
+    res1 = yneg ? (vtype)-1.0 : (vtype)1.0;
+
+    // abs(x)/abs(y) > 2^56 and x < 0
+    res3 = ((diffexp < (itype)-56) && xneg) ? res1 : res3;
+
+    res4 = MATH_DIVIDE(MATH_DIVIDE(y, x), pi);
+    // x positive and dominant over y by a factor of 2^28
+    itype xpos = xneg ^ (itype)SIGNBIT_DP64;
+    res3 = ((diffexp < (itype)-28) & xpos) ? res4 : res3;
+
+    // abs(y)/abs(x) > 2^56
+    res4 = yneg ? (vtype)-0.5 : (vtype)0.5;        // atan(y/x) is insignificant compared to piby2
+    res3 = (diffexp > (itype)56) ? res4 : res3;
+
+    res3 = (x2 == (vtype)0.0) ? res4 : res3;  // Zero x gives +- pi/2 depending on sign of y
+    res4 = xneg ? res1 : y2;
+
+    res3 = (y2 == (vtype)0.0) ? res4 : res3;  // Zero y gives +-0 for positive x and +-pi for negative x
+    res3 = isnan(y2) ? y2 : res3;
+    res3 = isnan(x2) ? x2 : res3;
+
+    return res3;
+}
diff --git a/lib/kernel/libclc/atanh_fp32.cl b/lib/kernel/libclc/atanh_fp32.cl
new file mode 100644
index 0000000..1e522c0
--- /dev/null
+++ b/lib/kernel/libclc/atanh_fp32.cl
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype atanh(vtype x) {
+    utype ux = as_utype(x);
+    utype ax = ux & (utype)EXSIGNBIT_SP32;
+    utype xs = ux ^ ax;
+
+    // |x| > 1 or NaN
+    vtype z = as_vtype((utype)QNANBITPATT_SP32);
+
+    // |x| == 1
+    vtype t = as_vtype(xs | (utype)PINFBITPATT_SP32);
+    z = (ax == (utype)0x3f800000U) ? t : z;
+
+    // 1/2 <= |x| < 1
+    t = as_vtype(ax);
+    t = MATH_DIVIDE(2.0f*t, (vtype)1.0f - t);
+    t = 0.5f * log1p(t);
+    t = as_vtype(xs | as_utype(t));
+    z = (ax < (utype)0x3f800000U) ? t : z;
+
+    // |x| < 1/2
+    t = x * x;
+    vtype a = pocl_fma(
+                pocl_fma((vtype)0.92834212715e-2f,
+                  t, (vtype)-0.28120347286e0f),
+                t, (vtype)0.39453629046e0f);
+    vtype b = pocl_fma(
+                pocl_fma((vtype)0.45281890445e0f,
+                  t, (vtype)-0.15537744551e1f),
+                t, (vtype)0.11836088638e1f);
+    vtype p = MATH_DIVIDE(a, b);
+    t = pocl_fma(x*t, p, x);
+    z = (ax < (utype)0x3f000000) ? t : z;
+
+    // |x| < 2^(vtype)-1.
+    z = (ax < (utype)0x39000000U) ? x : z;
+
+    return z;
+}
diff --git a/lib/kernel/libclc/atanh_fp64.cl b/lib/kernel/libclc/atanh_fp64.cl
new file mode 100644
index 0000000..b9580e6
--- /dev/null
+++ b/lib/kernel/libclc/atanh_fp64.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype atanh(vtype x) {
+    vtype absx = fabs(x);
+
+    vtype ret = (absx == (vtype)1.0) ? as_vtype((utype)PINFBITPATT_DP64) : as_vtype((utype)QNANBITPATT_DP64);
+
+    // |x| >= 0.5
+    // Note that atanh(x) = 0.5 * ln((1+x)/(1-x))
+    // For greater accuracy we use
+    // ln((1+x)/(1-x)) = ln(1 + 2x/(1-x)) = log1p(2x/(1-x)).
+    vtype r = 0.5 * log1p((2.0 * absx) / ((vtype)1.0 - absx));
+    ret = (absx < (vtype)1.0) ? r : ret;
+
+    r = -ret;
+    ret = (x < (vtype)0.0) ? r : ret;
+
+    // Arguments up to 0.5 in magnitude are
+    // approximated by a [5,5] minimax polynomial
+    vtype t = x * x;
+
+    vtype pn = pocl_fma(t,
+                    pocl_fma(t,
+                        pocl_fma(t,
+                            pocl_fma(t,
+                                pocl_fma(t,
+                                  (vtype)-0.10468158892753136958e-3,
+                                  (vtype)0.28728638600548514553e-1),
+                                (vtype)-0.28180210961780814148e0),
+                            (vtype)0.88468142536501647470e0),
+                        (vtype)-0.11028356797846341457e1),
+                    (vtype)0.47482573589747356373e0);
+
+    vtype pd = pocl_fma(t,
+                    pocl_fma(t,
+                        pocl_fma(t,
+                            pocl_fma(t,
+                                pocl_fma(t,
+                                  (vtype)-0.35861554370169537512e-1,
+                                  (vtype)0.49561196555503101989e0),
+                                (vtype)-0.22608883748988489342e1),
+                            (vtype)0.45414700626084508355e1),
+                        (vtype)-0.41631933639693546274e1),
+                    (vtype)0.14244772076924206909e1);
+
+    r = pocl_fma(x*t, pn/pd, x);
+    ret = (absx < (vtype)0.5) ? r : ret;
+
+    return ret;
+}
diff --git a/lib/kernel/libclc/atanpi_fp32.cl b/lib/kernel/libclc/atanpi_fp32.cl
new file mode 100644
index 0000000..04c6d45
--- /dev/null
+++ b/lib/kernel/libclc/atanpi_fp32.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+
+_CL_OVERLOADABLE vtype atanpi(vtype x) {
+    const vtype pi = (vtype)M_PI_F;
+
+    utype ux = as_utype(x);
+    utype aux = ux & (utype)EXSIGNBIT_SP32;
+    utype sx = ux ^ aux;
+
+    vtype xbypi = MATH_DIVIDE(x, pi);
+    vtype shalf = as_vtype(sx | as_utype((vtype)0.5f));
+
+    vtype v = as_vtype(aux);
+
+    // Return for NaN
+    vtype ret = x;
+
+    // 2^26 <= |x| <= Inf => atan(x) is close to piby2
+    ret = (aux <= (utype)PINFBITPATT_SP32) ? shalf : ret;
+
+    // Reduce arguments 2^-19 <= |x| < 2^26
+
+    // 39/16 <= x < 2^26
+    x = -MATH_RECIP(v);
+    vtype c = (vtype)1.57079632679489655800f; // atan(infinity)
+
+    // 19/16 <= x < 39/16
+    itype l = (aux < (utype)0x401c0000);
+    vtype xx = MATH_DIVIDE(v - (vtype)1.5f, pocl_fma(v, (vtype)1.5f, (vtype)1.0f));
+    x = l ? xx : x;
+    c = l ? (vtype)9.82793723247329054082e-1f : c; // atan(1.5)
+
+    // 11/16 <= x < 19/16
+    l = (aux < (utype)0x3f980000U);
+    xx =  MATH_DIVIDE(v - (vtype)1.0f, (vtype)1.0f + v);
+    x = l ? xx : x;
+    c = l ? (vtype)7.85398163397448278999e-1f : c; // atan(1)
+
+    // 7/16 <= x < 11/16
+    l = (aux < (utype)0x3f300000);
+    xx = MATH_DIVIDE(pocl_fma(v, (vtype)2.0f, (vtype)-1.0f), (vtype)2.0f + v);
+    x = l ? xx : x;
+    c = l ? (vtype)4.63647609000806093515e-1f: c; // atan(0.5)
+
+    // 2^-19 <= x < 7/16
+    l = (aux < (utype)0x3ee00000);
+    x = l ? v : x;
+    c = l ? (vtype)0.0f : c;
+
+    // Core approximation: Remez(2,2) on [-7/16,7/16]
+
+    vtype s = x * x;
+    vtype a = pocl_fma(s,
+                  pocl_fma(s,
+                    (vtype)0.470677934286149214138357545549e-2f,
+                    (vtype)0.192324546402108583211697690500f),
+                  (vtype)0.296528598819239217902158651186f);
+
+    vtype b = pocl_fma(s,
+                  pocl_fma(s,
+                    (vtype)0.299309699959659728404442796915f,
+                    (vtype)0.111072499995399550138837673349e1f),
+                  (vtype)0.889585796862432286486651434570f);
+
+    vtype q = x * s * MATH_DIVIDE(a, b);
+
+    vtype z = c - (q - x);
+    z = MATH_DIVIDE(z, pi);
+    vtype zs = as_vtype(sx | as_utype(z));
+
+    ret  = (aux< (utype)0x4c800000) ? zs : ret;
+
+    // |x| < 2^e
+    ret = (aux< (utype)0x36000000) ? xbypi : ret;
+    return ret;
+}
diff --git a/lib/kernel/libclc/atanpi_fp64.cl b/lib/kernel/libclc/atanpi_fp64.cl
new file mode 100644
index 0000000..a9435ac
--- /dev/null
+++ b/lib/kernel/libclc/atanpi_fp64.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype atanpi(vtype x) {
+    const vtype pi = (vtype)M_PI;
+
+    vtype v = fabs(x);
+
+    // 2^56 > v > 39/16
+    vtype a = (vtype)-1.0;
+    vtype b = v;
+    // (chi + clo) = arctan(infinity)
+    vtype chi = (vtype)1.57079632679489655800e+00;
+    vtype clo = (vtype)6.12323399573676480327e-17;
+
+    vtype ta = v - (vtype)1.5;
+    vtype tb = (vtype)1.0 + (vtype)1.5 * v;
+    itype l = (v <= (vtype)0x1.38p+1); // 39/16 > v > 19/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.5)
+    chi = l ? (vtype)9.82793723247329054082e-01 : chi;
+    clo = l ? (vtype)1.39033110312309953701e-17 : clo;
+
+    ta = v - (vtype)1.0;
+    tb = (vtype)1.0 + v;
+    l = (v <= (vtype)0x1.3p+0); // 19/16 > v > 11/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(1.)
+    chi = l ? (vtype)7.85398163397448278999e-01 : chi;
+    clo = l ? (vtype)3.06161699786838240164e-17 : clo;
+
+    ta = (vtype)2.0 * v - (vtype)1.0;
+    tb = (vtype)2.0 + v;
+    l = (v <= (vtype)0x1.6p-1); // 11/16 > v > 7/16
+    a = l ? ta : a;
+    b = l ? tb : b;
+    // (chi + clo) = arctan(0.5)
+    chi = l ? (vtype)4.63647609000806093515e-01 : chi;
+    clo = l ? (vtype)2.26987774529616809294e-17 : clo;
+
+    l = (v <= (vtype)0x1.cp-2); // v < 7/16
+    a = l ? v : a;
+    b = l ? (vtype)1.0 : b;;
+    chi = l ? (vtype)0.0 : chi;
+    clo = l ? (vtype)0.0 : clo;
+
+    // Core approximation: Remez(4,4) on [-7/16,7/16]
+    vtype r = a / b;
+    vtype s = r * r;
+    vtype qn = pocl_fma(s,
+                    pocl_fma(s,
+                        pocl_fma(s,
+                            pocl_fma(s,
+                                (vtype)0.142316903342317766e-3,
+                                (vtype)0.304455919504853031e-1),
+                            (vtype)0.220638780716667420e0),
+                        (vtype)0.447677206805497472e0),
+                    (vtype)0.268297920532545909e0);
+
+    vtype qd = pocl_fma(s,
+                 pocl_fma(s,
+                   pocl_fma(s,
+                     pocl_fma(s,
+                       (vtype)0.389525873944742195e-1,
+                       (vtype)0.424602594203847109e0),
+                     (vtype)0.141254259931958921e1),
+                   (vtype)0.182596787737507063e1),
+                 (vtype)0.804893761597637733e0);
+
+    vtype q = r * s * qn / qd;
+    r = (chi - ((q - clo) - r)) / pi;
+    vtype vp = v / pi;
+
+    vtype z = isnan(x) ? x : (vtype)0.5;
+    z = (v <= (vtype)0x1.0p+56) ? r : z;
+    z = (v < (vtype)0x1.0p-26) ? vp : z;
+    return x == v ? z : -z;
+}
diff --git a/lib/kernel/libclc/cos_fp32.cl b/lib/kernel/libclc/cos_fp32.cl
new file mode 100644
index 0000000..425e7d9
--- /dev/null
+++ b/lib/kernel/libclc/cos_fp32.cl
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype cos(vtype x)
+{
+    itype ix = as_itype(x);
+    itype ax = ix & (itype)EXSIGNBIT_SP32;
+    vtype dx = as_vtype(ax);
+
+    vtype r0, r1;
+    itype regn = __pocl_argReductionS(&r0, &r1, dx);
+
+    vtype ss = -__pocl_sinf_piby4(r0, r1);
+    vtype cc =  __pocl_cosf_piby4(r0, r1);
+
+    vtype c = (regn << 31) ? ss : cc;
+    itype t = ((regn >> 1) << 31);
+    c = as_vtype(as_itype(c) ^ t);
+
+    c = (ax >= (itype)PINFBITPATT_SP32) ? as_vtype((utype)QNANBITPATT_SP32) : c;
+
+    //Subnormals
+    c = (x == 0.0f) ? (vtype)1.0f : c;
+
+    return c;
+}
diff --git a/lib/kernel/libclc/cos_fp64.cl b/lib/kernel/libclc/cos_fp64.cl
new file mode 100644
index 0000000..cd2360c
--- /dev/null
+++ b/lib/kernel/libclc/cos_fp64.cl
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype cos(vtype x) {
+    x = fabs(x);
+
+    vtype r, rr, r2, rr2;
+    itype regn, regn2;
+
+    __pocl_remainder_piby2_medium(x, &r, &rr, &regn);
+    itype cond = (x >= (vtype)0x1.0p+47);
+    if (SV_ANY(cond)) {
+        __pocl_remainder_piby2_large(x, &r2, &rr2, &regn2);
+        regn = cond ? regn2 : regn;
+        r = cond ? r2 : r;
+        rr = cond ? rr2 : rr;
+    }
+    v2type sc = __pocl_sincos_piby4(r, rr);
+
+    itype ss = as_itype(-sc.lo);
+    itype cc = as_itype(sc.hi);
+
+    itype c = (regn << 63) ? ss : cc;
+    c ^= ((regn >> 1) << 63);
+
+    return (isnan(x) | isinf(x)) ? as_vtype((utype)QNANBITPATT_DP64) : as_vtype(c);
+}
diff --git a/lib/kernel/libclc/cosh_fp32.cl b/lib/kernel/libclc/cosh_fp32.cl
new file mode 100644
index 0000000..58c8c01
--- /dev/null
+++ b/lib/kernel/libclc/cosh_fp32.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype cosh(vtype x) {
+
+    // After dealing with special cases the computation is split into regions as follows.
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = sign(x)*Inf
+    // abs(x) >= small_threshold:
+    // cosh(x) = sign(x)*exp(abs(x))/2 computed using the
+    // splitexp and scaleDouble functions as for exp_amd().
+    // abs(x) < small_threshold:
+    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
+    // cosh(x) is then z.
+
+    const vtype max_cosh_arg = (vtype)0x1.65a9fap+6f;
+    const vtype small_threshold = (vtype)0x1.0a2b24p+3f;
+
+    utype ux = as_utype(x);
+    utype aux = ux & (utype)EXSIGNBIT_SP32;
+    vtype y = as_vtype(aux);
+
+    // Find the integer part y0 of y and the increment dy = y - y0. We then compute
+    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are tabulated above.
+
+    vtype indv = trunc(y);
+    utype indi = convert_utype(indv);
+    indi = (indi > (utype)36) ? (utype)0 : indi;
+
+    vtype dy = y - indv;
+    vtype dy2 = dy * dy;
+
+    vtype sdy = pocl_fma(dy2,
+                    pocl_fma(dy2,
+                        pocl_fma(dy2,
+                            pocl_fma(dy2,
+                                pocl_fma(dy2,
+                                    pocl_fma(dy2,
+                                      (vtype)0.7746188980094184251527126e-12f,
+                                      (vtype)0.160576793121939886190847e-9f),
+                                    (vtype)0.250521176994133472333666e-7f),
+                                (vtype)0.275573191913636406057211e-5f),
+                            (vtype)0.198412698413242405162014e-3f),
+                        (vtype)0.833333333333329931873097e-2f),
+                    (vtype)0.166666666666666667013899e0f);
+    sdy = pocl_fma(sdy, dy*dy2, dy);
+
+    vtype cdy = pocl_fma(dy2,
+                    pocl_fma(dy2,
+                        pocl_fma(dy2,
+                            pocl_fma(dy2,
+                                pocl_fma(dy2,
+                                    pocl_fma(dy2,
+                                      (vtype)0.1163921388172173692062032e-10f,
+                                      (vtype)0.208744349831471353536305e-8f),
+                                    (vtype)0.275573350756016588011357e-6f),
+                                (vtype)0.248015872460622433115785e-4f),
+                            (vtype)0.138888888889814854814536e-2f),
+                        (vtype)0.416666666666660876512776e-1f),
+                    (vtype)0.500000000000000005911074e0f);
+
+    cdy = pocl_fma(cdy, dy2, (vtype)1.0f);
+
+    v2type tv = USE_VTABLE(sinhcosh_tbl, indi);
+    vtype z = pocl_fma(tv.lo, sdy, tv.hi * cdy);
+
+    // When exp(-x) is insignificant compared to exp(x), return exp(x)/2
+    vtype t = exp(y - (vtype)0x1.62e500p-1f);
+    vtype zsmall = pocl_fma((vtype)0x1.a0210ep-18f, t, t);
+    z = (y >= small_threshold) ? zsmall : z;
+
+    // Corner cases
+    z = (y >= max_cosh_arg) ? as_vtype((utype)PINFBITPATT_SP32) : z;
+    z = (aux > (utype)PINFBITPATT_SP32) ? as_vtype((utype)QNANBITPATT_SP32) : z;
+    z = (aux < (utype)0x38800000) ? (vtype)1.0f : z;
+
+    return z;
+}
diff --git a/lib/kernel/libclc/cosh_fp64.cl b/lib/kernel/libclc/cosh_fp64.cl
new file mode 100644
index 0000000..bd4979d
--- /dev/null
+++ b/lib/kernel/libclc/cosh_fp64.cl
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype cosh(vtype x) {
+
+    // After dealing with special cases the computation is split into
+    // regions as follows:
+    //
+    // abs(x) >= max_cosh_arg:
+    // cosh(x) = sign(x)*Inf
+    //
+    // abs(x) >= small_threshold:
+    // cosh(x) = sign(x)*exp(abs(x))/2 computed using the
+    // splitexp and scaleDouble functions as for exp_amd().
+    //
+    // abs(x) < small_threshold:
+    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
+    // cosh(x) is then sign(x)*z.
+
+    // This is ln(2^1025)
+    const vtype max_cosh_arg = (vtype)7.10475860073943977113e+02;      // 0x408633ce8fb9f87e
+
+    // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
+    const vtype small_threshold = (vtype)0x1.2b708872320e2p+4;
+
+    vtype y = fabs(x);
+
+    // In this range we find the integer part y0 of y
+    // and the increment dy = y - y0. We then compute
+    // z = cosh(y) = cosh(y0)cosh(dy) + sinh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are tabulated above.
+
+    vtype indv = trunc(y);
+    itype indi = convert_itype(indv);
+    indi = min((itype)indi, (itype)36U);
+
+    vtype dy = y - indv;
+    vtype dy2 = dy * dy;
+
+    vtype sdy = dy * dy2 *
+          pocl_fma(dy2,
+            pocl_fma(dy2,
+              pocl_fma(dy2,
+                pocl_fma(dy2,
+                  pocl_fma(dy2,
+                    pocl_fma(dy2,
+                      (vtype)0.7746188980094184251527126e-12,
+                      (vtype)0.160576793121939886190847e-9),
+                    (vtype)0.250521176994133472333666e-7),
+                  (vtype)0.275573191913636406057211e-5),
+                (vtype)0.198412698413242405162014e-3),
+              (vtype)0.833333333333329931873097e-2),
+            (vtype)0.166666666666666667013899e0);
+
+    vtype cdy = dy2 *
+         pocl_fma(dy2,
+           pocl_fma(dy2,
+             pocl_fma(dy2,
+               pocl_fma(dy2,
+                 pocl_fma(dy2,
+                   pocl_fma(dy2,
+                     (vtype)0.1163921388172173692062032e-10,
+                     (vtype)0.208744349831471353536305e-8),
+                   (vtype)0.275573350756016588011357e-6),
+                 (vtype)0.248015872460622433115785e-4),
+               (vtype)0.138888888889814854814536e-2),
+             (vtype)0.416666666666660876512776e-1),
+           (vtype)0.500000000000000005911074e0);
+
+    // At this point sinh(dy) is approximated by dy + sdy,
+    // and cosh(dy) is approximated by 1 + cdy.
+
+    v2type tv = USE_VTABLE(cosh_tbl, convert_uinttype(indi));
+    vtype cl = tv.lo;
+    vtype ct = tv.hi;
+
+    tv = USE_VTABLE(sinh_tbl, convert_uinttype(indi));
+    vtype sl = tv.lo;
+    vtype st = tv.hi;
+
+    vtype z = pocl_fma(sl, dy,
+                pocl_fma(sl, sdy,
+                  pocl_fma(cl, cdy,
+                    pocl_fma(st, dy,
+                      pocl_fma(st, sdy, ct*cdy))
+                    + ct))) + cl;
+
+    // Other cases
+    z = (y < (vtype)0x1.0p-28) ? (vtype)1.0 : z;
+
+    vtype t = exp(y - (vtype)0x1.62e42fefa3800p-1);
+    t = pocl_fma(t, (vtype)-0x1.ef35793c76641p-45, t);
+    z = (y >= small_threshold) ? t : z;
+
+    z = (y >= max_cosh_arg) ? as_vtype((utype)PINFBITPATT_DP64) : z;
+
+    z = (isinf(x) | isnan(x)) ? y : z;
+
+    return z;
+
+}
diff --git a/lib/kernel/libclc/cospi_fp32.cl b/lib/kernel/libclc/cospi_fp32.cl
new file mode 100644
index 0000000..b8a7e09
--- /dev/null
+++ b/lib/kernel/libclc/cospi_fp32.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype cospi(vtype x)
+{
+
+    itype ix = as_itype(x) & (itype)EXSIGNBIT_SP32;
+    vtype ax = as_vtype(ix);
+    vtype iaxv = trunc(ax);
+    itype iaxi = convert_itype(iaxv);
+    vtype r = ax - iaxv;
+    itype xodd = ((iaxi & (itype)0x1) << 31);
+
+    // Initialize with return for +-Inf and NaN
+    itype ir = (itype)QNANBITPATT_SP32;
+
+    // 2^23 <= |x| < Inf, the result is always integer
+    ir = (ix < (itype)(EXPBITS_SP32)) ? (itype)ONEEXPBITS_SP32 : ir;
+
+    // 2^23 <= |x| < 2^24, the result is always integer
+    ir = (ix < (itype)0x4b800000) ? (xodd | (itype)ONEEXPBITS_SP32) : ir;
+
+    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+    // r < 1.0
+    vtype a = (vtype)1.0f - r;
+    itype e = (itype)(-1);
+    itype s = xodd ^ (itype)SIGNBIT_SP32;
+
+    // r <= 0.75
+    itype c = (r <= (vtype)0.75f);
+    a = c ? (r - (vtype)0.5f) : a;
+    e = c ? (itype)0 : e;
+
+    // r < 0.5
+    c = (r < (vtype)0.5f);
+    a = c ? ((vtype)0.5f - r) : a;
+    s = c ? xodd : s;
+
+    // r <= 0.25
+    c = (r <= 0.25f);
+    a = c ? r : a;
+    e = c ? (itype)(-1) : e;
+
+    v2type t = __pocl_sincosf_piby4(a * M_PI_F);
+    itype jr = s ^ as_itype(e ? t.hi : t.lo);
+
+    ir = (ix < (itype)0x4b000000) ? jr : ir;
+
+    return as_vtype(ir);
+}
diff --git a/lib/kernel/libclc/cospi_fp64.cl b/lib/kernel/libclc/cospi_fp64.cl
new file mode 100644
index 0000000..b61972d
--- /dev/null
+++ b/lib/kernel/libclc/cospi_fp64.cl
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype cospi(vtype x) {
+
+    itype ix = as_itype(x) & (itype)EXSIGNBIT_DP64;
+    vtype ax = as_vtype(ix);
+    vtype iaxv = trunc(ax);
+    itype iaxi = convert_itype(iaxv);
+    vtype r = ax - iaxv;
+    itype xodd = ((iaxi & (itype)1) << 63);
+
+    // Initialize with return for +-Inf and NaN
+    itype ir = (itype)QNANBITPATT_DP64;
+
+    // 2^53 <= |x| < Inf, the result is always even integer
+    ir = (ix < (itype)EXPBITS_DP64) ? (itype)ONEEXPBITS_DP64 : ir;
+
+    // 2^52 <= |x| < 2^53, the result is always integer
+    ir = (ax < (vtype)0x1.0p+53) ? (xodd | (itype)ONEEXPBITS_DP64) : ir;
+
+    // 0x1.0p-7 <= |x| < 2^52, result depends on which 0.25 interval
+
+    // r < 1.0
+    vtype a = (vtype)1.0 - r;
+    itype e = (itype)(-1);
+    itype s = xodd ^ (itype)SIGNBIT_DP64;
+
+    // r <= 0.75
+    itype c = (r <= (vtype)0.75);
+    a = c ? (r - (vtype)0.5) : a;
+    e = c ? (itype)0 : e;
+
+    // r < 0.5
+    c = (r < (vtype)0.5);
+    a = c ? ((vtype)0.5 - r) : a;
+    s = c ? xodd : s;
+
+    // r <= 0.25
+    c = (r <= 0.25);
+    a = c ? r : a;
+    e = c ? (itype)(-1) : e;
+
+    v2type sc = __pocl_sincos_piby4(a * M_PI, (vtype)0.0);
+    itype jr = s ^ as_itype(e ? sc.hi : sc.lo);
+
+    ir = (ax < (vtype)0x1.0p+52) ? jr : ir;
+
+    return as_vtype(ir);
+}
diff --git a/lib/kernel/libclc/degrees_fp32.cl b/lib/kernel/libclc/degrees_fp32.cl
new file mode 100644
index 0000000..403757f
--- /dev/null
+++ b/lib/kernel/libclc/degrees_fp32.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype degrees(vtype radians) {
+  // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F
+  return 0x1.ca5dc2p+5F * radians;
+}
diff --git a/lib/kernel/libclc/degrees_fp64.cl b/lib/kernel/libclc/degrees_fp64.cl
new file mode 100644
index 0000000..facfad4
--- /dev/null
+++ b/lib/kernel/libclc/degrees_fp64.cl
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype degrees(vtype radians) {
+  // 180/pi = ~57.29577951308232087685 or 0x1.ca5dc1a63c1f8p+5 or 0x1.ca5dc2p+5F
+  return 0x1.ca5dc1a63c1f8p+5 * radians;
+}
diff --git a/lib/kernel/libclc/ep_log.h b/lib/kernel/libclc/ep_log.h
new file mode 100644
index 0000000..ebb7d0f
--- /dev/null
+++ b/lib/kernel/libclc/ep_log.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE void __pocl_ep_log(vtype x, itype *xexp, vtype *r1, vtype *r2);
diff --git a/lib/kernel/libclc/ep_log_fp32.cl b/lib/kernel/libclc/ep_log_fp32.cl
new file mode 100644
index 0000000..e69de29
diff --git a/lib/kernel/libclc/ep_log_fp64.cl b/lib/kernel/libclc/ep_log_fp64.cl
new file mode 100644
index 0000000..3ca188a
--- /dev/null
+++ b/lib/kernel/libclc/ep_log_fp64.cl
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define LN0 (vtype)8.33333333333317923934e-02
+#define LN1 (vtype)1.25000000037717509602e-02
+#define LN2 (vtype)2.23213998791944806202e-03
+#define LN3 (vtype)4.34887777707614552256e-04
+
+#define LF0 (vtype)8.33333333333333593622e-02
+#define LF1 (vtype)1.24999999978138668903e-02
+#define LF2 (vtype)2.23219810758559851206e-03
+
+_CL_OVERLOADABLE void __pocl_ep_log(vtype x, itype *xexp, vtype *r1, vtype *r2)
+{
+    // Computes natural log(x). Algorithm based on:
+    // Ping-Tak Peter Tang
+    // "Table-driven implementation of the logarithm function in IEEE
+    // vtypeing-point arithmetic"
+    // ACM Transactions on Mathematical Software (TOMS)
+    // Volume 16, Issue 4 (December 1990)
+    itype near_one = (x >= (vtype)0x1.e0faap-1) & (x <= (vtype)0x1.1082cp+0);
+
+    utype ux = as_utype(x);
+    utype uxs = as_utype(as_vtype(as_utype((utype)0x03d0000000000000UL) | ux) - (vtype)0x1.0p-962);
+    itype c = (ux < (utype)IMPBIT_DP64);
+    ux = c ? uxs : ux;
+    itype expadjust = c ? (itype)60 : (itype)0;
+
+    // Store the exponent of x in xexp and put f into the range [0.5,1)
+    itype xexp1 = ((as_itype(ux) >> 52) & 0x7ff) - (itype)EXPBIAS_DP64 - expadjust;
+    vtype f = as_vtype(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64));
+    *xexp = near_one ? (itype)0 : xexp1;
+
+    vtype r = x - (vtype)1.0;
+    vtype u1 = MATH_DIVIDE(r, (vtype)2.0 + r);
+    vtype ru1 = -r * u1;
+    u1 = u1 + u1;
+
+    itype index = as_itype(ux) >> 45; // 13 + 32
+    index = (((itype)0x80 | (index & (itype)0x7e)) >> 1) + (index & (itype)0x1);
+
+    vtype f1 = convert_vtype(index) * 0x1.0p-7;
+    vtype f2 = f - f1;
+    vtype u2 = MATH_DIVIDE(f2, pocl_fma((vtype)0.5, f2, f1));
+
+    v2type tv = USE_VTABLE(ln_tbl, convert_uinttype(index - (itype)64));
+    vtype z1 = tv.lo;
+    vtype q = tv.hi;
+
+    z1 = near_one ? r : z1;
+    q = near_one ? (vtype)0.0 : q;
+    vtype u = near_one ? u1 : u2;
+    vtype v = u*u;
+
+    vtype cc = near_one ? ru1 : u2;
+
+    vtype z21 = pocl_fma(v, pocl_fma(v, pocl_fma(v, LN3, LN2), LN1), LN0);
+    vtype z22 = pocl_fma(v, pocl_fma(v, LF2, LF1), LF0);
+    vtype z2 = near_one ? z21 : z22;
+    z2 = pocl_fma(u*v, z2, cc) + q;
+
+    *r1 = z1;
+    *r2 = z2;
+}
diff --git a/lib/CL/clReleaseDevice.c b/lib/kernel/libclc/expfrexp_fp32.cl
similarity index 65%
copy from lib/CL/clReleaseDevice.c
copy to lib/kernel/libclc/expfrexp_fp32.cl
index b48b29e..566d08e 100644
--- a/lib/CL/clReleaseDevice.c
+++ b/lib/kernel/libclc/expfrexp_fp32.cl
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseDevice()
+/* OpenCL built-in library: expfrexp()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Pekka Jääskeläinen / TUT
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,20 +21,20 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 
+_CL_OVERLOADABLE _CL_ALWAYSINLINE itype
+_cl_expfrexp(vtype x)
 {
-  if (device->parent_device == NULL)
-    return CL_SUCCESS;
-
-  int new_refcount;
-  POCL_RELEASE_OBJECT (device, new_refcount);
+  itype ret = (itype)0;
+  // denorms
+  itype cond = (fabs(x) < (vtype)FLT_MIN);
+  x = cond ? (x * 0x1p30f) : x;
+  ret = cond ? (ret - (itype)30) : ret;
 
-  if (new_refcount == 0)
-    POCL_MEM_FREE(device);
+  //ret += (as_itype((as_utype(x) >> 23) & (utype)0xFF) - (itype)0x7E);
+  ret += (as_itype( (as_utype(x) << 1) >> 23 ) - (itype)0x7E);
 
-  return CL_SUCCESS;
+  ret = (x == (vtype)0.0f) ? (itype)0 : ret;
+  ret = (isnan(x) | isinf(x)) ? (itype)0 : ret;
+  return ret;
 }
-POsym(clReleaseDevice)
diff --git a/lib/CL/clReleaseDevice.c b/lib/kernel/libclc/expfrexp_fp64.cl
similarity index 65%
copy from lib/CL/clReleaseDevice.c
copy to lib/kernel/libclc/expfrexp_fp64.cl
index b48b29e..f95590c 100644
--- a/lib/CL/clReleaseDevice.c
+++ b/lib/kernel/libclc/expfrexp_fp64.cl
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseDevice()
+/* OpenCL built-in library: expfrexp()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Pekka Jääskeläinen / TUT
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,20 +21,20 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 
+_CL_OVERLOADABLE _CL_ALWAYSINLINE itype
+_cl_expfrexp(vtype x)
 {
-  if (device->parent_device == NULL)
-    return CL_SUCCESS;
-
-  int new_refcount;
-  POCL_RELEASE_OBJECT (device, new_refcount);
+  itype ret = (itype)0;
+  // denorms
+  itype cond = (fabs(x) < (vtype)DBL_MIN);
+  x = cond ? (x * 0x1p63) : x;
+  ret = cond ? (ret - (itype)63) : ret;
 
-  if (new_refcount == 0)
-    POCL_MEM_FREE(device);
+  //ret += (as_itype((as_utype(x) >> 52) & (utype)0x7FF) - (itype)0x3FE);
+  ret += (as_itype( (as_utype(x) << 1) >> 52 ) - (itype)0x3FE);
 
-  return CL_SUCCESS;
+  ret = (x == (vtype)0.0) ? (itype)0 : ret;
+  ret = (isnan(x) | isinf(x)) ? (itype)0 : ret;
+  return ret;
 }
-POsym(clReleaseDevice)
diff --git a/lib/kernel/libclc/fmod_fp32.cl b/lib/kernel/libclc/fmod_fp32.cl
new file mode 100644
index 0000000..f5b04df
--- /dev/null
+++ b/lib/kernel/libclc/fmod_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_FMOD
+
+#include "remainder_base_fp32.cl"
+
+#undef COMPILING_FMOD
diff --git a/lib/kernel/libclc/fmod_fp64.cl b/lib/kernel/libclc/fmod_fp64.cl
new file mode 100644
index 0000000..4a7a754
--- /dev/null
+++ b/lib/kernel/libclc/fmod_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_FMOD
+
+#include "remainder_base_fp64.cl"
+
+#undef COMPILING_FMOD
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/libclc/frexp_fp32.cl
similarity index 77%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/libclc/frexp_fp32.cl
index 3c75ca1..74d459e 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/libclc/frexp_fp32.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: frexp()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,11 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
-
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+_CL_OVERLOADABLE itype _cl_expfrexp(vtype x);
+_CL_OVERLOADABLE vtype _cl_frfrexp(vtype x);
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+_CL_OVERLOADABLE vtype frexp(vtype x, inttype ADDRSPACE *exp)
+{
+  *exp = _cl_expfrexp(x);
+  return _cl_frfrexp(x);
+}
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/libclc/frexp_fp64.cl
similarity index 76%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/libclc/frexp_fp64.cl
index 3c75ca1..13ed5f3 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/libclc/frexp_fp64.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: frexp()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,11 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
-
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+_CL_OVERLOADABLE itype _cl_expfrexp(vtype x);
+_CL_OVERLOADABLE vtype _cl_frfrexp(vtype x);
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+_CL_OVERLOADABLE vtype frexp(vtype x, inttype ADDRSPACE *exp)
+{
+  *exp = convert_inttype(_cl_expfrexp(x));
+  return _cl_frfrexp(x);
+}
diff --git a/lib/CL/clReleaseDevice.c b/lib/kernel/libclc/frfrexp_fp32.cl
similarity index 69%
copy from lib/CL/clReleaseDevice.c
copy to lib/kernel/libclc/frfrexp_fp32.cl
index b48b29e..316763f 100644
--- a/lib/CL/clReleaseDevice.c
+++ b/lib/kernel/libclc/frfrexp_fp32.cl
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseDevice()
+/* OpenCL built-in library: frfrexp()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Pekka Jääskeläinen / TUT
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,20 +21,17 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
-
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 
+_CL_OVERLOADABLE _CL_ALWAYSINLINE vtype
+_cl_frfrexp(vtype x)
 {
-  if (device->parent_device == NULL)
-    return CL_SUCCESS;
-
-  int new_refcount;
-  POCL_RELEASE_OBJECT (device, new_refcount);
+  // denorms
+  itype cond = (fabs(x) < (vtype)FLT_MIN);
+  x = cond ? (x * 0x1p30f) : x;
 
-  if (new_refcount == 0)
-    POCL_MEM_FREE(device);
+  vtype ret = as_vtype((as_utype(x) & (utype)MANTSIGNBITS_SP32 )
+                       | (utype)HALFEXPBITS_SP32);
 
-  return CL_SUCCESS;
+  ret = (x == (vtype)0.0f) ? x : ret;
+  ret = (isnan(x) | isinf(x)) ? x : ret;
+  return ret;
 }
-POsym(clReleaseDevice)
diff --git a/lib/CL/clReleaseDevice.c b/lib/kernel/libclc/frfrexp_fp64.cl
similarity index 69%
copy from lib/CL/clReleaseDevice.c
copy to lib/kernel/libclc/frfrexp_fp64.cl
index b48b29e..967553e 100644
--- a/lib/CL/clReleaseDevice.c
+++ b/lib/kernel/libclc/frfrexp_fp64.cl
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clReleaseDevice()
+/* OpenCL built-in library: frfrexp()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Pekka Jääskeläinen / TUT
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,20 +21,18 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
-
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clReleaseDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2 
+_CL_OVERLOADABLE _CL_ALWAYSINLINE vtype
+_cl_frfrexp(vtype x)
 {
-  if (device->parent_device == NULL)
-    return CL_SUCCESS;
 
-  int new_refcount;
-  POCL_RELEASE_OBJECT (device, new_refcount);
+  // denorms
+  itype cond = (fabs(x) < (vtype)DBL_MIN);
+  x = cond ? (x * 0x1p63) : x;
 
-  if (new_refcount == 0)
-    POCL_MEM_FREE(device);
+  vtype ret = as_vtype((as_utype(x) & (utype)MANTSIGNBITS_DP64 )
+                       | (utype)HALFEXPBITS_DP64);
 
-  return CL_SUCCESS;
+  ret = (x == (vtype)0.0) ? x : ret;
+  ret = (isnan(x) | isinf(x)) ? x : ret;
+  return ret;
 }
-POsym(clReleaseDevice)
diff --git a/lib/kernel/libclc/isfinite_fp32.cl b/lib/kernel/libclc/isfinite_fp32.cl
new file mode 100644
index 0000000..c380fd1
--- /dev/null
+++ b/lib/kernel/libclc/isfinite_fp32.cl
@@ -0,0 +1,4 @@
+_CL_OVERLOADABLE itype isfinite(vtype i)
+{
+  return ((as_utype(i) << 1) < (utype)(EXPBITS_SP32 << 1));
+}
diff --git a/lib/kernel/libclc/isfinite_fp64.cl b/lib/kernel/libclc/isfinite_fp64.cl
new file mode 100644
index 0000000..d2eaa1a
--- /dev/null
+++ b/lib/kernel/libclc/isfinite_fp64.cl
@@ -0,0 +1,13 @@
+#ifdef SINGLEVEC
+_CL_OVERLOADABLE inttype isfinite(vtype i)
+#else
+_CL_OVERLOADABLE itype isfinite(vtype i)
+#endif
+{
+  itype res = ((as_utype(i) << 1) < (utype)(EXPBITS_DP64 << 1));
+#ifdef SINGLEVEC
+  return convert_int(res);
+#else
+  return res;
+#endif
+}
diff --git a/lib/kernel/libclc/isinf_fp32.cl b/lib/kernel/libclc/isinf_fp32.cl
new file mode 100644
index 0000000..64fbee8
--- /dev/null
+++ b/lib/kernel/libclc/isinf_fp32.cl
@@ -0,0 +1,4 @@
+_CL_OVERLOADABLE itype isinf(vtype i)
+{
+  return ((as_itype(i) << 1) == (itype)(EXPBITS_SP32 << 1));
+}
diff --git a/lib/kernel/libclc/isinf_fp64.cl b/lib/kernel/libclc/isinf_fp64.cl
new file mode 100644
index 0000000..e8f0b44
--- /dev/null
+++ b/lib/kernel/libclc/isinf_fp64.cl
@@ -0,0 +1,13 @@
+#ifdef SINGLEVEC
+_CL_OVERLOADABLE inttype isinf(vtype i)
+#else
+_CL_OVERLOADABLE itype isinf(vtype i)
+#endif
+{
+  itype res = ((as_itype(i) << 1) == (itype)(EXPBITS_DP64 << 1));
+#ifdef SINGLEVEC
+  return convert_int(res);
+#else
+  return res;
+#endif
+}
diff --git a/lib/kernel/libclc/isnan_fp32.cl b/lib/kernel/libclc/isnan_fp32.cl
new file mode 100644
index 0000000..a95c108
--- /dev/null
+++ b/lib/kernel/libclc/isnan_fp32.cl
@@ -0,0 +1,4 @@
+_CL_OVERLOADABLE itype isnan(vtype i)
+{
+  return ((as_utype(i) << 1) > (utype)(EXPBITS_SP32 << 1));
+}
diff --git a/lib/kernel/libclc/isnan_fp64.cl b/lib/kernel/libclc/isnan_fp64.cl
new file mode 100644
index 0000000..8e952f1
--- /dev/null
+++ b/lib/kernel/libclc/isnan_fp64.cl
@@ -0,0 +1,13 @@
+#ifdef SINGLEVEC
+_CL_OVERLOADABLE inttype isnan(vtype i)
+#else
+_CL_OVERLOADABLE itype isnan(vtype i)
+#endif
+{
+  itype res = ((as_utype(i) << 1) > (utype)((ulong)EXPBITS_DP64 << 1));
+#ifdef SINGLEVEC
+  return convert_int(res);
+#else
+  return res;
+#endif
+}
diff --git a/lib/kernel/libclc/isnormal_fp32.cl b/lib/kernel/libclc/isnormal_fp32.cl
new file mode 100644
index 0000000..6192763
--- /dev/null
+++ b/lib/kernel/libclc/isnormal_fp32.cl
@@ -0,0 +1,4 @@
+_CL_OVERLOADABLE itype isnormal(vtype i)
+{
+  return ((as_utype(i) << 1) < (utype)(EXPBITS_SP32 << 1)) & ((as_utype(i) << 1) > (utype)(MANTBITS_SP32 << 1));
+}
diff --git a/lib/kernel/libclc/isnormal_fp64.cl b/lib/kernel/libclc/isnormal_fp64.cl
new file mode 100644
index 0000000..13e7e83
--- /dev/null
+++ b/lib/kernel/libclc/isnormal_fp64.cl
@@ -0,0 +1,13 @@
+#ifdef SINGLEVEC
+_CL_OVERLOADABLE inttype isnormal(vtype i)
+#else
+_CL_OVERLOADABLE itype isnormal(vtype i)
+#endif
+{
+  itype res = ((as_utype(i) << 1) < (utype)(EXPBITS_DP64 << 1)) & ((as_utype(i) << 1) > (utype)(MANTBITS_DP64 << 1));
+#ifdef SINGLEVEC
+  return convert_int(res);
+#else
+  return res;
+#endif
+}
diff --git a/lib/kernel/libclc/length.cl b/lib/kernel/libclc/length.cl
new file mode 100644
index 0000000..6569f1c
--- /dev/null
+++ b/lib/kernel/libclc/length.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE float length(float p) {
+  return fabs(p);
+}
+
+#define V_FLENGTH(p)                     \
+  float l2 = dot(p, p);                  \
+                                         \
+  if (l2 < FLT_MIN) {                    \
+    p *= 0x1.0p+86F;                     \
+    return sqrt(dot(p, p)) * 0x1.0p-86F; \
+  } else if (l2 == INFINITY) {           \
+    p *= 0x1.0p-65F;                     \
+    return sqrt(dot(p, p)) * 0x1.0p+65F; \
+  }                                      \
+                                         \
+  return sqrt(l2);
+
+_CL_OVERLOADABLE float length(float2 p) {
+  V_FLENGTH(p);
+}
+
+_CL_OVERLOADABLE float length(float3 p) {
+  V_FLENGTH(p);
+}
+
+_CL_OVERLOADABLE float length(float4 p) {
+  V_FLENGTH(p);
+}
+
+
+
+#ifdef cl_khr_fp64
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CL_OVERLOADABLE double length(double p){
+  return fabs(p);
+}
+
+#define V_DLENGTH(p)                       \
+  double l2 = dot(p, p);                   \
+                                           \
+  if (l2 < DBL_MIN) {                      \
+      p *= 0x1.0p+563;                     \
+      return sqrt(dot(p, p)) * 0x1.0p-563; \
+  } else if (l2 == INFINITY) {             \
+      p *= 0x1.0p-513;                     \
+      return sqrt(dot(p, p)) * 0x1.0p+513; \
+  }                                        \
+                                           \
+  return sqrt(l2);
+
+_CL_OVERLOADABLE double length(double2 p) {
+  V_DLENGTH(p);
+}
+
+_CL_OVERLOADABLE double length(double3 p) {
+  V_DLENGTH(p);
+}
+
+_CL_OVERLOADABLE double
+length(double4 p) {
+  V_DLENGTH(p);
+}
+
+#endif
diff --git a/lib/kernel/libclc/log1p_fp32.cl b/lib/kernel/libclc/log1p_fp32.cl
new file mode 100644
index 0000000..c1b82fb
--- /dev/null
+++ b/lib/kernel/libclc/log1p_fp32.cl
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE float log1p(float x)
+{
+    float w = x;
+    uint ux = as_uint(x);
+    uint ax = ux & EXSIGNBIT_SP32;
+
+    // |x| < 2^-4
+    float u2 = MATH_DIVIDE(x, 2.0f + x);
+    float u = u2 + u2;
+    float v = u * u;
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    float zsmall = pocl_fma(-u2, x, pocl_fma(v, 0x1.99999ap-7f, 0x1.555556p-4f) * v * u) + x;
+
+    // |x| >= 2^-4
+    ux = as_uint(x + 1.0f);
+
+    int m = (int)((ux >> EXPSHIFTBITS_SP32) & 0xff) - EXPBIAS_SP32;
+    float mf = (float)m;
+    uint indx = (ux & 0x007f0000) + ((ux & 0x00008000) << 1);
+    float F = as_float(indx | 0x3f000000);
+
+    // x > 2^24
+    float fg24 = F - as_float(0x3f000000 | (ux & MANTBITS_SP32));
+
+    // x <= 2^24
+    uint xhi = ux & 0xffff8000;
+    float xh = as_float(xhi);
+    float xt = (1.0f - xh) + w;
+    uint xnm = ((~(xhi & 0x7f800000)) - 0x00800000) & 0x7f800000;
+    xt = xt * as_float(xnm) * 0.5f;
+    float fl24 = F - as_float(0x3f000000 | (xhi & MANTBITS_SP32)) - xt;
+
+    float f = mf > 24.0f ? fg24 : fl24;
+
+    indx = indx >> 16;
+    float r = f * USE_TABLE(log_inv_tbl, indx);
+
+    // 1/3, 1/2
+    float poly = pocl_fma(pocl_fma(r, 0x1.555556p-2f, 0x1.0p-1f), r*r, r);
+
+    const float LOG2_HEAD = 0x1.62e000p-1f;   // 0.693115234
+    const float LOG2_TAIL = 0x1.0bfbe8p-15f;  // 0.0000319461833
+
+    float2 tv = USE_TABLE(loge_tbl, indx);
+    float z1 = pocl_fma(mf, LOG2_HEAD, tv.s0);
+    float z2 = pocl_fma(mf, LOG2_TAIL, -poly) + tv.s1;
+    float z = z1 + z2;
+
+    z = ax < 0x3d800000U ? zsmall : z;
+
+
+
+    // Edge cases
+    z = ax >= PINFBITPATT_SP32 ? w : z;
+    z = w  < -1.0f ? as_float(QNANBITPATT_SP32) : z;
+    z = w == -1.0f ? as_float(NINFBITPATT_SP32) : z;
+        //fix subnormals
+        z = ax  < 0x33800000 ? x : z;
+
+    return z;
+}
\ No newline at end of file
diff --git a/lib/kernel/libclc/log1p_fp64.cl b/lib/kernel/libclc/log1p_fp64.cl
new file mode 100644
index 0000000..8aad6c6
--- /dev/null
+++ b/lib/kernel/libclc/log1p_fp64.cl
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE double log1p(double x)
+{
+    // Computes natural log(1+x). Algorithm based on:
+    // Ping-Tak Peter Tang
+    // "Table-driven implementation of the logarithm function in IEEE
+    // floating-point arithmetic"
+    // ACM Transactions on Mathematical Software (TOMS)
+    // Volume 16, Issue 4 (December 1990)
+    // Note that we use a lookup table of size 64 rather than 128,
+    // and compensate by having extra terms in the minimax polynomial
+    // for the kernel approximation.
+
+    // Process Inside the threshold now
+    ulong ux = as_utype(1.0 + x);
+    int xexp = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64;
+    double f = as_double(ONEEXPBITS_DP64 | (ux & MANTBITS_DP64));
+
+    int j = as_int2(ux).hi >> 13;
+    j = ((0x80 | (j & 0x7e)) >> 1) + (j & 0x1);
+    double f1 = (double)j * 0x1.0p-6;
+    j -= 64;
+
+    double f2temp = f - f1;
+    double m2 = as_double(convert_ulong(0x3ff - xexp) << EXPSHIFTBITS_DP64);
+    double f2l = pocl_fma(m2, x, m2 - f1);
+    double f2g = pocl_fma(m2, x, -f1) + m2;
+    double f2 = xexp <= MANTLENGTH_DP64-1 ? f2l : f2g;
+    f2 = (xexp <= -2) | (xexp >= MANTLENGTH_DP64+8) ? f2temp : f2;
+
+    double2 tv = USE_TABLE(ln_tbl, j);
+    double z1 = tv.s0;
+    double q = tv.s1;
+
+    double u = MATH_DIVIDE(f2, pocl_fma(0.5, f2, f1));
+    double v = u * u;
+
+    double poly = v * pocl_fma(v,
+                          pocl_fma(v, 2.23219810758559851206e-03, 1.24999999978138668903e-02),
+                          8.33333333333333593622e-02);
+
+    // log2_lead and log2_tail sum to an extra-precise version of log(2)
+    const double log2_lead = 6.93147122859954833984e-01; /* 0x3fe62e42e0000000 */
+    const double log2_tail = 5.76999904754328540596e-08; /* 0x3e6efa39ef35793c */
+
+    double z2 = q + pocl_fma(u, poly, u);
+    double dxexp = (double)xexp;
+    double r1 = pocl_fma(dxexp, log2_lead, z1);
+    double r2 = pocl_fma(dxexp, log2_tail, z2);
+    double result1 = r1 + r2;
+
+    // Process Outside the threshold now
+    double r = x;
+    u = r / (2.0 + r);
+    double correction = r * u;
+    u = u + u;
+    v = u * u;
+    r1 = r;
+
+    poly = pocl_fma(v,
+               pocl_fma(v,
+                   pocl_fma(v, 4.34887777707614552256e-04, 2.23213998791944806202e-03),
+                   1.25000000037717509602e-02),
+               8.33333333333317923934e-02);
+
+    r2 = pocl_fma(u*v, poly, -correction);
+
+    // The values exp(-1/16)-1 and exp(1/16)-1
+    const double log1p_thresh1 = -0x1.f0540438fd5c3p-5;
+    const double log1p_thresh2 =  0x1.082b577d34ed8p-4;
+    double result2 = r1 + r2;
+    result2 = x < log1p_thresh1 | x > log1p_thresh2 ? result1 : result2;
+
+    result2 = isinf(x) ? x : result2;
+    result2 = x < -1.0 ? as_double(QNANBITPATT_DP64) : result2;
+    result2 = x == -1.0 ? as_double(NINFBITPATT_DP64) : result2;
+    return result2;
+}
\ No newline at end of file
diff --git a/lib/kernel/libclc/log2_fp32.cl b/lib/kernel/libclc/log2_fp32.cl
new file mode 100644
index 0000000..3a20555
--- /dev/null
+++ b/lib/kernel/libclc/log2_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_LOG2
+
+#include "log_base_fp32.cl"
+
+#undef COMPILING_LOG2
diff --git a/lib/kernel/libclc/log2_fp64.cl b/lib/kernel/libclc/log2_fp64.cl
new file mode 100644
index 0000000..0463b59
--- /dev/null
+++ b/lib/kernel/libclc/log2_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_LOG2
+
+#include "log_base_fp64.cl"
+
+#undef COMPILING_LOG2
diff --git a/lib/kernel/libclc/log_base_fp32.cl b/lib/kernel/libclc/log_base_fp32.cl
new file mode 100644
index 0000000..7d949cd
--- /dev/null
+++ b/lib/kernel/libclc/log_base_fp32.cl
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/*
+   Algorithm:
+
+   Based on:
+   Ping-Tak Peter Tang
+   "Table-driven implementation of the logarithm function in IEEE
+   floating-point arithmetic"
+   ACM Transactions on Mathematical Software (TOMS)
+   Volume 16, Issue 4 (December 1990)
+
+   x very close to 1.0 is handled differently, for x everywhere else
+   a brief explanation is given below
+
+   x = (2^m)*A
+   x = (2^m)*(G+g) with (1 <= G < 2) and (g <= 2^(-8))
+   x = (2^m)*2*(G/2+g/2)
+   x = (2^m)*2*(F+f) with (0.5 <= F < 1) and (f <= 2^(-9))
+
+   Y = (2^(-1))*(2^(-m))*(2^m)*A
+   Now, range of Y is: 0.5 <= Y < 1
+
+   F = 0x80 + (first 7 mantissa bits) + (8th mantissa bit)
+   Now, range of F is: 128 <= F <= 256
+   F = F / 256
+   Now, range of F is: 0.5 <= F <= 1
+
+   f = -(Y-F), with (f <= 2^(-9))
+
+   log(x) = m*log(2) + log(2) + log(F-f)
+   log(x) = m*log(2) + log(2) + log(F) + log(1-(f/F))
+   log(x) = m*log(2) + log(2*F) + log(1-r)
+
+   r = (f/F), with (r <= 2^(-8))
+   r = f*(1/F) with (1/F) precomputed to avoid division
+
+   log(x) = m*log(2) + log(G) - poly
+
+   log(G) is precomputed
+   poly = (r + (r^2)/2 + (r^3)/3 + (r^4)/4) + (r^5)/5))
+
+   log(2) and log(G) need to be maintained in extra precision
+   to avoid losing precision in the calculations
+
+
+   For x close to 1.0, we employ the following technique to
+   ensure faster convergence.
+
+   log(x) = log((1+s)/(1-s)) = 2*s + (2/3)*s^3 + (2/5)*s^5 + (2/7)*s^7
+   x = ((1+s)/(1-s))
+   x = 1 + r
+   s = r/(2+r)
+
+*/
+
+
+_CL_OVERLOADABLE vtype
+#if defined(COMPILING_LOG2)
+log2(vtype x)
+#elif defined(COMPILING_LOGB)
+logb(vtype x)
+#elif defined(COMPILING_LOG10)
+log10(vtype x)
+#else
+log(vtype x)
+#endif
+{
+
+#if defined(COMPILING_LOGB)
+#define COMPILING_LOG2
+#endif
+
+#if defined(COMPILING_LOG2)
+    const vtype LOG2E = (vtype)0x1.715476p+0f;      // 1.4426950408889634
+    const vtype LOG2E_HEAD = (vtype)0x1.700000p+0f; // 1.4375
+    const vtype LOG2E_TAIL = (vtype)0x1.547652p-8f; // 0.00519504072
+#elif defined(COMPILING_LOG10)
+    const vtype LOG10E = (vtype)0x1.bcb7b2p-2f;        // 0.43429448190325182
+    const vtype LOG10E_HEAD = (vtype)0x1.bc0000p-2f;   // 0.43359375
+    const vtype LOG10E_TAIL = (vtype)0x1.6f62a4p-11f;  // 0.0007007319
+    const vtype LOG10_2_HEAD = (vtype)0x1.340000p-2f;  // 0.30078125
+    const vtype LOG10_2_TAIL = (vtype)0x1.04d426p-12f; // 0.000248745637
+#else
+    const vtype LOG2_HEAD = (vtype)0x1.62e000p-1f;  // 0.693115234
+    const vtype LOG2_TAIL = (vtype)0x1.0bfbe8p-15f; // 0.0000319461833
+#endif
+
+    utype xi = as_utype(x);
+    utype ax = xi & (utype)EXSIGNBIT_SP32;
+
+    // Calculations for |x-1| < 2^-4
+    vtype r = x - (vtype)1.0f;
+    itype near1 = (fabs(r) < (vtype)0x1.0p-4f);
+    vtype u2 = MATH_DIVIDE(r, (vtype)2.0f + r);
+    vtype corr = u2 * r;
+    vtype u = u2 + u2;
+    vtype v = u * u;
+    vtype znear1, z1, z2;
+
+    // 2/(5 * 2^5), 2/(3 * 2^3)
+    z2 = pocl_fma(u,
+           pocl_fma(v,
+             (vtype)0x1.99999ap-7f,
+             (vtype)0x1.555556p-4f)*v,
+           -corr);
+
+#if defined(COMPILING_LOG2)
+    z1 = as_vtype(as_itype(r) & (itype)0xffff0000);
+    z2 = z2 + (r - z1);
+    znear1 = pocl_fma(z1, LOG2E_HEAD,
+               pocl_fma(z2, LOG2E_HEAD,
+                 pocl_fma(z1, LOG2E_TAIL, z2*LOG2E_TAIL)));
+#elif defined(COMPILING_LOG10)
+    z1 = as_vtype(as_itype(r) & (itype)0xffff0000);
+    z2 = z2 + (r - z1);
+    znear1 = pocl_fma(z1, LOG10E_HEAD,
+               pocl_fma(z2, LOG10E_HEAD,
+                 pocl_fma(z1, LOG10E_TAIL, z2*LOG10E_TAIL)));
+#else
+    znear1 = z2 + r;
+#endif
+
+    // Calculations for x not near 1
+    itype m = as_itype(xi >> EXPSHIFTBITS_SP32) - (itype)EXPBIAS_SP32;
+
+    // Normalize subnormal
+    utype xis = as_utype(as_vtype(xi | (utype)0x3f800000) - (vtype)1.0f);
+    itype ms = (as_itype(xis) >> EXPSHIFTBITS_SP32) - (itype)253;
+    itype c = (m == -127);
+    m = c ? ms : m;
+    utype xin = c ? xis : xi;
+
+    vtype mf = convert_vtype(m);
+    utype indx = (xin & (utype)0x007f0000) + ((xin & (utype)0x00008000) << 1);
+
+    // F - Y
+    vtype f = as_vtype((utype)0x3f000000 | indx)
+              - as_vtype((utype)0x3f000000 | (xin & MANTBITS_SP32));
+
+    indx = indx >> 16;
+    r = f * USE_VTABLE(log_inv_tbl, indx);
+
+    // 1/3,  1/2
+    vtype poly = pocl_fma(
+                   pocl_fma(r, (vtype)0x1.555556p-2f, (vtype)0.5f),
+                   r*r,
+                   r);
+
+#if defined(COMPILING_LOG2)
+    v2type tv = USE_VTABLE(log2_tbl, indx);
+    vtype s0 = tv.lo;
+    vtype s1 = tv.hi;
+    z1 = s0 + mf;
+    z2 = pocl_fma(poly, -LOG2E, s1);
+#elif defined(COMPILING_LOG10)
+    v2type tv = USE_VTABLE(log10_tbl, indx);
+    vtype s0 = tv.lo;
+    vtype s1 = tv.hi;
+    z1 = pocl_fma(mf, LOG10_2_HEAD, s0);
+    z2 = pocl_fma(poly, -LOG10E, mf*LOG10_2_TAIL) + s1;
+#else
+    v2type tv = USE_VTABLE(loge_tbl, indx);
+    vtype s0 = tv.lo;
+    vtype s1 = tv.hi;
+    z1 = pocl_fma(mf, LOG2_HEAD, s0);
+    z2 = pocl_fma(mf, LOG2_TAIL, -poly) + s1;
+#endif
+
+    vtype z = z1 + z2;
+    z = near1 ? znear1 : z;
+
+    // Corner cases
+    z = (ax >= (utype)PINFBITPATT_SP32) ? x : z;
+    z = (xi != ax) ? as_vtype((utype)QNANBITPATT_SP32) : z;
+    z = (ax == 0) ? as_vtype((utype)NINFBITPATT_SP32) : z;
+
+    return z;
+}
diff --git a/lib/kernel/libclc/log_base_fp64.cl b/lib/kernel/libclc/log_base_fp64.cl
new file mode 100644
index 0000000..dc46aea
--- /dev/null
+++ b/lib/kernel/libclc/log_base_fp64.cl
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype
+#if defined(COMPILING_LOG2)
+log2(vtype x)
+#elif defined(COMPILING_LOGB)
+logb(vtype x)
+#elif defined(COMPILING_LOG10)
+log10(vtype x)
+#else
+log(vtype x)
+#endif
+{
+
+#ifdef COMPILING_LOGB
+#define COMPILING_LOG2
+#endif
+
+#if defined(COMPILING_LOG10)
+    // log10e_lead and log10e_tail sum to an extra-precision version of log10(e) (19 bits in lead)
+    const vtype log10e_lead = (vtype)4.34293746948242187500e-01;  /* 0x3fdbcb7800000000 */
+    const vtype log10e_tail = (vtype)7.3495500964015109100644e-7; /* 0x3ea8a93728719535 */
+#elif defined(COMPILING_LOG2)
+    // log2e_lead and log2e_tail sum to an extra-precision version of log2(e) (19 bits in lead)
+    const vtype log2e_lead = (vtype)1.44269180297851562500E+00; /* 0x3FF7154400000000 */
+    const vtype log2e_tail = (vtype)3.23791044778235969970E-06; /* 0x3ECB295C17F0BBBE */
+#endif
+
+    // log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000
+    // log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000
+    const vtype log_thresh1 = (vtype)0x1.e0faap-1;
+    const vtype log_thresh2 = (vtype)0x1.1082cp+0;
+
+    itype is_near = (x >= log_thresh1) & (x <= log_thresh2);
+
+    // Near 1 code
+    vtype r = x - (vtype)1.0;
+    vtype u = r / ((vtype)2.0 + r);
+    vtype correction = r * u;
+    u = u + u;
+    vtype v = u * u;
+    vtype r1 = r;
+
+    const vtype ca_1 = (vtype)8.33333333333317923934e-02; /* 0x3fb55555555554e6 */
+    const vtype ca_2 = (vtype)1.25000000037717509602e-02; /* 0x3f89999999bac6d4 */
+    const vtype ca_3 = (vtype)2.23213998791944806202e-03; /* 0x3f62492307f1519f */
+    const vtype ca_4 = (vtype)4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */
+
+    vtype r2 = pocl_fma(u*v,
+                 pocl_fma(v,
+                   pocl_fma(v,
+                     pocl_fma(v, ca_4, ca_3),
+                     ca_2),
+                   ca_1),
+                 -correction);
+
+#if defined(COMPILING_LOG10)
+    r = r1;
+    r1 = as_vtype(as_utype(r1) & (utype)0xffffffff00000000);
+    r2 = r2 + (r - r1);
+    vtype ret_near = pocl_fma(log10e_lead, r1,
+                       pocl_fma(log10e_lead, r2,
+                         pocl_fma(log10e_tail, r1, log10e_tail * r2)));
+#elif defined(COMPILING_LOG2)
+    r = r1;
+    r1 = as_vtype(as_utype(r1) & (utype)0xffffffff00000000);
+    r2 = r2 + (r - r1);
+    vtype ret_near = pocl_fma(log2e_lead, r1,
+                       pocl_fma(log2e_lead, r2,
+                         pocl_fma(log2e_tail, r1, log2e_tail * r2)));
+#else
+    vtype ret_near = r1 + r2;
+#endif
+
+    // This is the far from 1 code
+
+    // Deal with subnormal
+    utype ux = as_utype(x);
+    utype uxs = as_utype(
+                as_vtype((utype)0x03d0000000000000UL | ux)
+                - (vtype)0x1.0p-962);
+    itype c = (ux < IMPBIT_DP64);
+    ux = c ? uxs : ux;
+    itype expadjust = c ? (itype)60 : (itype)0;
+
+    itype xexp = ((as_itype(ux) >> 52) & 0x7ff) - (itype)EXPBIAS_DP64 - expadjust;
+    vtype f = as_vtype((utype)HALFEXPBITS_DP64 | (ux & (utype)MANTBITS_DP64));
+    uinttype index = convert_uinttype(ux >> 45);
+    index = (((uinttype)0x80 | (index & (uinttype)0x7e)) >> 1)
+             + (index & (uinttype)0x1);
+
+    v2type tv = USE_VTABLE(ln_tbl, index - (uinttype)64);
+    vtype z1 = tv.lo;
+    vtype q = tv.hi;
+
+    vtype f1 = convert_vtype(index) * 0x1.0p-7;
+    vtype f2 = f - f1;
+    u = f2 / pocl_fma(f2, (vtype)0.5, f1);
+    v = u * u;
+
+    const vtype cb_1 = (vtype)8.33333333333333593622e-02; /* 0x3fb5555555555557 */
+    const vtype cb_2 = (vtype)1.24999999978138668903e-02; /* 0x3f89999999865ede */
+    const vtype cb_3 = (vtype)2.23219810758559851206e-03; /* 0x3f6249423bd94741 */
+
+    vtype poly = v * pocl_fma(v, pocl_fma(v, cb_3, cb_2), cb_1);
+    vtype z2 = q + pocl_fma(u, poly, u);
+
+    vtype dxexp = convert_vtype(xexp);
+#if defined (COMPILING_LOG10)
+    // Add xexp * log(2) to z1,z2 to get log(x)
+    r1 = pocl_fma(dxexp, log2_lead, z1);
+    r2 = pocl_fma(dxexp, log2_tail, z2);
+    vtype ret_far = pocl_fma(log10e_lead, r1,
+                      pocl_fma(log10e_lead, r2,
+                        pocl_fma(log10e_tail, r1, log10e_tail*r2)));
+#elif defined(COMPILING_LOG2)
+    r1 = pocl_fma(log2e_lead, z1, dxexp);
+    r2 = pocl_fma(log2e_lead, z2, pocl_fma(log2e_tail, z1, log2e_tail*z2));
+    vtype ret_far = r1 + r2;
+#else
+    r1 = pocl_fma(dxexp, log2_lead, z1);
+    r2 = pocl_fma(dxexp, log2_tail, z2);
+    vtype ret_far = r1 + r2;
+#endif
+
+    vtype ret = is_near ? ret_near : ret_far;
+
+    ret = isinf(x) ? as_vtype((utype)PINFBITPATT_DP64) : ret;
+    ret = (isnan(x) | (x < (vtype)0.0))
+           ? as_vtype((utype)QNANBITPATT_DP64) : ret;
+    ret = (x == (vtype)0.0) ? as_vtype((utype)NINFBITPATT_DP64) : ret;
+    return ret;
+}
diff --git a/lib/kernel/libclc/logb_fp32.cl b/lib/kernel/libclc/logb_fp32.cl
new file mode 100644
index 0000000..97a5f52
--- /dev/null
+++ b/lib/kernel/libclc/logb_fp32.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype logb(vtype x)
+{
+  vtype res = convert_vtype(ilogb(fabs(x)));
+  res = (as_utype(fabs(x)) == (utype)(0))
+         ? (vtype)(-INFINITY) : res;
+  res = isnan(x) ? x : res;
+  res = isinf(x) ? (vtype)INFINITY : res;
+  return res;
+}
diff --git a/lib/kernel/libclc/logb_fp64.cl b/lib/kernel/libclc/logb_fp64.cl
new file mode 100644
index 0000000..97a5f52
--- /dev/null
+++ b/lib/kernel/libclc/logb_fp64.cl
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype logb(vtype x)
+{
+  vtype res = convert_vtype(ilogb(fabs(x)));
+  res = (as_utype(fabs(x)) == (utype)(0))
+         ? (vtype)(-INFINITY) : res;
+  res = isnan(x) ? x : res;
+  res = isinf(x) ? (vtype)INFINITY : res;
+  return res;
+}
diff --git a/lib/kernel/libclc/misc.h b/lib/kernel/libclc/misc.h
new file mode 100644
index 0000000..d2b911c
--- /dev/null
+++ b/lib/kernel/libclc/misc.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+#define HAVE_BITALIGN() (0)
+
+#define MATH_DIVIDE(X, Y) ((X) / (Y))
+#define MATH_RECIP(X) (1.0f / (X))
+#define MATH_SQRT(X) sqrt(X)
+
+#define SIGNBIT_SP32      0x80000000
+#define EXSIGNBIT_SP32    0x7fffffff
+#define EXPBITS_SP32      0x7f800000
+#define MANTBITS_SP32     0x007fffff
+#define MANTSIGNBITS_SP32 0x807fffff
+#define ONEEXPBITS_SP32   0x3f800000
+#define TWOEXPBITS_SP32   0x40000000
+#define HALFEXPBITS_SP32  0x3f000000
+#define IMPBIT_SP32       0x00800000
+#define QNANBITPATT_SP32  0x7fc00000
+#define INDEFBITPATT_SP32 0xffc00000
+#define PINFBITPATT_SP32  0x7f800000
+#define NINFBITPATT_SP32  0xff800000
+#define EXPBIAS_SP32      127
+#define EXPSHIFTBITS_SP32 23
+#define BIASEDEMIN_SP32   1
+#define EMIN_SP32         -126
+#define BIASEDEMAX_SP32   254
+#define EMAX_SP32         127
+#define LAMBDA_SP32       1.0e30
+#define MANTLENGTH_SP32   24
+#define BASEDIGITS_SP32   7
+#define ISNEG_SP32(x)     (as_itype(x) & (itype)SIGNBIT_SP32)
+#define vINFINITY_SP32    (as_vtype((utype)PINFBITPATT_SP32))
+#define vNINFINITY_SP32   (as_vtype((utype)NINFBITPATT_SP32))
+#define vNAN_SP32         (as_vtype((utype)QNANBITPATT_SP32))
+#define vZERO_SP32        (vtype)0.0f
+#define vONE_SP32        (vtype)1.0f
+
+#ifdef cl_khr_fp64
+
+#define SIGNBIT_DP64      0x8000000000000000L
+#define EXSIGNBIT_DP64    0x7fffffffffffffffL
+#define EXPBITS_DP64      0x7ff0000000000000L
+#define MANTBITS_DP64     0x000fffffffffffffL
+#define MANTSIGNBITS_DP64 0x800fffffffffffffL
+#define ONEEXPBITS_DP64   0x3ff0000000000000L
+#define TWOEXPBITS_DP64   0x4000000000000000L
+#define HALFEXPBITS_DP64  0x3fe0000000000000L
+#define IMPBIT_DP64       0x0010000000000000L
+#define QNANBITPATT_DP64  0x7ff8000000000000L
+#define INDEFBITPATT_DP64 0xfff8000000000000L
+#define PINFBITPATT_DP64  0x7ff0000000000000L
+#define NINFBITPATT_DP64  0xfff0000000000000L
+#define EXPBIAS_DP64      1023
+#define EXPSHIFTBITS_DP64 52
+#define BIASEDEMIN_DP64   1
+#define EMIN_DP64         -1022
+#define BIASEDEMAX_DP64   2046 /* 0x7fe */
+#define EMAX_DP64         1023 /* 0x3ff */
+#define LAMBDA_DP64       1.0e300
+#define MANTLENGTH_DP64   53
+#define BASEDIGITS_DP64   15
+#define ISNEG_DP64(x)     (as_itype(x) & (itype)SIGNBIT_DP64)
+#define vINFINITY_DP64    (as_vtype((utype)PINFBITPATT_DP64))
+#define vNINFINITY_DP64   (as_vtype((utype)NINFBITPATT_DP64))
+#define vNAN_DP64         (as_vtype((utype)QNANBITPATT_DP64))
+#define vZERO_DP64        (vtype)0.0
+#define vONE_DP64         (vtype)1.0
+
+#endif // cl_khr_fp64
+
+#define ALIGNED(x)  __attribute__((aligned(x)))
+
+
+#ifdef cl_khr_fp64
+
+typedef struct { double lo,hi; } v2double;
+typedef struct { double2 lo,hi; } v2double2;
+typedef struct { double3 lo,hi; } v2double3;
+typedef struct { double4 lo,hi; } v2double4;
+typedef struct { double8 lo,hi; } v2double8;
+typedef struct { double16 lo,hi; } v2double16;
+
+#endif
+
+typedef struct { float lo,hi; } v2float;
+typedef struct { float2 lo,hi; } v2float2;
+typedef struct { float3 lo,hi; } v2float3;
+typedef struct { float4 lo,hi; } v2float4;
+typedef struct { float8 lo,hi; } v2float8;
+typedef struct { float16 lo,hi; } v2float16;
+
+// for PI tables sin / cos
+typedef struct { uint s0, s1, s2, s3; } v4uint;
+typedef struct { uint2 s0, s1, s2, s3; } v4uint2;
+typedef struct { uint3 s0, s1, s2, s3; } v4uint3;
+typedef struct { uint4 s0, s1, s2, s3; } v4uint4;
+typedef struct { uint8 s0, s1, s2, s3; } v4uint8;
+typedef struct { uint16 s0, s1, s2, s3; } v4uint16;
+
+// for PI tables sin / cos
+typedef struct { int s0, s1, s2, s3; } v4int;
+typedef struct { int2 s0, s1, s2, s3; } v4int2;
+typedef struct { int3 s0, s1, s2, s3; } v4int3;
+typedef struct { int4 s0, s1, s2, s3; } v4int4;
+typedef struct { int8 s0, s1, s2, s3; } v4int8;
+typedef struct { int16 s0, s1, s2, s3; } v4int16;
+
+
+
+#define OCML_ATTR __attribute__((always_inline, const, overloadable))
+
+#define ALIGNEDATTR(X) __attribute__((aligned(X)))
+#define INLINEATTR __attribute__((always_inline))
+#define PUREATTR __attribute__((pure))
+#define CONSTATTR __attribute__((const))
+
+#define FMA fma
+#define RCP(X) ((vtype)(1.0) / X)
+#define DIV(X,Y) (X / Y)
+
+#define LDEXP ldexp
+#define SQRT sqrt
+#define ISINF isinf
+#define COPYSIGN copysign
+#define MATH_FAST_RCP RCP
+#define MATH_RCP RCP
+#define MATH_MAD pocl_fma
+
+#define BUILTIN_ABS_F32 fabs
+#define BUILTIN_TRUNC_F32 trunc
+#define BUILTIN_FRACTION_F32 fract
+#define BUILTIN_COPYSIGN_F32 copysign
+#define BUILTIN_FMA_F32 fma
+
+#define BUILTIN_FREXP_MANT_F32 _cl_frfrexp
+#define BUILTIN_FLDEXP_F32 ldexp
+#define BUILTIN_FREXP_EXP_F32 _cl_expfrexp
+#define BUILTIN_RINT_F32 rint
+
+#define BUILTIN_ABS_F64 fabs
+#define BUILTIN_TRUNC_F64 trunc
+#define BUILTIN_FRACTION_F64 fract
+#define BUILTIN_COPYSIGN_F64 copysign
+#define BUILTIN_FMA_F64 fma
+
+#define BUILTIN_FREXP_MANT_F64 _cl_frfrexp
+#define BUILTIN_FLDEXP_F64 ldexp
+#define BUILTIN_FREXP_EXP_F64 _cl_expfrexp
+#define BUILTIN_RINT_F64 rint
+
+#define MATH_PRIVATE(NAME) __pocl_ ## NAME
+#define MATH_MANGLE(NAME) _CL_OVERLOADABLE NAME
+
+#ifndef _CL_DECLARE_FUNC_V_VVV
+#define _CL_DECLARE_FUNC_V_VVV(NAME)                                    \
+  __IF_FP16(                                                            \
+  half     _CL_OVERLOADABLE NAME(half    , half    , half  );           \
+  half2    _CL_OVERLOADABLE NAME(half2   , half2   , half2 );           \
+  half3    _CL_OVERLOADABLE NAME(half3   , half3   , half3 );           \
+  half4    _CL_OVERLOADABLE NAME(half4   , half4   , half4 );           \
+  half8    _CL_OVERLOADABLE NAME(half8   , half8   , half8 );           \
+  half16   _CL_OVERLOADABLE NAME(half16  , half16  , half16);)          \
+  float    _CL_OVERLOADABLE NAME(float   , float   , float   );         \
+  float2   _CL_OVERLOADABLE NAME(float2  , float2  , float2  );         \
+  float3   _CL_OVERLOADABLE NAME(float3  , float3  , float3  );         \
+  float4   _CL_OVERLOADABLE NAME(float4  , float4  , float4  );         \
+  float8   _CL_OVERLOADABLE NAME(float8  , float8  , float8  );         \
+  float16  _CL_OVERLOADABLE NAME(float16 , float16 , float16 );         \
+  __IF_FP64(                                                            \
+  double   _CL_OVERLOADABLE NAME(double  , double  , double  );         \
+  double2  _CL_OVERLOADABLE NAME(double2 , double2 , double2 );         \
+  double3  _CL_OVERLOADABLE NAME(double3 , double3 , double3 );         \
+  double4  _CL_OVERLOADABLE NAME(double4 , double4 , double4 );         \
+  double8  _CL_OVERLOADABLE NAME(double8 , double8 , double8 );         \
+  double16 _CL_OVERLOADABLE NAME(double16, double16, double16);)
+#endif
+
+_CL_DECLARE_FUNC_V_VVV(pocl_fma)
diff --git a/lib/kernel/libclc/normalize.cl b/lib/kernel/libclc/normalize.cl
new file mode 100644
index 0000000..4ed5220
--- /dev/null
+++ b/lib/kernel/libclc/normalize.cl
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE float normalize(float p) {
+  return sign(p);
+}
+
+_CL_OVERLOADABLE float2 normalize(float2 p) {
+  if (all(p == (float2)0.0F))
+    return p;
+
+  float l2 = dot(p, p);
+
+  if (l2 < FLT_MIN) {
+    p *= 0x1.0p+86F;
+    l2 = dot(p, p);
+  } else if (l2 == INFINITY) {
+    p *= 0x1.0p-65f;
+    l2 = dot(p, p);
+    if (l2 == INFINITY) {
+      p = copysign(select((float2)0.0F, (float2)1.0F, isinf(p)), p);
+      l2 = dot(p, p);
+    }
+  }
+  return p * rsqrt(l2);
+}
+
+_CL_OVERLOADABLE float3 normalize(float3 p) {
+  if (all(p == (float3)0.0F))
+    return p;
+
+  float l2 = dot(p, p);
+
+  if (l2 < FLT_MIN) {
+    p *= 0x1.0p+86F;
+    l2 = dot(p, p);
+  } else if (l2 == INFINITY) {
+    p *= 0x1.0p-66f;
+    l2 = dot(p, p);
+    if (l2 == INFINITY) {
+      p = copysign(select((float3)0.0F, (float3)1.0F, isinf(p)), p);
+      l2 = dot(p, p);
+    }
+  }
+  return p * rsqrt(l2);
+}
+
+_CL_OVERLOADABLE float4 normalize(float4 p) {
+  if (all(p == (float4)0.0F))
+    return p;
+
+  float l2 = dot(p, p);
+
+  if (l2 < FLT_MIN) {
+    p *= 0x1.0p+86F;
+    l2 = dot(p, p);
+  } else if (l2 == INFINITY) {
+    p *= 0x1.0p-66f;
+    l2 = dot(p, p);
+    if (l2 == INFINITY) {
+      p = copysign(select((float4)0.0F, (float4)1.0F, isinf(p)), p);
+      l2 = dot(p, p);
+    }
+  }
+  return p * rsqrt(l2);
+}
+
+#ifdef cl_khr_fp64
+
+#pragma OPENCL EXTENSION cl_khr_fp64 : enable
+
+_CL_OVERLOADABLE double normalize(double p) {
+  return sign(p);
+}
+
+_CL_OVERLOADABLE double2 normalize(double2 p) {
+  if (all(p == (double2)0.0))
+    return p;
+
+  double l2 = dot(p, p);
+
+  if (l2 < DBL_MIN) {
+    p *= 0x1.0p+563;
+    l2 = dot(p, p);
+  } else if (l2 == INFINITY) {
+    p *= 0x1.0p-513;
+    l2 = dot(p, p);
+    if (l2 == INFINITY) {
+      p = copysign(select((double2)0.0, (double2)1.0, isinf(p)), p);
+      l2 = dot(p, p);
+    }
+  }
+  return p * rsqrt(l2);
+}
+
+_CL_OVERLOADABLE double3 normalize(double3 p) {
+  if (all(p == (double3)0.0))
+    return p;
+
+  double l2 = dot(p, p);
+
+  if (l2 < DBL_MIN) {
+    p *= 0x1.0p+563;
+    l2 = dot(p, p);
+  } else if (l2 == INFINITY) {
+    p *= 0x1.0p-514;
+    l2 = dot(p, p);
+    if (l2 == INFINITY) {
+      p = copysign(select((double3)0.0, (double3)1.0, isinf(p)), p);
+      l2 = dot(p, p);
+    }
+  }
+  return p * rsqrt(l2);
+}
+
+_CL_OVERLOADABLE double4 normalize(double4 p) {
+  if (all(p == (double4)0.0))
+    return p;
+
+  double l2 = dot(p, p);
+
+  if (l2 < DBL_MIN) {
+    p *= 0x1.0p+563;
+    l2 = dot(p, p);
+  } else if (l2 == INFINITY) {
+    p *= 0x1.0p-514;
+    l2 = dot(p, p);
+    if (l2 == INFINITY) {
+      p = copysign(select((double4)0.0, (double4)1.0, isinf(p)), p);
+      l2 = dot(p, p);
+    }
+  }
+  return p * rsqrt(l2);
+}
+
+#endif
diff --git a/lib/kernel/libclc/ocml_helpers.h b/lib/kernel/libclc/ocml_helpers.h
new file mode 100644
index 0000000..73ed4e1
--- /dev/null
+++ b/lib/kernel/libclc/ocml_helpers.h
@@ -0,0 +1,94 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+ OCML_ATTR v2type con(vtype a, vtype b);
+
+ OCML_ATTR v2type csgn(v2type a, vtype b);
+
+ OCML_ATTR v2type csgn(v2type a, v2type b);
+
+ OCML_ATTR v2type fadd(vtype a, vtype b);
+
+ OCML_ATTR v2type nrm(v2type a);
+
+ OCML_ATTR v2type onrm(v2type a);
+
+ OCML_ATTR v2type fsub(vtype a, vtype b);
+
+ OCML_ATTR v2type add(vtype a, vtype b);
+
+ OCML_ATTR v2type sub(vtype a, vtype b);
+
+ OCML_ATTR v2type mul(vtype a, vtype b);
+
+ OCML_ATTR v2type sqr(vtype a);
+
+ OCML_ATTR v2type add(v2type a, vtype b);
+
+ OCML_ATTR v2type fadd(v2type a, vtype b);
+
+ OCML_ATTR v2type add(vtype a, v2type b);
+
+ OCML_ATTR v2type fadd(vtype a, v2type b);
+
+ OCML_ATTR v2type add(v2type a, v2type b);
+
+ OCML_ATTR v2type fadd(v2type a, v2type b);
+
+ OCML_ATTR v2type sub(v2type a, vtype b);
+
+ OCML_ATTR v2type fsub(v2type a, vtype b);
+
+ OCML_ATTR v2type sub(vtype a, v2type b);
+
+ OCML_ATTR v2type fsub(vtype a, v2type b);
+
+ OCML_ATTR v2type sub(v2type a, v2type b);
+
+ OCML_ATTR v2type fsub(v2type a, v2type b);
+
+ OCML_ATTR v2type ldx(v2type a, int e);
+
+ OCML_ATTR v2type mul(v2type a, vtype b);
+
+ OCML_ATTR v2type omul(v2type a, vtype b);
+
+ OCML_ATTR v2type mul(vtype a, v2type b);
+
+ OCML_ATTR v2type omul(vtype a, v2type b);
+
+ OCML_ATTR v2type mul(v2type a, v2type b);
+
+ OCML_ATTR v2type omul(v2type a, v2type b);
+
+ OCML_ATTR v2type div(vtype a, vtype b);
+
+ OCML_ATTR v2type div(v2type a, vtype b);
+
+ OCML_ATTR v2type div(vtype a, v2type b);
+
+ OCML_ATTR v2type fdiv(v2type a, v2type b);
+
+ OCML_ATTR v2type div(v2type a, v2type b);
+
+ OCML_ATTR v2type rcp(vtype b);
+
+ OCML_ATTR v2type frcp(v2type b);
+
+ OCML_ATTR v2type rcp(v2type b);
+
+ OCML_ATTR v2type sqr(v2type a);
+
+ OCML_ATTR v2type root2(vtype a);
+
+ OCML_ATTR v2type root2(v2type a);
+
+ OCML_ATTR vtype fnma(vtype a, vtype b, vtype c);
+
+_CL_OVERLOADABLE vtype _cl_frfrexp(vtype a);
+
+_CL_OVERLOADABLE itype _cl_expfrexp(vtype a);
diff --git a/lib/kernel/libclc/ocml_helpers_fp32.cl b/lib/kernel/libclc/ocml_helpers_fp32.cl
new file mode 100644
index 0000000..1fbe2db
--- /dev/null
+++ b/lib/kernel/libclc/ocml_helpers_fp32.cl
@@ -0,0 +1,40 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define FLOAT_SPECIALIZATION
+#include "ocml_helpers.h"
+#include "ocml_helpers_impl.cl"
+#undef FLOAT_SPECIALIZATION
+
+// The arguments must only be variable names
+#define FULL_MUL(A, B, CHI, CLO) \
+    do { \
+        vtype __ha = as_vtype(as_utype(A) & (utype)0xfffff000U); \
+        vtype __ta = A - __ha; \
+        vtype __hb = as_vtype(as_utype(B) & (utype)0xfffff000U); \
+        vtype __tb = B - __hb; \
+        CHI = A * B; \
+        CLO = MATH_MAD(__ta, __tb, MATH_MAD(__ta, __hb, MATH_MAD(__ha, __tb, MATH_MAD(__ha, __hb, -CHI)))); \
+    } while (0)
+
+
+OCML_ATTR vtype
+fnma(vtype a, vtype b, vtype c)
+{
+    vtype d;
+    if (HAVE_FMA32) {
+        d = BUILTIN_FMA_F32(-a, b, c);
+    } else {
+        vtype h, t;
+        FULL_MUL(a, b, h, t);
+        d = c - h;
+        d = (((c - d) - h) - t) + d;
+    }
+    return d;
+}
+
+#undef FULL_MUL
diff --git a/lib/kernel/libclc/ocml_helpers_fp64.cl b/lib/kernel/libclc/ocml_helpers_fp64.cl
new file mode 100644
index 0000000..8f4beea
--- /dev/null
+++ b/lib/kernel/libclc/ocml_helpers_fp64.cl
@@ -0,0 +1,17 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define DOUBLE_SPECIALIZATION
+#include "ocml_helpers.h"
+#include "ocml_helpers_impl.cl"
+#undef DOUBLE_SPECIALIZATION
+
+OCML_ATTR _CL_OVERLOADABLE vtype
+fnma(vtype a, vtype b, vtype c)
+{
+    return BUILTIN_FMA_F64(-a, b, c);
+}
diff --git a/lib/kernel/libclc/ocml_helpers_impl.cl b/lib/kernel/libclc/ocml_helpers_impl.cl
new file mode 100644
index 0000000..74d3529
--- /dev/null
+++ b/lib/kernel/libclc/ocml_helpers_impl.cl
@@ -0,0 +1,435 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+
+#ifndef vtype
+#error vtype must be defined
+#endif
+
+#ifndef v2type
+#error v2type must be defined
+#endif
+
+#if defined(FLOAT_SPECIALIZATION)
+
+#define HIGH(X) as_vtype(as_utype(X) & (utype)0xfffff000U)
+
+#define USE_FMA HAVE_FMA32
+
+#endif
+
+#if defined(DOUBLE_SPECIALIZATION)
+
+#define USE_FMA HAVE_FMA64
+
+#define HIGH(X) as_vtype(as_utype(X) & (utype)0xfffffffff8000000UL)
+
+#endif
+
+
+OCML_ATTR v2type
+con(vtype a, vtype b)
+{
+    v2type c; c.lo = b; c.hi = a;
+    return c;
+}
+
+OCML_ATTR v2type
+csgn(v2type a, vtype b)
+{
+    return con(COPYSIGN(a.hi, b), COPYSIGN(a.lo, b));
+}
+
+OCML_ATTR v2type
+csgn(v2type a, v2type b)
+{
+    return con(COPYSIGN(a.hi, b.hi), COPYSIGN(a.lo, b.lo));
+}
+
+OCML_ATTR v2type
+fadd(vtype a, vtype b)
+{
+    vtype s = a + b;
+    return con(s, b - (s - a));
+}
+
+OCML_ATTR v2type
+nrm(v2type a)
+{
+    return fadd(a.hi, a.lo);
+}
+
+OCML_ATTR v2type
+onrm(v2type a)
+{
+    vtype s = a.hi + a.lo;
+    vtype t = a.lo - (s - a.hi);
+    s = ISINF(a.hi) ? a.hi : s;
+    return con(s, ISINF(s) ? (vtype)0 : t);
+}
+
+OCML_ATTR v2type
+fsub(vtype a, vtype b)
+{
+    vtype d = a - b;
+    return con(d, (a - d) - b);
+}
+
+OCML_ATTR v2type
+add(vtype a, vtype b)
+{
+    vtype s = a + b;
+    vtype d = s - a;
+    return con(s, (a - (s - d)) + (b - d));
+}
+
+OCML_ATTR v2type
+sub(vtype a, vtype b)
+{
+    vtype d = a - b;
+    vtype e = d - a;
+    return con(d, (a - (d - e)) - (b + e));
+}
+
+OCML_ATTR v2type
+mul(vtype a, vtype b)
+{
+    vtype p = a * b;
+    if (USE_FMA) {
+        return con(p, FMA(a, b, -p));
+    } else {
+        vtype ah = HIGH(a);
+        vtype al = a - ah;
+        vtype bh = HIGH(b);
+        vtype bl = b - bh;
+        vtype p = a * b;
+        return con(p, ((ah*bh - p) + ah*bl + al*bh) + al*bl);
+    }
+}
+
+OCML_ATTR v2type
+sqr(vtype a)
+{
+    vtype p = a * a;
+    if (USE_FMA) {
+        return con(p, FMA(a, a, -p));
+    } else {
+        vtype ah = HIGH(a);
+        vtype al = a - ah;
+        return con(p, ((ah*ah - p) + 2.0f*ah*al) + al*al);
+    }
+}
+
+OCML_ATTR v2type
+add(v2type a, vtype b)
+{
+    v2type s = add(a.hi, b);
+    s.lo += a.lo;
+    return nrm(s);
+}
+
+OCML_ATTR v2type
+fadd(v2type a, vtype b)
+{
+    v2type s = fadd(a.hi, b);
+    s.lo += a.lo;
+    return nrm(s);
+}
+
+OCML_ATTR v2type
+add(vtype a, v2type b)
+{
+    v2type s = add(a, b.hi);
+    s.lo += b.lo;
+    return nrm(s);
+}
+
+OCML_ATTR v2type
+fadd(vtype a, v2type b)
+{
+    v2type s = fadd(a, b.hi);
+    s.lo += b.lo;
+    return nrm(s);
+}
+
+OCML_ATTR v2type
+add(v2type a, v2type b)
+{
+    v2type s = add(a.hi, b.hi);
+    v2type t = add(a.lo, b.lo);
+    s.lo += t.hi;
+    s = nrm(s);
+    s.lo += t.lo;
+    return nrm(s);
+}
+
+OCML_ATTR v2type
+fadd(v2type a, v2type b)
+{
+    v2type s = fadd(a.hi, b.hi);
+    s.lo += a.lo + b.lo;
+    return nrm(s);
+}
+
+OCML_ATTR v2type
+sub(v2type a, vtype b)
+{
+    v2type d = sub(a.hi, b);
+    d.lo += a.lo;
+    return nrm(d);
+}
+
+OCML_ATTR v2type
+fsub(v2type a, vtype b)
+{
+    v2type d = fsub(a.hi, b);
+    d.lo += a.lo;
+    return nrm(d);
+}
+
+OCML_ATTR v2type
+sub(vtype a, v2type b)
+{
+    v2type d = sub(a, b.hi);
+    d.lo -= b.lo;
+    return nrm(d);
+}
+
+OCML_ATTR v2type
+fsub(vtype a, v2type b)
+{
+    v2type d = fsub(a, b.hi);
+    d.lo -= b.lo;
+    return nrm(d);
+}
+
+OCML_ATTR v2type
+sub(v2type a, v2type b)
+{
+    v2type d = sub(a.hi, b.hi);
+    v2type e = sub(a.lo, b.lo);
+    d.lo += e.hi;
+    d = nrm(d);
+    d.lo += e.lo;
+    return nrm(d);
+}
+
+OCML_ATTR v2type
+fsub(v2type a, v2type b)
+{
+    v2type d = fsub(a.hi, b.hi);
+    d.lo = d.lo + a.lo - b.lo;
+    return nrm(d);
+}
+
+OCML_ATTR v2type
+ldx(v2type a, int e)
+{
+    return con(LDEXP(a.hi, e), LDEXP(a.lo, e));
+}
+
+OCML_ATTR v2type
+mul(v2type a, vtype b)
+{
+    v2type p = mul(a.hi, b);
+    if (USE_FMA) {
+        p.lo = FMA(a.lo, b, p.lo);
+    } else {
+        p.lo += a.lo * b;
+    }
+    return nrm(p);
+}
+
+OCML_ATTR v2type
+omul(v2type a, vtype b)
+{
+    v2type p = mul(a.hi, b);
+    if (USE_FMA) {
+        p.lo = FMA(a.lo, b, p.lo);
+    } else {
+        p.lo += a.lo * b;
+    }
+    return onrm(p);
+}
+
+OCML_ATTR v2type
+mul(vtype a, v2type b)
+{
+    v2type p = mul(a, b.hi);
+    if (USE_FMA) {
+        p.lo = FMA(a, b.lo, p.lo);
+    } else {
+        p.lo += a * b.lo;
+    }
+    return nrm(p);
+}
+
+OCML_ATTR v2type
+omul(vtype a, v2type b)
+{
+    v2type p = mul(a, b.hi);
+    if (USE_FMA) {
+        p.lo = FMA(a, b.lo, p.lo);
+    } else {
+        p.lo += a * b.lo;
+    }
+    return onrm(p);
+}
+
+OCML_ATTR v2type
+mul(v2type a, v2type b)
+{
+    v2type p = mul(a.hi, b.hi);
+    if (USE_FMA) {
+        p.lo += FMA(a.hi, b.lo, a.lo*b.hi);
+    } else {
+        p.lo += a.hi*b.lo + a.lo*b.hi;
+    }
+    return nrm(p);
+}
+
+OCML_ATTR v2type
+omul(v2type a, v2type b)
+{
+    v2type p = mul(a.hi, b.hi);
+    if (USE_FMA) {
+        p.lo += FMA(a.hi, b.lo, a.lo*b.hi);
+    } else {
+        p.lo += a.hi*b.lo + a.lo*b.hi;
+    }
+    return onrm(p);
+}
+
+OCML_ATTR v2type
+div(vtype a, vtype b)
+{
+    vtype r = RCP(b);
+    vtype qhi = a * r;
+    v2type p = mul(qhi, b);
+    v2type d = fsub(a, p.hi);
+    d.lo -= p.lo;
+    vtype qlo = (d.hi + d.lo) * r;
+    return fadd(qhi, qlo);
+}
+
+OCML_ATTR v2type
+div(v2type a, vtype b)
+{
+    vtype r = RCP(b);
+    vtype qhi = a.hi * r;
+    v2type p = mul(qhi, b);
+    v2type d = fsub(a.hi, p.hi);
+    d.lo = d.lo + a.lo - p.lo;
+    vtype qlo = (d.hi + d.lo) * r;
+    return fadd(qhi, qlo);
+}
+
+OCML_ATTR v2type
+div(vtype a, v2type b)
+{
+    vtype r = RCP(b.hi);
+    vtype qhi = a * r;
+    v2type p = mul(qhi, b);
+    v2type d = fsub(a, p.hi);
+    d.lo -= p.lo;
+    vtype qlo = (d.hi + d.lo) * r;
+    return fadd(qhi, qlo);
+}
+
+OCML_ATTR v2type
+fdiv(v2type a, v2type b)
+{
+    vtype r = RCP(b.hi);
+    vtype qhi = a.hi * r;
+    v2type p = mul(qhi, b);
+    v2type d = fsub(a.hi, p.hi);
+    d.lo = d.lo - p.lo + a.lo;
+    vtype qlo = (d.hi + d.lo) * r;
+    return fadd(qhi, qlo);
+}
+
+OCML_ATTR v2type
+div(v2type a, v2type b)
+{
+    vtype y = RCP(b.hi);
+    vtype qhi = a.hi * y;
+    v2type r = fsub(a, mul(qhi, b));
+    vtype qmi = r.hi * y;
+    r = fsub(r, mul(qmi, b));
+    vtype qlo = r.hi * y;
+    v2type q = fadd(qhi, qmi);
+    q.lo += qlo;
+    return nrm(q);
+}
+
+OCML_ATTR v2type
+rcp(vtype b)
+{
+    vtype qhi = RCP(b);
+    v2type p = mul(qhi, b);
+    v2type d = fsub((vtype)1, p.hi);
+    d.lo -= p.lo;
+    vtype qlo = (d.hi + d.lo) * qhi;
+    return fadd(qhi, qlo);
+}
+
+OCML_ATTR v2type
+frcp(v2type b)
+{
+    vtype qhi = RCP(b.hi);
+    v2type p = mul(qhi, b);
+    v2type d = fsub((vtype)1, p.hi);
+    d.lo -= p.lo;
+    vtype qlo = (d.hi + d.lo) * qhi;
+    return fadd(qhi, qlo);
+}
+
+OCML_ATTR v2type
+rcp(v2type b)
+{
+    vtype qhi = RCP(b.hi);
+    v2type r = fsub((vtype)1, mul(qhi, b));
+    vtype qmi = r.hi * qhi;
+    r = fsub(r, mul(qmi, b));
+    vtype qlo = r.hi * qhi;
+    v2type q = fadd(qhi, qmi);
+    q.lo += qlo;
+    return nrm(q);
+}
+
+OCML_ATTR v2type
+sqr(v2type a)
+{
+    v2type p = sqr(a.hi);
+    if (USE_FMA) {
+        p.lo = FMA(a.lo, a.lo, FMA(a.hi, (vtype)2*a.lo, p.lo));
+    } else {
+        p.lo = p.lo + a.hi * a.lo * (vtype)2 + a.lo * a.lo;
+    }
+    return fadd(p.hi, p.lo);
+}
+
+OCML_ATTR v2type
+root2(vtype a)
+{
+    vtype shi = SQRT(a);
+    v2type e = fsub(a, sqr(shi));
+    vtype slo = DIV(e.hi, (vtype)2 * shi);
+    return fadd(shi, slo);
+}
+
+OCML_ATTR v2type
+root2(v2type a)
+{
+    vtype shi = SQRT(a.hi);
+    v2type e = fsub(a, sqr(shi));
+    vtype slo = DIV(e.hi, (vtype)2 * shi);
+    return fadd(shi, slo);
+}
+
+#undef USE_FMA
+#undef HIGH
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/libclc/pocl_fma_fp32.cl
similarity index 77%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/libclc/pocl_fma_fp32.cl
index 3c75ca1..f0c0247 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/libclc/pocl_fma_fp32.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: pocl_fma()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,12 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
+#ifdef HAVE_FMA32
+
+_CL_OVERLOADABLE vtype pocl_fma(vtype x, vtype y, vtype z) { return fma(x, y, z); }
+
+#else
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+_CL_OVERLOADABLE vtype pocl_fma(vtype x, vtype y, vtype z) { return (x*y + z); }
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+#endif
\ No newline at end of file
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/libclc/pocl_fma_fp64.cl
similarity index 77%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/libclc/pocl_fma_fp64.cl
index 3c75ca1..8c8bf2f 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/libclc/pocl_fma_fp64.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: pocl_fma()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,12 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
+#ifdef HAVE_FMA64
+
+_CL_OVERLOADABLE vtype pocl_fma(vtype x, vtype y, vtype z) { return fma(x, y, z); }
+
+#else
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+_CL_OVERLOADABLE vtype pocl_fma(vtype x, vtype y, vtype z) { return (x*y + z); }
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+#endif
\ No newline at end of file
diff --git a/lib/kernel/libclc/pow_base_fp32.cl b/lib/kernel/libclc/pow_base_fp32.cl
new file mode 100644
index 0000000..2323683
--- /dev/null
+++ b/lib/kernel/libclc/pow_base_fp32.cl
@@ -0,0 +1,204 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml_helpers.h"
+#include "singlevec.h"
+
+extern _CL_OVERLOADABLE v2type MATH_PRIVATE(epln)(vtype);
+extern _CL_OVERLOADABLE vtype MATH_PRIVATE(expep)(v2type);
+
+vtype
+#if defined(COMPILING_POWR)
+MATH_MANGLE(powr)(vtype x, vtype y)
+#elif defined(COMPILING_POWN)
+MATH_MANGLE(pown)(vtype x, itype ny)
+#elif defined(COMPILING_ROOTN)
+MATH_MANGLE(rootn)(vtype x, itype ny)
+#else
+MATH_MANGLE(pow)(vtype x, vtype y)
+#endif
+{
+
+    vtype ax = BUILTIN_ABS_F32(x);
+
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    itype nyh = ny & (itype)0xffff0000;
+    v2type y = fadd(convert_vtype(nyh), convert_vtype(ny - nyh));
+#if defined(COMPILING_ROOTN)
+    y = rcp(y);
+#endif
+#else
+    vtype ay = BUILTIN_ABS_F32(y);
+    // flush denorm y to zero
+    itype is_denorm = (as_itype(ay) < (itype)IMPBIT_SP32);
+    y = (is_denorm) ? as_vtype(as_itype(y) & (itype)SIGNBIT_SP32) : y;
+#endif
+
+    vtype ret = MATH_PRIVATE(expep)(omul(y, MATH_PRIVATE(epln)(ax)));
+
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    itype is_odd = (ny << 31);
+#else
+
+    vtype tay = BUILTIN_TRUNC_F32(ay);
+    itype is_int = (ay == tay);
+    vtype unused;
+    itype is_odd = is_int ? (fract(tay*0.5f, &unused) != vZERO_SP32) : (itype)0;
+
+#ifdef SINGLEVEC
+    if (is_odd && (ax > x))
+      ret = copysign(ret, -0.0f);
+#else
+    ret = copysign(ret, as_vtype(is_odd & (ax > x)));
+#endif
+
+#endif
+
+
+
+    // edge cases
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    vtype ret0 = (ny > (itype)0) ? vZERO_SP32 : vINFINITY_SP32;
+    vtype retI = (ny > (itype)0) ? vINFINITY_SP32 : vZERO_SP32;
+    ret = (ax == vZERO_SP32) ? ret0 : ret;
+    ret = (isinf(x)) ? retI : ret;
+
+    itype xneg = as_itype(x) & (itype)SIGNBIT_SP32;
+    ret = (is_odd & xneg) ? copysign(ret, x) : ret;
+
+#if defined(COMPILING_POWN)
+    ret = (ny == (itype)0) ? vONE_SP32 : ret;
+#elif defined(COMPILING_ROOTN)
+    ret = (SV_NOT(is_odd) SV_AND (x < vZERO_SP32)) ? vNAN_SP32 : ret;
+    ret = (ny == (itype)0) ? vNAN_SP32 : ret;
+#endif
+
+    return ret;
+
+#else /* POW / POWR */
+
+    itype ax_eq_0 = (as_itype(ax) == as_itype(vZERO_SP32));
+    itype ay_eq_0 = (as_itype(ay) == as_itype(vZERO_SP32));
+
+    itype ax_lt_1 = (ax < vONE_SP32);
+    itype ax_gt_1 = (ax > vONE_SP32);
+
+    itype ax_eq_nan = (isnan(ax));
+    itype ay_eq_nan = (isnan(ay));
+    itype ay_eq_pinf = (ay == vINFINITY_SP32);
+
+#ifdef SINGLEVEC
+    itype y_pos = (as_itype(y) & (itype)SIGNBIT_SP32) == 0;
+    itype x_neg = (as_itype(x) & (itype)SIGNBIT_SP32) != 0;
+    itype y_neg = !y_pos;
+#else
+    itype x_neg = (as_itype(x) & (itype)SIGNBIT_SP32);
+    itype y_neg = (as_itype(y) & (itype)SIGNBIT_SP32);
+    itype y_pos = (y_neg ^ (itype)SIGNBIT_SP32);
+#endif
+
+    itype y_eq_ninf = (y == vNINFINITY_SP32);
+    itype y_eq_pinf = (y == vINFINITY_SP32);
+
+
+#endif /* POW / POWR */
+
+#if defined(COMPILING_POWR)
+
+    itype ax_eq_pinf = (ax == vINFINITY_SP32);
+    itype ax_eq_1 = (ax == vONE_SP32);
+    itype ay_lt_inf = (ay < vINFINITY_SP32);
+    itype ax_lt_pinf = (ax < vINFINITY_SP32);
+    itype ax_ne_0 = SV_NOT(ax_eq_0);
+
+    #ifndef FINITE_MATH_ONLY
+        ret = (ax_lt_1 & y_eq_ninf) ? vINFINITY_SP32 : ret;
+        ret = (ax_lt_1 & y_eq_pinf) ? vZERO_SP32 : ret;
+        ret = (ax_eq_1 & ay_lt_inf) ? vONE_SP32 : ret;
+        ret = (ax_eq_1 & ay_eq_pinf) ? vNAN_SP32 : ret;
+        ret = (ax_gt_1 & y_eq_ninf) ? vZERO_SP32 : ret;
+        ret = (ax_gt_1 & y_eq_pinf) ? vINFINITY_SP32 : ret;
+
+        ret = (ax_lt_pinf & ay_eq_0) ? vONE_SP32 : ret;
+
+        ret = (ax_eq_pinf & y_neg) ? vZERO_SP32 : ret;
+        ret = (ax_eq_pinf & y_pos) ? vINFINITY_SP32 : ret;
+        ret = (ax_eq_pinf & y_eq_pinf) ? vINFINITY_SP32 : ret;
+        ret = (ax_eq_pinf & ay_eq_0) ? vNAN_SP32 : ret;
+
+        ret = (ax_eq_0 & y_neg) ? vINFINITY_SP32 : ret;
+        ret = (ax_eq_0 & y_pos) ? vZERO_SP32 : ret;
+        ret = (ax_eq_0 & ay_eq_0) ? vNAN_SP32 : ret;
+        ret = (ax_ne_0 & x_neg) ? vNAN_SP32 : ret;
+
+        ret = (ax_eq_nan) ? x : ret;
+        ret = (ay_eq_nan) ? y : ret;
+
+    #else
+        ret = (ax_eq_1) ? vONE_SP32 : ret;
+        ret = (ay_eq_0) ? vONE_SP32 : ret;
+        ret = (ax_eq_0 & y_pos) ? vZERO_SP32 : ret;
+    #endif
+
+    return ret;
+#endif
+
+#if defined(COMPILING_POW)
+
+    itype is_not_int = SV_NOT(is_int);
+    itype is_not_odd = SV_NOT(is_odd);
+
+    itype x_eq_ninf = (x == vNINFINITY_SP32);
+    itype x_eq_pinf = (x == vINFINITY_SP32);
+
+    #ifndef FINITE_MATH_ONLY
+        vtype xinf = copysign(vINFINITY_SP32, x);
+        vtype xzero = copysign(vZERO_SP32, x);
+
+        ret = (x_neg & is_not_int) ? vNAN_SP32 : ret;
+
+        ret = (ax_lt_1 & y_eq_ninf) ? vINFINITY_SP32 : ret;
+        ret = (ax_gt_1 & y_eq_ninf) ? vZERO_SP32 : ret;
+        ret = (ax_lt_1 & y_eq_pinf) ? vZERO_SP32 : ret;
+        ret = (ax_gt_1 & y_eq_pinf) ? vINFINITY_SP32 : ret;
+
+        ret = (ax_eq_0 & y_neg & is_odd) ? xinf : ret;
+        ret = (ax_eq_0 & y_neg & is_not_odd) ? vINFINITY_SP32 : ret;
+        ret = (ax_eq_0 & y_pos & is_odd) ? xzero : ret;
+        ret = (ax_eq_0 & y_pos & is_not_odd) ? vZERO_SP32 : ret;
+        ret = (ax_eq_0 & y_eq_ninf) ? vINFINITY_SP32 : ret;
+
+        ret = ((x == (vtype)-1.0f) & ay_eq_pinf) ? vONE_SP32 : ret;
+
+        ret = (x_eq_ninf & y_neg & is_odd) ? (vtype)-0.0f : ret;
+        ret = (x_eq_ninf & y_neg & is_not_odd) ? vZERO_SP32 : ret;
+        ret = (x_eq_ninf & y_pos & is_odd) ? vNINFINITY_SP32 : ret;
+        ret = (x_eq_ninf & y_pos & is_not_odd) ? vINFINITY_SP32 : ret;
+        ret = (x_eq_pinf & y_neg) ? vZERO_SP32 : ret;
+        ret = (x_eq_pinf & y_pos) ? vINFINITY_SP32 : ret;
+        ret = (ax_eq_nan) ? x : ret;
+        ret = (ay_eq_nan) ? y : ret;
+
+    #else
+        // XXX work around conformance test incorrectly checking these cases
+        vtype xinf = copysign(vINFINITY_SP32, x);
+        ret = (ax_eq_0 & y_neg & is_odd) ? xinf : ret;
+        ret = (ax_eq_0 & y_neg & is_not_odd) ? vINFINITY_SP32 : ret;
+
+        vtype xzero = copysign(0.0f, x);
+        ret = (ax_eq_0 & y_pos & is_odd) ? xzero : ret;
+        ret = (ax_eq_0 & y_pos & is_not_odd) ? vZERO_SP32 : ret;
+    #endif
+
+    ret = ay_eq_0 ? vONE_SP32 : ret;
+    ret = (x == vONE_SP32) ? vONE_SP32 : ret;
+    return ret;
+
+
+#endif
+
+}
diff --git a/lib/kernel/libclc/pow_base_fp64.cl b/lib/kernel/libclc/pow_base_fp64.cl
new file mode 100644
index 0000000..153e5b4
--- /dev/null
+++ b/lib/kernel/libclc/pow_base_fp64.cl
@@ -0,0 +1,198 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml_helpers.h"
+#include "singlevec.h"
+
+extern _CL_OVERLOADABLE v2type MATH_PRIVATE(epln)(vtype);
+extern _CL_OVERLOADABLE vtype MATH_PRIVATE(expep)(v2type);
+
+vtype
+#if defined(COMPILING_POWR)
+MATH_MANGLE(powr)(vtype x, vtype y)
+#elif defined(COMPILING_POWN)
+MATH_MANGLE(pown)(vtype x, inttype ny)
+#elif defined(COMPILING_ROOTN)
+MATH_MANGLE(rootn)(vtype x, inttype ny)
+#else
+MATH_MANGLE(pow)(vtype x, vtype y)
+#endif
+{
+
+    vtype ax = BUILTIN_ABS_F64(x);
+
+#if defined(COMPILING_POWN)
+    vtype y = convert_vtype(ny);
+#elif defined(COMPILING_ROOTN)
+    v2type y = rcp(convert_vtype(ny));
+#else
+    vtype ay = BUILTIN_ABS_F64(y);
+    // flush denorm y to zero
+    itype is_denorm = (as_itype(ay) < (itype)IMPBIT_DP64);
+    y = (is_denorm) ? as_vtype(as_itype(y) & (itype)SIGNBIT_DP64) : y;
+#endif
+
+    vtype ret = MATH_PRIVATE(expep)(omul(y, MATH_PRIVATE(epln)(ax)));
+
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    itype nyi = convert_itype(ny);
+    itype is_odd = (nyi << 63);
+#else
+    vtype tay = BUILTIN_TRUNC_F64(ay);
+    itype is_int = (ay == tay);
+    vtype unused;
+    itype is_odd = is_int ? (fract(tay*0.5, &unused) != vZERO_DP64) : (itype)0;
+
+#ifdef SINGLEVEC
+    if (is_odd && (ax > x))
+      ret = copysign(ret, -0.0);
+#else
+    ret = copysign(ret, as_vtype(is_odd & (ax > x)));
+#endif
+
+#endif
+
+
+    // edge cases
+#if defined(COMPILING_POWN) || defined(COMPILING_ROOTN)
+    vtype ret0 = (nyi > (itype)0) ? (vtype)0.0 : vINFINITY_DP64;
+    vtype retI = (nyi > (itype)0) ? vINFINITY_DP64 : (vtype)0.0f;
+    ret = (ax > (vtype)0.0) ? ret : ret0;
+    ret = (isinf(x)) ? retI : ret;
+    ret = (isnan(x)) ? x : ret;
+
+    itype xneg = as_itype(x) & (itype)SIGNBIT_DP64;
+    ret = (is_odd & xneg) ? copysign(ret, x) : ret;
+
+#if defined(COMPILING_POWN)
+    ret = (nyi == (itype)0) ? (vtype)1.0 : ret;
+#elif defined(COMPILING_ROOTN)
+    ret = (SV_NOT(is_odd) SV_AND (x < (vtype)0.0)) ? vNAN_DP64 : ret;
+    ret = (nyi == (itype)0) ? vNAN_DP64: ret;
+#endif
+
+    return ret;
+#else  /* POW / POWR */
+    itype ax_eq_0 = (as_itype(ax) == as_itype(vZERO_DP64));
+    itype ay_eq_0 = (as_itype(ay) == as_itype(vZERO_DP64));
+
+    itype ax_lt_1 = (ax < vONE_DP64);
+    itype ax_gt_1 = (ax > vONE_DP64);
+
+    itype ax_eq_nan = (isnan(ax));
+    itype ay_eq_nan = (isnan(ay));
+    itype ay_eq_pinf = (ay == vINFINITY_DP64);
+
+#ifdef SINGLEVEC
+    itype y_pos = (as_itype(y) & (itype)SIGNBIT_DP64) == 0;
+    itype x_neg = (as_itype(x) & (itype)SIGNBIT_DP64) != 0;
+    itype y_neg = !y_pos;
+#else
+    itype x_neg = (as_itype(x) & (itype)SIGNBIT_DP64);
+    itype y_neg = (as_itype(y) & (itype)SIGNBIT_DP64);
+    itype y_pos = (y_neg ^ (itype)SIGNBIT_DP64);
+#endif
+
+    itype y_eq_ninf = (y == vNINFINITY_DP64);
+    itype y_eq_pinf = (y == vINFINITY_DP64);
+
+#endif /* POW / POWR */
+
+
+#if defined(COMPILING_POWR)
+
+    itype ax_eq_pinf = (ax == vINFINITY_DP64);
+    itype ax_eq_1 = (ax == vONE_DP64);
+    itype ay_lt_inf = (ay < vINFINITY_DP64);
+    itype ax_lt_pinf = (ax < vINFINITY_DP64);
+    itype ax_ne_0 = SV_NOT(ax_eq_0);
+
+    #ifndef FINITE_MATH_ONLY
+        ret = (ax_lt_1 & y_eq_ninf) ? vINFINITY_DP64 : ret;
+        ret = (ax_lt_1 & y_eq_pinf) ? vZERO_DP64 : ret;
+        ret = (ax_eq_1 & ay_lt_inf) ? vONE_DP64 : ret;
+        ret = (ax_eq_1 & ay_eq_pinf) ? vNAN_DP64 : ret;
+        ret = (ax_gt_1 & y_eq_ninf) ? vZERO_DP64 : ret;
+        ret = (ax_gt_1 & y_eq_pinf) ? vINFINITY_DP64 : ret;
+
+        ret = (ax_lt_pinf & ay_eq_0) ? vONE_DP64 : ret;
+
+        ret = (ax_eq_pinf & y_neg) ? vZERO_DP64 : ret;
+        ret = (ax_eq_pinf & y_pos) ? vINFINITY_DP64 : ret;
+        ret = (ax_eq_pinf & y_eq_pinf) ? vINFINITY_DP64 : ret;
+        ret = (ax_eq_pinf & ay_eq_0) ? vNAN_DP64 : ret;
+
+        ret = (ax_eq_0 & y_neg) ? vINFINITY_DP64 : ret;
+        ret = (ax_eq_0 & y_pos) ? vZERO_DP64 : ret;
+        ret = (ax_eq_0 & ay_eq_0) ? vNAN_DP64 : ret;
+        ret = (ax_ne_0 & x_neg) ? vNAN_DP64 : ret;
+
+        ret = (ax_eq_nan) ? x : ret;
+        ret = (ay_eq_nan) ? y : ret;
+    #else
+        ret = (ax_eq_1) ? vONE_DP64 : ret;
+        ret = (ay_eq_0) ? vONE_DP64 : ret;
+        ret = (ax_eq_0 & y_pos) ? vZERO_DP64 : ret;
+    #endif
+
+    return ret;
+#endif
+
+#if defined(COMPILING_POW)
+
+    itype is_not_int = SV_NOT(is_int);
+    itype is_not_odd = SV_NOT(is_odd);
+
+    itype x_eq_ninf = (x == vNINFINITY_DP64);
+    itype x_eq_pinf = (x == vINFINITY_DP64);
+
+    #ifndef FINITE_MATH_ONLY
+        vtype xinf = copysign(vINFINITY_DP64, x);
+        vtype xzero = copysign(vZERO_DP64, x);
+
+        ret = (x_neg & is_not_int) ? vNAN_DP64 : ret;
+
+        ret = (ax_lt_1 & y_eq_ninf) ? vINFINITY_DP64 : ret;
+        ret = (ax_gt_1 & y_eq_ninf) ? vZERO_DP64 : ret;
+        ret = (ax_lt_1 & y_eq_pinf) ? vZERO_DP64 : ret;
+        ret = (ax_gt_1 & y_eq_pinf) ? vINFINITY_DP64 : ret;
+
+        ret = (ax_eq_0 & y_neg & is_odd) ? xinf : ret;
+        ret = (ax_eq_0 & y_neg & is_not_odd) ? vINFINITY_DP64 : ret;
+        ret = (ax_eq_0 & y_pos & is_odd) ? xzero : ret;
+        ret = (ax_eq_0 & y_pos & is_not_odd) ? vZERO_DP64 : ret;
+        ret = (ax_eq_0 & y_eq_ninf) ? vINFINITY_DP64 : ret;
+
+        ret = ((x == (vtype)(-1.0)) & ay_eq_pinf) ? vONE_DP64 : ret;
+
+        ret = (x_eq_ninf & y_neg & is_odd) ? (vtype)(-0.0) : ret;
+        ret = (x_eq_ninf & y_neg & is_not_odd) ? vZERO_DP64 : ret;
+        ret = (x_eq_ninf & y_pos & is_odd) ? vNINFINITY_DP64 : ret;
+        ret = (x_eq_ninf & y_pos & is_not_odd) ? vINFINITY_DP64 : ret;
+        ret = (x_eq_pinf & y_neg) ? vZERO_DP64 : ret;
+        ret = (x_eq_pinf & y_pos) ? vINFINITY_DP64 : ret;
+        ret = (ax_eq_nan) ? x : ret;
+        ret = (ay_eq_nan) ? y : ret;
+
+    #else
+        // XXX work around conformance test incorrectly checking these cases
+        vtype xinf = copysign(vINFINITY_DP64, x);
+        ret = (ax_eq_0 & y_neg & is_odd) ? xinf : ret;
+        ret = (ax_eq_0 & y_neg & is_not_odd) ? vINFINITY_DP64 : ret;
+
+        vtype xzero = copysign(0.0f, x);
+        ret = (ax_eq_0 & y_pos & is_odd) ? xzero : ret;
+        ret = (ax_eq_0 & y_pos & is_not_odd) ? vZERO_DP64 : ret;
+    #endif
+
+    ret = ay_eq_0 ? vONE_DP64 : ret;
+    ret = (x == vONE_DP64) ? vONE_DP64 : ret;
+    return ret;
+
+#endif
+
+}
diff --git a/lib/kernel/libclc/pow_fp32.cl b/lib/kernel/libclc/pow_fp32.cl
new file mode 100644
index 0000000..1519278
--- /dev/null
+++ b/lib/kernel/libclc/pow_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_POW
+
+#include "pow_base_fp32.cl"
+
+#undef COMPILING_POW
diff --git a/lib/kernel/libclc/pow_fp64.cl b/lib/kernel/libclc/pow_fp64.cl
new file mode 100644
index 0000000..88d7022
--- /dev/null
+++ b/lib/kernel/libclc/pow_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_POW
+
+#include "pow_base_fp64.cl"
+
+#undef COMPILING_POW
diff --git a/lib/kernel/libclc/pow_helpers_fp32.cl b/lib/kernel/libclc/pow_helpers_fp32.cl
new file mode 100644
index 0000000..2b74ca9
--- /dev/null
+++ b/lib/kernel/libclc/pow_helpers_fp32.cl
@@ -0,0 +1,58 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml_helpers.h"
+
+v2type _CL_OVERLOADABLE
+MATH_PRIVATE(epln)(vtype a)
+{
+    vtype m = BUILTIN_FREXP_MANT_F32(a);
+    itype b = (m < (vtype)(2.0f/3.0f)) ? (itype)1 : (itype)0;
+    m = BUILTIN_FLDEXP_F32(m, b);
+    itype e = BUILTIN_FREXP_EXP_F32(a) - b;
+
+    v2type x = div(m - (vtype)1.0f, add(m, (vtype)1.0f));
+    v2type s = sqr(x);
+    vtype t = s.hi;
+    vtype p = MATH_MAD(t, MATH_MAD(t, (vtype)0x1.ed89c2p-3f,
+                      (vtype)0x1.23e988p-2f), (vtype)0x1.999bdep-2f);
+
+    // ln(2)*e + 2*x + x^3(c3 + x^2*p)
+    v2type r = add(mul(con((vtype)0x1.62e430p-1f, (vtype)-0x1.05c610p-29f),
+                 convert_vtype(e)),
+                   fadd(ldx(x,1),
+                      mul(mul(s, x),
+                        fadd(con((vtype)0x1.555554p-1f,
+                                 (vtype)0x1.e72020p-29f),
+                             mul(s, p)))));
+
+    return r;
+}
+
+
+vtype _CL_OVERLOADABLE
+MATH_PRIVATE(expep)(v2type x)
+{
+    vtype fn = BUILTIN_RINT_F32(x.hi * 0x1.715476p+0f);
+    v2type t = fsub(fsub(sub(x, fn*0x1.62e400p-1f), fn*0x1.7f7800p-20f), fn*0x1.473de6p-34f);
+
+    vtype th = t.hi;
+    vtype p = MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th,
+                  (vtype)0x1.6850e4p-10f, (vtype)0x1.123bccp-7f),
+                  (vtype)0x1.555b98p-5f), (vtype)0x1.55548ep-3f),
+                  (vtype)0x1.fffff8p-2f);
+
+    v2type r = fadd(t, mul(sqr(t), p));
+    vtype z = (vtype)1.0 + r.hi;
+
+    z = BUILTIN_FLDEXP_F32(z, convert_inttype(fn));
+
+    z = (x.hi > (vtype)89.0f) ? as_vtype((utype)PINFBITPATT_SP32) : z;
+    z = (x.hi < (vtype)-104.0f) ? (vtype)0.0f : z;
+
+    return z;
+}
diff --git a/lib/kernel/libclc/pow_helpers_fp64.cl b/lib/kernel/libclc/pow_helpers_fp64.cl
new file mode 100644
index 0000000..45e119e
--- /dev/null
+++ b/lib/kernel/libclc/pow_helpers_fp64.cl
@@ -0,0 +1,65 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml_helpers.h"
+
+v2type _CL_OVERLOADABLE
+MATH_PRIVATE(epln)(vtype a)
+{
+    vtype m = BUILTIN_FREXP_MANT_F64(a);
+    itype b = (m < (vtype)(2.0/3.0)) ? (itype)1 : (itype)0;
+    m = BUILTIN_FLDEXP_F64(m, convert_inttype(b));
+    itype e = BUILTIN_FREXP_EXP_F64(a) - b;
+
+    v2type x = div(m - (vtype)1.0, add(m, (vtype)1.0));
+    v2type s = sqr(x);
+    vtype t = s.hi;
+    vtype p = MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
+               MATH_MAD(t, MATH_MAD(t, MATH_MAD(t, MATH_MAD(t,
+                   (vtype)0x1.dee674222de17p-4, (vtype)0x1.a6564968915a9p-4),
+                   (vtype)0x1.e25e43abe935ap-4), (vtype)0x1.110ef47e6c9c2p-3),
+                   (vtype)0x1.3b13bcfa74449p-3), (vtype)0x1.745d171bf3c30p-3),
+                   (vtype)0x1.c71c71c7792cep-3), (vtype)0x1.24924924920dap-2),
+                   (vtype)0x1.999999999999cp-2);
+
+    // ln(2)*e + 2*x + x^3(c3 + x^2*p)
+    v2type r = add(mul(con((vtype)0x1.62e42fefa39efp-1, (vtype)0x1.abc9e3b39803fp-56), convert_vtype(e)),
+                    fadd(ldx(x,1),
+                          mul(mul(s, x),
+                              fadd(con((vtype)0x1.5555555555555p-1,(vtype)0x1.543b0d5df274dp-55),
+                                   mul(s, p)))));
+
+    return r;
+}
+
+vtype _CL_OVERLOADABLE
+MATH_PRIVATE(expep)(v2type x)
+{
+    vtype dn = BUILTIN_RINT_F64(x.hi * 0x1.71547652b82fep+0);
+    v2type t = fsub(fsub(sub(x, dn*0x1.62e42fefa3000p-1),
+                  dn*0x1.3de6af278e000p-42), dn*0x1.9cc01f97b57a0p-83);
+
+    vtype th = t.hi;
+    vtype p = MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th,
+               MATH_MAD(th, MATH_MAD(th, MATH_MAD(th, MATH_MAD(th,
+               MATH_MAD(th,
+                   (vtype)0x1.ade156a5dcb37p-26, (vtype)0x1.28af3fca7ab0cp-22),
+                   (vtype)0x1.71dee623fde64p-19), (vtype)0x1.a01997c89e6b0p-16),
+                   (vtype)0x1.a01a014761f6ep-13), (vtype)0x1.6c16c1852b7b0p-10),
+                   (vtype)0x1.1111111122322p-7), (vtype)0x1.55555555502a1p-5),
+                   (vtype)0x1.5555555555511p-3), (vtype)0x1.000000000000bp-1);
+
+    v2type r = fadd(t, mul(sqr(t), p));
+    vtype z = (vtype)1.0 + r.hi;
+
+    z = BUILTIN_FLDEXP_F64(z, convert_inttype(dn));
+
+    z = (x.hi > (vtype)710.0) ? as_vtype((utype)PINFBITPATT_DP64) : z;
+    z = (x.hi < (vtype)-745.0) ? (vtype)0.0 : z;
+
+    return z;
+}
diff --git a/lib/kernel/libclc/pown_fp32.cl b/lib/kernel/libclc/pown_fp32.cl
new file mode 100644
index 0000000..c905d16
--- /dev/null
+++ b/lib/kernel/libclc/pown_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_POWN
+
+#include "pow_base_fp32.cl"
+
+#undef COMPILING_POWN
diff --git a/lib/kernel/libclc/pown_fp64.cl b/lib/kernel/libclc/pown_fp64.cl
new file mode 100644
index 0000000..9084277
--- /dev/null
+++ b/lib/kernel/libclc/pown_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_POWN
+
+#include "pow_base_fp64.cl"
+
+#undef COMPILING_POWN
diff --git a/lib/kernel/libclc/powr_fp32.cl b/lib/kernel/libclc/powr_fp32.cl
new file mode 100644
index 0000000..de0eba0
--- /dev/null
+++ b/lib/kernel/libclc/powr_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_POWR
+
+#include "pow_base_fp32.cl"
+
+#undef COMPILING_POWR
diff --git a/lib/kernel/libclc/powr_fp64.cl b/lib/kernel/libclc/powr_fp64.cl
new file mode 100644
index 0000000..a546a87
--- /dev/null
+++ b/lib/kernel/libclc/powr_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_POWR
+
+#include "pow_base_fp64.cl"
+
+#undef COMPILING_POWR
diff --git a/lib/kernel/libclc/radians_fp32.cl b/lib/kernel/libclc/radians_fp32.cl
new file mode 100644
index 0000000..0269683
--- /dev/null
+++ b/lib/kernel/libclc/radians_fp32.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype radians(vtype degrees) {
+  // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F
+  return (vtype)0x1.1df46ap-6F * degrees;
+}
diff --git a/lib/kernel/libclc/radians_fp64.cl b/lib/kernel/libclc/radians_fp64.cl
new file mode 100644
index 0000000..de191cc
--- /dev/null
+++ b/lib/kernel/libclc/radians_fp64.cl
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype radians(vtype degrees) {
+  // pi/180 = ~0.01745329251994329577 or 0x1.1df46a2529d39p-6 or 0x1.1df46ap-6F
+  return (vtype)0x1.1df46a2529d39p-6 * degrees;
+}
diff --git a/lib/kernel/libclc/remainder_base_fp32.cl b/lib/kernel/libclc/remainder_base_fp32.cl
new file mode 100644
index 0000000..d10508d
--- /dev/null
+++ b/lib/kernel/libclc/remainder_base_fp32.cl
@@ -0,0 +1,188 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml_helpers.h"
+
+// How many bits of the quotient per iteration
+#define X_GT_Y_BITS (itype)12
+#define BITS 12
+
+#define APPEND_AS(x, y) APPEND_AS2(x, y)
+#define APPEND_AS2(x, y) x ## y
+
+#if defined(COMPILING_REMQUO)
+#define X_GT_Y APPEND_AS(x_gt_y_remquo_, ADDRSPACE)
+#define X_LT_Y APPEND_AS(x_lt_y_remquo_, ADDRSPACE)
+#elif defined(COMPILING_FMOD)
+#define X_GT_Y x_gt_y_fmod
+#define X_LT_Y x_lt_y_fmod
+#else //remainder
+#define X_GT_Y x_gt_y_remainder
+#define X_LT_Y x_lt_y_remainder
+#endif
+
+#if defined(COMPILING_REMQUO)
+OCML_ATTR vtype X_GT_Y(vtype x, vtype y, vtype ax, vtype ay, itype *q7) {
+#else
+OCML_ATTR vtype X_GT_Y(vtype x, vtype y, vtype ax, vtype ay) {
+#endif
+    itype ex, ey;
+
+    ex = BUILTIN_FREXP_EXP_F32(ax) - (itype)1;
+    ax = BUILTIN_FLDEXP_F32(BUILTIN_FREXP_MANT_F32(ax), X_GT_Y_BITS);
+    ey = BUILTIN_FREXP_EXP_F32(ay) - (itype)1;
+    ay = BUILTIN_FLDEXP_F32(BUILTIN_FREXP_MANT_F32(ay), 1);
+    vtype axN = ax;
+
+    itype nb = ex - ey;
+    itype nbN = nb;
+    vtype ayinv = MATH_FAST_RCP(ay);
+
+#if !defined(COMPILING_FMOD)
+    itype qacc = (itype)0;
+#endif
+#if defined(COMPILING_REMQUO)
+    itype qaccN = qacc;
+#endif
+
+    while (SV_ANY(nb > X_GT_Y_BITS)) {
+        vtype q = BUILTIN_RINT_F32(ax * ayinv);
+        axN = fnma(q, ay, ax);
+        itype clt = (axN < (vtype)0.0f);
+        axN = clt ? (axN + ay) : axN;
+#if defined(COMPILING_REMQUO)
+        itype iq = convert_itype(q);
+        iq = clt ? (iq-(itype)1) : iq;
+        qaccN = (qacc << BITS) | iq;
+#endif
+        axN = BUILTIN_FLDEXP_F32(axN, X_GT_Y_BITS);
+        nbN -= X_GT_Y_BITS;
+
+        itype cond = (nb > X_GT_Y_BITS);
+        nb = cond ? nbN : nb;
+        ax = cond ? axN : ax;
+#if defined(COMPILING_REMQUO)
+        qacc = cond ? qaccN : qacc;
+#endif
+    }
+
+    ax = BUILTIN_FLDEXP_F32(ax, (nb - X_GT_Y_BITS + (itype)1));
+
+    // Final iteration
+    {
+        vtype q = BUILTIN_RINT_F32(ax * ayinv);
+        ax = fnma(q, ay, ax);
+        itype clt = (ax < (vtype)0.0f);
+        ax = clt ? (ax + ay) : ax;
+#if !defined(COMPILING_FMOD)
+        itype iq = convert_itype(q);
+        iq = clt ? (iq-(itype)1) : iq;
+#if defined(COMPILING_REMQUO)
+        qacc = (qacc << (nb+(itype)1)) | iq;
+#else
+        qacc = iq;
+#endif
+#endif
+    }
+
+#if !defined(COMPILING_FMOD)
+    // Adjust ax so that it is the range (-y/2, y/2]
+    // We need to choose the even integer when x/y is midway between two integer
+    itype qacc_is_odd = SV_ODD32(qacc);
+    itype aq = ((2.0f*ax > ay) SV_OR ((qacc_is_odd) SV_AND ((2.0f*ax) == ay)));
+    ax = ax - (aq ? ay : (vtype)0.0f);
+#if defined(COMPILING_REMQUO)
+    qacc += (aq ? (itype)1 : (itype)0);
+    itype qneg = (as_itype(x) ^ as_itype(y)) >> 31;
+    *q7 = ((qacc & (itype)0x7f) ^ qneg) - qneg;
+#endif
+#endif
+
+    ax = BUILTIN_FLDEXP_F32(ax, ey);
+    return as_vtype( ((as_itype(x) & (itype)SIGNBIT_SP32) ^ as_itype(ax)) );
+}
+
+#undef X_GT_Y_BITS
+#undef BITS
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+#if defined(COMPILING_REMQUO)
+OCML_ATTR vtype X_LT_Y(vtype x, vtype y, vtype ax, vtype ay, itype *q7o) {
+#else
+OCML_ATTR vtype X_LT_Y(vtype x, vtype y, vtype ax, vtype ay) {
+#endif
+    vtype ret = x;
+#if defined(COMPILING_REMQUO)
+    itype q7 = (itype)0;
+#endif
+
+#if !defined(COMPILING_FMOD)
+    itype c = ((ay < (vtype)0x1.0p+127f) & (2.0f*ax > ay)) | (ax > 0.5f*ay);
+
+    itype qsgn = (itype)1 + (((as_itype(x) ^ as_itype(y)) >> 31) << 1);
+    vtype t = MATH_MAD(y, convert_vtype(-qsgn), x);
+    ret = c ? t : ret;
+#if defined(COMPILING_REMQUO)
+    q7 = c ? qsgn : q7;
+#endif
+#endif
+
+    ret = (ax == ay) ? copysign((vtype)0.0f, x) : ret;
+#if defined(COMPILING_REMQUO)
+    q7 = (ax == ay) ? qsgn : q7;
+    *q7o = q7;
+#endif
+    return ret;
+}
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+
+#if defined(COMPILING_FMOD)
+CONSTATTR vtype
+MATH_MANGLE(fmod)(vtype x, vtype y)
+#elif defined(COMPILING_REMQUO)
+vtype
+MATH_MANGLE(remquo)(vtype x, vtype y, ADDRSPACE itype *q7p)
+#else
+CONSTATTR vtype
+MATH_MANGLE(remainder)(vtype x, vtype y)
+#endif
+{
+
+    vtype ax = BUILTIN_ABS_F32(x);
+    vtype ay = BUILTIN_ABS_F32(y);
+    vtype ret;
+
+#if defined(COMPILING_REMQUO)
+    itype q7, q7_gt, q7_lt;
+    vtype x_gt = X_GT_Y(x, y, ax, ay, &q7_gt);
+    vtype x_lt = X_LT_Y(x, y, ax, ay, &q7_lt);
+    q7 = (ax > ay) ? q7_gt : q7_lt;
+#else
+    vtype x_gt = X_GT_Y(x, y, ax, ay);
+    vtype x_lt = X_LT_Y(x, y, ax, ay);
+#endif
+    ret = (ax > ay) ? x_gt : x_lt;
+
+    itype c = (isnan(y) SV_OR isinf(x) SV_OR isnan(x) SV_OR (y == (vtype)0.0f));
+    ret = c ? as_vtype((utype)QNANBITPATT_SP32) : ret;
+
+#if defined(COMPILING_REMQUO)
+    q7 = c ? (itype)0 : q7;
+    *q7p = q7;
+#endif
+
+    return ret;
+}
+
+#undef APPEND_AS
diff --git a/lib/kernel/libclc/remainder_base_fp64.cl b/lib/kernel/libclc/remainder_base_fp64.cl
new file mode 100644
index 0000000..cf84054
--- /dev/null
+++ b/lib/kernel/libclc/remainder_base_fp64.cl
@@ -0,0 +1,188 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#include "ocml_helpers.h"
+
+// How many bits of the quotient per iteration
+#define X_GT_Y_BITS (itype)26
+#define BITS 12
+
+#define APPEND_AS(x, y) APPEND_AS2(x, y)
+#define APPEND_AS2(x, y) x ## y
+
+#if defined(COMPILING_REMQUO)
+#define X_GT_Y APPEND_AS(x_gt_y_remquo_, ADDRSPACE)
+#define X_LT_Y APPEND_AS(x_lt_y_remquo_, ADDRSPACE)
+#elif defined(COMPILING_FMOD)
+#define X_GT_Y x_gt_y_fmod
+#define X_LT_Y x_lt_y_fmod
+#else //remainder
+#define X_GT_Y x_gt_y_remainder
+#define X_LT_Y x_lt_y_remainder
+#endif
+
+#if defined(COMPILING_REMQUO)
+OCML_ATTR vtype X_GT_Y(vtype x, vtype y, vtype ax, vtype ay, itype *q7) {
+#else
+OCML_ATTR vtype X_GT_Y(vtype x, vtype y, vtype ax, vtype ay) {
+#endif
+    itype ex, ey;
+
+    ex = BUILTIN_FREXP_EXP_F64(ax) - (itype)1;
+    ax = BUILTIN_FLDEXP_F64(BUILTIN_FREXP_MANT_F64(ax), convert_inttype(X_GT_Y_BITS));
+    ey = BUILTIN_FREXP_EXP_F64(ay) - (itype)1;
+    ay = BUILTIN_FLDEXP_F64(BUILTIN_FREXP_MANT_F64(ay), 1);
+    vtype axN = ax;
+
+    itype nb = ex - ey;
+    itype nbN = nb;
+    vtype ayinv = MATH_RCP(ay);
+
+#if !defined(COMPILING_FMOD)
+    itype qacc = 0;
+#endif
+#if defined(COMPILING_REMQUO)
+    itype qaccN = qacc;
+#endif
+
+    while (SV_ANY(nb > X_GT_Y_BITS)) {
+        vtype q = BUILTIN_RINT_F64(ax * ayinv);
+        axN = fnma(q, ay, ax);
+        itype clt = (axN < (vtype)0.0);
+        axN = clt ? (axN + ay) : axN;
+#if defined(COMPILING_REMQUO)
+        itype iq = convert_itype(q);
+        iq = clt ? (iq-1) : iq;
+        qaccN = (qacc << BITS) | iq;
+#endif
+        axN = BUILTIN_FLDEXP_F64(axN, convert_inttype(X_GT_Y_BITS));
+        nbN -= X_GT_Y_BITS;
+
+        itype cond = (nb > X_GT_Y_BITS);
+        nb = cond ? nbN : nb;
+        ax = cond ? axN : ax;
+#if defined(COMPILING_REMQUO)
+        qacc = cond ? qaccN : qacc;
+#endif
+    }
+
+    ax = BUILTIN_FLDEXP_F64(ax, convert_inttype(nb - X_GT_Y_BITS + (itype)1));
+
+    // Final iteration
+    {
+        vtype q = BUILTIN_RINT_F64(ax * ayinv);
+        ax = fnma(q, ay, ax);
+        itype clt = (ax < (vtype)0.0);
+        ax = clt ? (ax + ay) : ax;
+#if !defined(COMPILING_FMOD)
+        itype iq = convert_itype(q);
+        iq = clt ? (iq-1) : iq;
+#if defined(COMPILING_REMQUO)
+        qacc = (qacc << (nb+(itype)1)) | iq;
+#else
+        qacc = iq;
+#endif
+#endif
+    }
+
+#if !defined(COMPILING_FMOD)
+    // Adjust ax so that it is the range (-y/2, y/2]
+    // We need to choose the even integer when x/y is midway between two itypeegers
+    itype qacc_is_odd = SV_ODD64(qacc);
+    itype aq = ((2.0*ax > ay) SV_OR ((qacc_is_odd) SV_AND ((2.0*ax) == ay)));
+    ax = ax - (aq ? ay : (vtype)0.0f);
+#if defined(COMPILING_REMQUO)
+    qacc += (aq ? (itype)1 : (itype)0);
+    itype qneg = (as_itype(x) ^ as_itype(y)) >> 63;
+    *q7 = ((qacc & (itype)0x7f) ^ qneg) - qneg;
+#endif
+#endif
+
+    ax = BUILTIN_FLDEXP_F64(ax, convert_inttype(ey));
+    return as_vtype( ((as_itype(x) & (itype)SIGNBIT_DP64) ^ as_itype(ax)) );
+}
+
+#undef X_GT_Y_BITS
+#undef BITS
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+#if defined(COMPILING_REMQUO)
+OCML_ATTR vtype X_LT_Y(vtype x, vtype y, vtype ax, vtype ay, itype *q7o) {
+#else
+OCML_ATTR vtype X_LT_Y(vtype x, vtype y, vtype ax, vtype ay) {
+#endif
+    vtype ret = x;
+#if defined(COMPILING_REMQUO)
+    itype q7 = (itype)0;
+#endif
+
+#if !defined(COMPILING_FMOD)
+    itype c = ((ay < (vtype)0x1.0p+1023) & (2.0*ax > ay)) | (ax > 0.5*ay);
+
+    itype qsgn = (itype)1 + (((as_itype(x) ^ as_itype(y)) >> 63) << 1);
+    vtype t = MATH_MAD(y, convert_vtype(-qsgn), x);
+    ret = c ? t : ret;
+#if defined(COMPILING_REMQUO)
+    q7 = c ? qsgn : q7;
+#endif
+#endif
+
+    ret = (ax == ay) ? copysign((vtype)0.0, x) : ret;
+#if defined(COMPILING_REMQUO)
+    q7 = (ax == ay) ? qsgn : q7;
+    *q7o = q7;
+#endif
+    return ret;
+}
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+
+#if defined(COMPILING_FMOD)
+CONSTATTR vtype
+MATH_MANGLE(fmod)(vtype x, vtype y)
+#elif defined(COMPILING_REMQUO)
+vtype
+MATH_MANGLE(remquo)(vtype x, vtype y, ADDRSPACE inttype *q7p)
+#else
+CONSTATTR vtype
+MATH_MANGLE(remainder)(vtype x, vtype y)
+#endif
+{
+
+    vtype ax = BUILTIN_ABS_F64(x);
+    vtype ay = BUILTIN_ABS_F64(y);
+    vtype ret;
+
+#if defined(COMPILING_REMQUO)
+    itype q7, q7_gt, q7_lt;
+    vtype x_gt = X_GT_Y(x, y, ax, ay, &q7_gt);
+    vtype x_lt = X_LT_Y(x, y, ax, ay, &q7_lt);
+    q7 = (ax > ay) ? q7_gt : q7_lt;
+#else
+    vtype x_gt = X_GT_Y(x, y, ax, ay);
+    vtype x_lt = X_LT_Y(x, y, ax, ay);
+#endif
+    ret = (ax > ay) ? x_gt : x_lt;
+
+    itype c = (isnan(y) SV_OR isinf(x) SV_OR isnan(x) SV_OR (y == (vtype)0.0));
+    ret = c ? as_vtype((utype)QNANBITPATT_DP64) : ret;
+
+#if defined(COMPILING_REMQUO)
+    q7 = c ? (itype)0 : q7;
+    *q7p = convert_inttype(q7);
+#endif
+
+    return ret;
+}
+
+#undef APPEND_AS
diff --git a/lib/kernel/libclc/remainder_fp32.cl b/lib/kernel/libclc/remainder_fp32.cl
new file mode 100644
index 0000000..521aded
--- /dev/null
+++ b/lib/kernel/libclc/remainder_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_REMAINDER
+
+#include "remainder_base_fp32.cl"
+
+#undef COMPILING_REMAINDER
diff --git a/lib/kernel/libclc/remainder_fp64.cl b/lib/kernel/libclc/remainder_fp64.cl
new file mode 100644
index 0000000..49c3faf
--- /dev/null
+++ b/lib/kernel/libclc/remainder_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_REMAINDER
+
+#include "remainder_base_fp64.cl"
+
+#undef COMPILING_REMAINDER
diff --git a/lib/kernel/libclc/remquo_fp32.cl b/lib/kernel/libclc/remquo_fp32.cl
new file mode 100644
index 0000000..f620f6e
--- /dev/null
+++ b/lib/kernel/libclc/remquo_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_REMQUO
+
+#include "remainder_base_fp32.cl"
+
+#undef COMPILING_REMQUO
diff --git a/lib/kernel/libclc/remquo_fp64.cl b/lib/kernel/libclc/remquo_fp64.cl
new file mode 100644
index 0000000..bf6b7fc
--- /dev/null
+++ b/lib/kernel/libclc/remquo_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_REMQUO
+
+#include "remainder_base_fp64.cl"
+
+#undef COMPILING_REMQUO
diff --git a/lib/kernel/libclc/rootn_fp32.cl b/lib/kernel/libclc/rootn_fp32.cl
new file mode 100644
index 0000000..7865692
--- /dev/null
+++ b/lib/kernel/libclc/rootn_fp32.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_ROOTN
+
+#include "pow_base_fp32.cl"
+
+#undef COMPILING_ROOTN
diff --git a/lib/kernel/libclc/rootn_fp64.cl b/lib/kernel/libclc/rootn_fp64.cl
new file mode 100644
index 0000000..2effc36
--- /dev/null
+++ b/lib/kernel/libclc/rootn_fp64.cl
@@ -0,0 +1,12 @@
+/*===--------------------------------------------------------------------------
+ *                   ROCm Device Libraries
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See ROCM_LICENSE.TXT for details.
+ *===------------------------------------------------------------------------*/
+
+#define COMPILING_ROOTN
+
+#include "pow_base_fp64.cl"
+
+#undef COMPILING_ROOTN
diff --git a/lib/kernel/libclc/sin_fp32.cl b/lib/kernel/libclc/sin_fp32.cl
new file mode 100644
index 0000000..59c1562
--- /dev/null
+++ b/lib/kernel/libclc/sin_fp32.cl
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype sin(vtype x)
+{
+    itype ix = as_itype(x);
+    itype ax = ix & (itype)EXSIGNBIT_SP32;
+    vtype dx = as_vtype(ax);
+
+    vtype r0, r1;
+    itype regn = __pocl_argReductionS(&r0, &r1, dx);
+
+    vtype ss = __pocl_sinf_piby4(r0, r1);
+    vtype cc = __pocl_cosf_piby4(r0, r1);
+
+    vtype s = (regn << 31) ? cc : ss;
+    itype mask = ((regn > (itype)1) ? (itype)SIGNBIT_SP32 : (itype)0);
+    s = as_vtype(as_itype(s) ^ mask ^ (ix ^ ax));
+
+    s = (ax >= (itype)PINFBITPATT_SP32) ? as_vtype((utype)QNANBITPATT_SP32) : s;
+
+    //Subnormals
+    s = (x == 0.0f) ? x : s;
+
+    return s;
+}
diff --git a/lib/kernel/libclc/sin_fp64.cl b/lib/kernel/libclc/sin_fp64.cl
new file mode 100644
index 0000000..e976b41
--- /dev/null
+++ b/lib/kernel/libclc/sin_fp64.cl
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype sin(vtype x) {
+    vtype y = fabs(x);
+
+    vtype r, rr, r2, rr2;
+    itype regn, regn2;
+
+    __pocl_remainder_piby2_medium(y, &r, &rr, &regn);
+    itype cond = (y >= (vtype)0x1.0p+47);
+    if (SV_ANY(cond)) {
+        __pocl_remainder_piby2_large(y, &r2, &rr2, &regn2);
+        regn = cond ? regn2 : regn;
+        r = cond ? r2 : r;
+        rr = cond ? rr2 : rr;
+    }
+    v2type sc = __pocl_sincos_piby4(r, rr);
+
+    itype ss = as_itype(sc.lo);
+    itype cc = as_itype(sc.hi);
+    itype s = (regn << 63) ? cc : ss;
+    s ^= ((regn >> 1) << 63);
+    s ^= (as_itype(x) & (itype)SIGNBIT_DP64);
+
+    // denorms
+    vtype res = (y < (vtype)DBL_MIN) ? x : as_vtype(s);
+
+    return (isnan(x) | isinf(x)) ? as_vtype((utype)QNANBITPATT_DP64) : res;
+}
diff --git a/lib/kernel/libclc/sincos_fp32.cl b/lib/kernel/libclc/sincos_fp32.cl
new file mode 100644
index 0000000..da568f2
--- /dev/null
+++ b/lib/kernel/libclc/sincos_fp32.cl
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype sincos(vtype x, ADDRSPACE vtype *cosval)
+{
+    itype ix = as_itype(x);
+    itype ax = ix & (itype)0x7fffffff;
+    vtype dx = as_vtype(ax);
+
+    vtype r0, r1;
+    itype regn = __pocl_argReductionS(&r0, &r1, dx);
+
+    vtype ss = __pocl_sinf_piby4(r0, r1);
+    vtype cc = __pocl_cosf_piby4(r0, r1);
+    vtype ss2 = as_vtype(as_itype(ss) ^ (itype)SIGNBIT_SP32);
+
+    itype cond = (regn << 31);
+    vtype s = cond ? cc : ss;
+    vtype c = cond ? ss2 : cc;
+
+    itype t = ((regn >> 1) << 31);
+    c = as_vtype(as_itype(c) ^ t);
+
+    itype mask = ((regn > (itype)1) ? (itype)SIGNBIT_SP32 : (itype)0);
+    s = as_vtype(as_itype(s) ^ mask ^ (ix ^ ax));
+
+    itype cond2 = (ax >= (itype)PINFBITPATT_SP32);
+    c = cond2 ? as_vtype((utype)QNANBITPATT_SP32) : c;
+    s = cond2 ? as_vtype((utype)QNANBITPATT_SP32) : s;
+
+    itype infcond = (ax >= (itype)PINFBITPATT_SP32);
+    s = infcond ? as_vtype((utype)QNANBITPATT_SP32) : s;
+    c = infcond ? as_vtype((utype)QNANBITPATT_SP32) : c;
+
+    //Subnormals
+    s = (x == 0.0f) ? x : s;
+    c = (x == 0.0f) ? (vtype)1.0f : c;
+
+    *cosval = c;
+    return s;
+}
diff --git a/lib/kernel/libclc/sincos_fp64.cl b/lib/kernel/libclc/sincos_fp64.cl
new file mode 100644
index 0000000..a4fbdf3
--- /dev/null
+++ b/lib/kernel/libclc/sincos_fp64.cl
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype sincos(vtype x, ADDRSPACE vtype *cosval)
+{
+    vtype y = fabs(x);
+
+    vtype r, rr, r2, rr2;
+    itype regn, regn2;
+
+    __pocl_remainder_piby2_medium(y, &r, &rr, &regn);
+    itype cond = (y >= (vtype)0x1.0p+47);
+    if (SV_ANY(cond)) {
+        __pocl_remainder_piby2_large(y, &r2, &rr2, &regn2);
+        regn = cond ? regn2 : regn;
+        r = cond ? r2 : r;
+        rr = cond ? rr2 : rr;
+    }
+    v2type sc = __pocl_sincos_piby4(r, rr);
+
+    cond = (regn << 63);
+
+    vtype ss = sc.lo;
+    vtype cc = sc.hi;
+
+    itype s = cond ? as_itype(cc) : as_itype(ss);
+
+    ss = -sc.lo;
+    itype c = cond ? as_itype(ss) : as_itype(cc);
+
+    itype sgn = ((regn >> 1) << 63);
+    s ^= sgn;
+    c ^= sgn;
+    s ^= (as_itype(x) & (itype)SIGNBIT_DP64);
+
+    itype nancond = (isinf(x) | isnan(x));
+    c = nancond ? (itype)(QNANBITPATT_DP64) : c;
+    s = nancond ? (itype)(QNANBITPATT_DP64) : s;
+
+    *cosval = as_vtype(c);
+    return as_vtype(s);
+}
diff --git a/lib/kernel/libclc/sincos_helpers_fp32.cl b/lib/kernel/libclc/sincos_helpers_fp32.cl
new file mode 100644
index 0000000..aaab2a7
--- /dev/null
+++ b/lib/kernel/libclc/sincos_helpers_fp32.cl
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#define bitalign(hi, lo, shift) \
+  ((hi) << ((itype)32 - (shift))) | ((lo) >> (shift));
+
+_CL_OVERLOADABLE void __pocl_fullMulS(vtype *hi, vtype *lo, vtype a, vtype b, vtype bh, vtype bt)
+{
+    if (HAVE_FMA32) {
+        vtype ph = a * b;
+        *hi = ph;
+        *lo = fma(a, b, -ph);
+    } else {
+        vtype ah = as_vtype(as_utype(a) & (utype)0xfffff000U);
+        vtype at = a - ah;
+        vtype ph = a * b;
+        vtype pt = pocl_fma(at, bt, pocl_fma(at, bh, pocl_fma(ah, bt, pocl_fma(ah, bh, -ph))));
+        *hi = ph;
+        *lo = pt;
+    }
+}
+
+_CL_OVERLOADABLE vtype __pocl_removePi2S(vtype *hi, vtype *lo, vtype x)
+{
+    // 72 bits of pi/2
+    const vtype fpiby2_1 = (vtype)( 0xC90FDA / 0x1.0p+23f);
+    const vtype fpiby2_1_h = (vtype)( 0xC90 / 0x1.0p+11f);
+    const vtype fpiby2_1_t = (vtype)( 0xFDA / 0x1.0p+23f);
+
+    const vtype fpiby2_2 = (vtype)( 0xA22168 / 0x1.0p+47f);
+    const vtype fpiby2_2_h = (vtype)( 0xA22 / 0x1.0p+35f);
+    const vtype fpiby2_2_t = (vtype)( 0x168 / 0x1.0p+47f);
+
+    const vtype fpiby2_3 = (vtype)( 0xC234C4 / 0x1.0p+71f);
+    const vtype fpiby2_3_h = (vtype)( 0xC23 / 0x1.0p+59f);
+    const vtype fpiby2_3_t = (vtype)( 0x4C4 / 0x1.0p+71f);
+
+    const vtype twobypi = (vtype)0x1.45f306p-1f;
+
+    vtype fnpi2 = trunc(pocl_fma(x, twobypi, (vtype)0.5f));
+
+    // subtract n * pi/2 from x
+    vtype rhead, rtail;
+    __pocl_fullMulS(&rhead, &rtail, fnpi2, fpiby2_1, fpiby2_1_h, fpiby2_1_t);
+    vtype v = x - rhead;
+    vtype rem = v + (((x - v) - rhead) - rtail);
+
+    vtype rhead2, rtail2;
+    __pocl_fullMulS(&rhead2, &rtail2, fnpi2, fpiby2_2, fpiby2_2_h, fpiby2_2_t);
+    v = rem - rhead2;
+    rem = v + (((rem - v) - rhead2) - rtail2);
+
+    vtype rhead3, rtail3;
+    __pocl_fullMulS(&rhead3, &rtail3, fnpi2, fpiby2_3, fpiby2_3_h, fpiby2_3_t);
+    v = rem - rhead3;
+
+    *hi = v + ((rem - v) - rhead3);
+    *lo = -rtail3;
+    return fnpi2;
+}
+
+_CL_OVERLOADABLE itype __pocl_argReductionSmallS(vtype *r, vtype *rr, vtype x)
+{
+    vtype fnpi2 = __pocl_removePi2S(r, rr, x);
+    return convert_itype(fnpi2) & (itype)0x3;
+}
+
+#define FULL_MUL(A, B, HI, LO) \
+    LO = A * B; \
+    HI = mul_hi(A, B)
+
+#define FULL_MAD(A, B, C, HI, LO) \
+    LO = ((A) * (B) + (C)); \
+    HI = mul_hi(A, B); \
+    HI += ((LO < C) ? (utype)1 : (utype)0)
+
+#ifdef SINGLEVEC
+#define SHIFT_MINUS_32 shift -= c << 5
+#else
+#define SHIFT_MINUS_32 shift -= c & (itype)32
+#endif
+
+_CL_OVERLOADABLE itype __pocl_argReductionLargeS(vtype *r, vtype *rr, vtype x)
+{
+    itype xe = (itype)(as_itype(x) >> 23) - (itype)127;
+    utype xm = (utype)0x00800000U | (as_utype(x) & (utype)0x7fffffU);
+
+    // 224 bits of 2/PI: . A2F9836E 4E441529 FC2757D1 F534DDC0 DB629599 3C439041 FE5163AB
+    const utype b6 = (utype)0xA2F9836EU;
+    const utype b5 = (utype)0x4E441529U;
+    const utype b4 = (utype)0xFC2757D1U;
+    const utype b3 = (utype)0xF534DDC0U;
+    const utype b2 = (utype)0xDB629599U;
+    const utype b1 = (utype)0x3C439041U;
+    const utype b0 = (utype)0xFE5163ABU;
+
+    utype p0, p1, p2, p3, p4, p5, p6, p7, c0, c1;
+
+    FULL_MUL(xm, b0, c0, p0);
+    FULL_MAD(xm, b1, c0, c1, p1);
+    FULL_MAD(xm, b2, c1, c0, p2);
+    FULL_MAD(xm, b3, c0, c1, p3);
+    FULL_MAD(xm, b4, c1, c0, p4);
+    FULL_MAD(xm, b5, c0, c1, p5);
+    FULL_MAD(xm, b6, c1, p7, p6);
+
+    itype fbits = (itype)224 + (itype)23 - xe;
+
+    // shift amount to get 2 lsb of integer part at top 2 bits
+    //   min: 25 (xe=18) max: 134 (xe=127)
+    itype shift = (itype)254 - fbits;
+
+    // Shift by up to 134/32 = 4 words
+    itype c = (shift > 31);
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    p3 = c ? p2 : p3;
+    p2 = c ? p1 : p2;
+    p1 = c ? p0 : p1;
+    SHIFT_MINUS_32;
+
+    c = (shift > 31);
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    p3 = c ? p2 : p3;
+    p2 = c ? p1 : p2;
+    SHIFT_MINUS_32;
+
+    c = (shift > 31);
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    p3 = c ? p2 : p3;
+    SHIFT_MINUS_32;
+
+    c = (shift > 31);
+    p7 = c ? p6 : p7;
+    p6 = c ? p5 : p6;
+    p5 = c ? p4 : p5;
+    p4 = c ? p3 : p4;
+    SHIFT_MINUS_32;
+
+    // bitalign cannot handle a shift of 32
+    c = (shift > 0);
+    shift = (itype)32 - shift;
+    utype t7 = bitalign(p7, p6, shift);
+    utype t6 = bitalign(p6, p5, shift);
+    utype t5 = bitalign(p5, p4, shift);
+    p7 = c ? t7 : p7;
+    p6 = c ? t6 : p6;
+    p5 = c ? t5 : p5;
+
+    // Get 2 lsb of itype part and msb of fraction
+    itype i = as_itype(p7 >> 29);
+
+    // Scoot up 2 more bits so only fraction remains
+    p7 = bitalign(p7, p6, 30);
+    p6 = bitalign(p6, p5, 30);
+    p5 = bitalign(p5, p4, 30);
+
+    // Subtract 1 if msb of fraction is 1, i.e. fraction >= 0.5
+    utype flip = (i << 31) ? (utype)0xffffffffU : (utype)0U;
+    utype sign = (i << 31) ? (utype)0x80000000U : (utype)0U;
+    p7 = p7 ^ flip;
+    p6 = p6 ^ flip;
+    p5 = p5 ^ flip;
+
+    // Find exponent and shift away leading zeroes and hidden bit
+    xe = as_itype(clz(p7)) + (itype)1;
+    shift = (itype)32 - xe;
+    p7 = bitalign(p7, p6, shift);
+    p6 = bitalign(p6, p5, shift);
+
+    // Most significant part of fraction
+    vtype q1 = as_vtype(as_itype(sign) | (((itype)127 - xe) << 23) | as_itype(p7 >> 9));
+
+    // Shift out bits we captured on q1
+    p7 = bitalign(p7, p6, 32-23);
+
+    // Get 24 more bits of fraction in another vtype, there are not long strings of zeroes here
+    itype xxe = as_itype(clz(p7)) + (itype)1;
+    p7 = bitalign(p7, p6, (itype)32 - xxe);
+    vtype q0 = as_vtype(as_itype(sign) | (((itype)127 - (xe + (itype)23 + xxe)) << 23) | as_itype(p7 >> 9));
+
+    // At this point, the fraction q1 + q0 is correct to at least 48 bits
+    // Now we need to multiply the fraction by pi/2
+    // This loses us about 4 bits
+    // pi/2 = C90 FDA A22 168 C23 4C4
+
+    const vtype pio2h = (vtype)(0xc90fda / 0x1.0p+23f);
+    const vtype pio2hh = (vtype)(0xc90 / 0x1.0p+11f);
+    const vtype pio2ht = (vtype)(0xfda / 0x1.0p+23f);
+    const vtype pio2t = (vtype)(0xa22168 / 0x1.0p+47f);
+
+    vtype rh, rt;
+
+    if (HAVE_FMA32) {
+        rh = q1 * pio2h;
+        rt = pocl_fma(q0, pio2h,
+               pocl_fma(q1, pio2t,
+                 pocl_fma(q1, pio2h, -rh)));
+    } else {
+        vtype q1h = as_vtype(as_utype(q1) & (utype)0xfffff000);
+        vtype q1t = q1 - q1h;
+        rh = q1 * pio2h;
+        rt = pocl_fma(q1t, pio2ht,
+               pocl_fma(q1t, pio2hh,
+                 pocl_fma(q1h, pio2ht, pocl_fma(q1h, pio2hh, -rh))));
+        rt = pocl_fma(q0, pio2h, pocl_fma(q1, pio2t, rt));
+    }
+
+    vtype t = rh + rt;
+    rt = rt - (t - rh);
+
+    *r = t;
+    *rr = rt;
+    return ((i >> 1) + (i & (itype)1)) & (itype)0x3;
+}
+
+#undef SHIFT_MINUS_32
+
+_CL_OVERLOADABLE itype __pocl_argReductionS(vtype *r, vtype *rr, vtype x)
+{
+    itype retval = __pocl_argReductionSmallS(r, rr, x);
+    itype cond = (x >= (vtype)0x1.0p+23f);
+    if (SV_ANY(cond)) {
+        retval = __pocl_argReductionLargeS(r, rr, x);
+    }
+    return retval;
+}
+
+
+
+// Evaluate single precisions in and cos of value in interval [-pi/4, pi/4]
+_CL_OVERLOADABLE v2type __pocl_sincosf_piby4(vtype x)
+{
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    // = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    // = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+
+    const vtype sc1 = (vtype)-0.166666666638608441788607926e0F;
+    const vtype sc2 = (vtype)0.833333187633086262120839299e-2F;
+    const vtype sc3 = (vtype)-0.198400874359527693921333720e-3F;
+    const vtype sc4 = (vtype)0.272500015145584081596826911e-5F;
+
+    const vtype cc1 = (vtype)0.41666666664325175238031e-1F;
+    const vtype cc2 = (vtype)-0.13888887673175665567647e-2F;
+    const vtype cc3 = (vtype)0.24800600878112441958053e-4F;
+    const vtype cc4 = (vtype)-0.27301013343179832472841e-6F;
+
+    vtype x2 = x * x;
+
+    v2type ret;
+    ret.lo = pocl_fma(x*x2,
+               pocl_fma(x2,
+                 pocl_fma(x2,
+                   pocl_fma(x2, sc4, sc3),
+                   sc2),
+                 sc1),
+               x);
+    ret.hi = pocl_fma(x2*x2,
+               pocl_fma(x2,
+                 pocl_fma(x2,
+                   pocl_fma(x2, cc4, cc3),
+                   cc2),
+                 cc1),
+                 pocl_fma(x2, (vtype)(-0.5f), (vtype)1.0f));
+    return ret;
+}
+
+
+_CL_OVERLOADABLE vtype __pocl_cosf_piby4(vtype x, vtype y) {
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    // = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+
+    const vtype c1 = (vtype)0.416666666e-1f;
+    const vtype c2 = (vtype)-0.138888876e-2f;
+    const vtype c3 = (vtype)0.248006008e-4f;
+    const vtype c4 = (vtype)-0.2730101334e-6f;
+    const vtype c5 = (vtype)2.0875723372e-09f;  // 0x310f74f6
+    const vtype c6 = (vtype)-1.1359647598e-11f; // 0xad47d74e
+
+    vtype z = x * x;
+    vtype r = z * pocl_fma(z,
+                    pocl_fma(z,
+                      pocl_fma(z,
+                        pocl_fma(z,
+                          pocl_fma(z, c6,  c5),
+                          c4),
+                        c3),
+                      c2),
+                    c1);
+
+    // if |x| < 0.3
+    vtype qx = (vtype)0.0f;
+
+    itype ix = as_itype(x) & (itype)EXSIGNBIT_SP32;
+
+    //  0.78125 > |x| >= 0.3
+    vtype xby4 = as_vtype(ix - (itype)0x01000000);
+    qx = ((ix >= (itype)0x3e99999a) & (ix <= (itype)0x3f480000)) ? xby4 : qx;
+
+    // x > 0.78125
+    qx = (ix > (itype)0x3f480000) ? (vtype)0.28125f : qx;
+
+    vtype hz = pocl_fma(z, (vtype)0.5f, -qx);
+    vtype a = (vtype)1.0f - qx;
+    vtype ret = a - (hz - pocl_fma(z, r, -x*y));
+    return ret;
+}
+
+
+_CL_OVERLOADABLE vtype __pocl_sinf_piby4(vtype x, vtype y) {
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    // = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    // = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+
+    const vtype c1 = (vtype)-0.1666666666e0f;
+    const vtype c2 = (vtype)0.8333331876e-2f;
+    const vtype c3 = (vtype)-0.198400874e-3f;
+    const vtype c4 = (vtype)0.272500015e-5f;
+    const vtype c5 = (vtype)-2.5050759689e-08f; // 0xb2d72f34
+    const vtype c6 = (vtype)1.5896910177e-10f;  // 0x2f2ec9d3
+
+    vtype z = x * x;
+    vtype v = z * x;
+    vtype r = pocl_fma(z,
+                pocl_fma(z,
+                  pocl_fma(z,
+                    pocl_fma(z, c6, c5),
+                    c4),
+                  c3),
+                c2);
+    vtype ret = x - pocl_fma(v, -c1,
+                      pocl_fma(z,
+                        pocl_fma(y, (vtype)0.5f, -v*r), -y));
+
+    return ret;
+}
diff --git a/lib/kernel/libclc/sincos_helpers_fp32.h b/lib/kernel/libclc/sincos_helpers_fp32.h
new file mode 100644
index 0000000..7f538fa
--- /dev/null
+++ b/lib/kernel/libclc/sincos_helpers_fp32.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE v2type __polc__sincosf_piby4(vtype x);
+_CL_OVERLOADABLE vtype __pocl_sinf_piby4(vtype x, vtype y);
+_CL_OVERLOADABLE vtype __pocl_cosf_piby4(vtype x, vtype y);
+_CL_OVERLOADABLE itype __pocl_argReductionS(vtype *r, vtype *rr, vtype x);
+_CL_OVERLOADABLE v2type __pocl_sincosf_piby4(vtype x);
diff --git a/lib/kernel/libclc/sincos_helpers_fp64.cl b/lib/kernel/libclc/sincos_helpers_fp64.cl
new file mode 100644
index 0000000..33b9480
--- /dev/null
+++ b/lib/kernel/libclc/sincos_helpers_fp64.cl
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+#define bytealign(src0, src1, src2) \
+  ((( ((convert_utype(src0)) << 32) | convert_utype(src1)) >> (((src2) & 3)*8)))
+
+// Reduction for medium sized arguments
+_CL_OVERLOADABLE void __pocl_remainder_piby2_medium(vtype x, vtype *r, vtype *rr, itype *regn) {
+
+    // How many pi/2 is x a multiple of?
+    const vtype two_by_pi = (vtype)0x1.45f306dc9c883p-1;
+    const vtype dnpi2 = trunc(pocl_fma(x, two_by_pi, (vtype)0.5));
+
+    const vtype piby2_h = (vtype)(-7074237752028440.0 / 0x1.0p+52);
+    const vtype piby2_m = (vtype)(-2483878800010755.0 / 0x1.0p+105);
+    const vtype piby2_t = (vtype)(-3956492004828932.0 / 0x1.0p+158);
+
+    // Compute product of npi2 with 159 bits of 2/pi
+    vtype p_hh = piby2_h * dnpi2;
+    vtype p_ht = pocl_fma(piby2_h, dnpi2, -p_hh);
+    vtype p_mh = piby2_m * dnpi2;
+    vtype p_mt = pocl_fma(piby2_m, dnpi2, -p_mh);
+    vtype p_th = piby2_t * dnpi2;
+    vtype p_tt = pocl_fma(piby2_t, dnpi2, -p_th);
+
+    // Reduce to 159 bits
+    vtype ph = p_hh;
+    vtype pm = p_ht + p_mh;
+    vtype t = p_mh - (pm - p_ht);
+    vtype pt = p_th + t + p_mt + p_tt;
+    t = ph + pm; pm = pm - (t - ph); ph = t;
+    t = pm + pt; pt = pt - (t - pm); pm = t;
+
+    // Subtract from x
+    t = x + ph;
+    vtype qh = t + pm;
+    vtype qt = pm - (qh - t) + pt;
+
+    *r = qh;
+    *rr = qt;
+    *regn = convert_itype(dnpi2) & (itype)0x3;
+
+}
+
+// Given positive argument x, reduce it to the range [-pi/4,pi/4] using
+// extra precision, and return the result in r, rr.
+// Return value "regn" tells how many lots of pi/2 were subtracted
+// from x to put it in the range [-pi/4,pi/4], mod 4.
+_CL_OVERLOADABLE void __pocl_remainder_piby2_large(vtype x, vtype *r, vtype *rr, itype *regn) {
+
+    itype ux = as_itype(x);
+    itype e = (ux >> 52) - (itype)1023;
+    itype i = max((itype)23, (e >> 3) + (itype)17);
+    itype j = (itype)150 - i;
+    itype j16 = j & (itype)(~0xf);
+    vtype fract_temp;
+
+    // The following extracts 192 consecutive bits of 2/pi aligned on an arbitrary byte boundary
+    uinttype j16i = convert_uinttype(j16);
+    utype4 q0 = USE_VTABLE(pibits_tbl, j16i);
+    utype4 q1 = USE_VTABLE(pibits_tbl, (j16i + (uinttype)16));
+    utype4 q2 = USE_VTABLE(pibits_tbl, (j16i + (uinttype)32));
+
+    itype k = (j >> 2) & (itype)0x3;
+    itype4 c;
+    c.s0 = convert_inttype(k == (itype)0);
+    c.s1 = convert_inttype(k == (itype)1);
+    c.s2 = convert_inttype(k == (itype)2);
+    c.s3 = convert_inttype(k == (itype)3);
+
+    uinttype u0, u1, u2, u3, u4, u5, u6;
+
+    u0 = c.s1 ? q0.s1 : q0.s0;
+    u0 = c.s2 ? q0.s2 : u0;
+    u0 = c.s3 ? q0.s3 : u0;
+
+    u1 = c.s1 ? q0.s2 : q0.s1;
+    u1 = c.s2 ? q0.s3 : u1;
+    u1 = c.s3 ? q1.s0 : u1;
+
+    u2 = c.s1 ? q0.s3 : q0.s2;
+    u2 = c.s2 ? q1.s0 : u2;
+    u2 = c.s3 ? q1.s1 : u2;
+
+    u3 = c.s1 ? q1.s0 : q0.s3;
+    u3 = c.s2 ? q1.s1 : u3;
+    u3 = c.s3 ? q1.s2 : u3;
+
+    u4 = c.s1 ? q1.s1 : q1.s0;
+    u4 = c.s2 ? q1.s2 : u4;
+    u4 = c.s3 ? q1.s3 : u4;
+
+    u5 = c.s1 ? q1.s2 : q1.s1;
+    u5 = c.s2 ? q1.s3 : u5;
+    u5 = c.s3 ? q2.s0 : u5;
+
+    u6 = c.s1 ? q1.s3 : q1.s2;
+    u6 = c.s2 ? q2.s0 : u6;
+    u6 = c.s3 ? q2.s1 : u6;
+
+    const utype lomask = (utype)(0xffffffff);
+    const utype himask = lomask << 32;
+    const utype himask2 = (utype)0xffff00000000UL;
+
+    utype v0 = bytealign(u1, u0, j) & lomask;
+    utype v1 = bytealign(u2, u1, j) & lomask;
+    utype v2 = bytealign(u3, u2, j) & lomask;
+    utype v3 = bytealign(u4, u3, j) & lomask;
+    utype v4 = bytealign(u5, u4, j) & lomask;
+    utype v5 = bytealign(u6, u5, j) & lomask;
+    utype v1hi = v1 << 32;
+    utype v2hi = v2 << 32;
+    utype v4hi = v4 << 32;
+    utype v5hi = v5 << 32;
+
+    // Place those 192 bits in 4 48-bit vtypes along with correct exponent
+    // If i > 1018 we would get subnormals so we scale p up and x down to get the same product
+    i = (itype)2 + 8*i;
+    x *= (i > (itype)1018) ? (vtype)0x1.0p-136 : (vtype)1.0;
+    i -= (i > (itype)1018) ? (itype)136 : (itype)0;
+
+    utype ua = as_utype(1023 + 52 - i) << 52;
+    vtype a = as_vtype(ua);
+    utype addi3 = (utype)0x0300000000000000U;
+    vtype p0 = as_vtype(v0 | (ua | (v1hi & himask2))  ) - a;
+    ua += addi3;
+    a = as_vtype(ua & himask);
+    vtype p1 = as_vtype( ((v2 << 16) | (v1 >> 16))
+                         | ((ua | (v2hi >> 16)) & himask) ) - a;
+    ua += addi3;
+    a = as_vtype(ua & himask);
+    vtype p2 = as_vtype(v3 | ((ua | (v4hi & himask2))) ) - a;
+    ua += addi3;
+    a = as_vtype(ua & himask);
+    vtype p3 = as_vtype( ((v5 << 16) | (v4 >> 16))
+                         | ((ua | (v5hi >> 16)) & himask) ) - a;
+
+    // Exact multiply
+    vtype f0h = p0 * x;
+    vtype f0l = pocl_fma(p0, x, (vtype)-f0h);
+    vtype f1h = p1 * x;
+    vtype f1l = pocl_fma(p1, x, (vtype)-f1h);
+    vtype f2h = p2 * x;
+    vtype f2l = pocl_fma(p2, x, (vtype)-f2h);
+    vtype f3h = p3 * x;
+    vtype f3l = pocl_fma(p3, x, (vtype)-f3h);
+
+    // Accumulate product into 4 vtypes
+    vtype s, t;
+
+    vtype f3 = f3h + f2h;
+    t = f2h - (f3 - f3h);
+    s = f3l + t;
+    t = t - (s - f3l);
+
+    vtype f2 = s + f1h;
+    t = f1h - (f2 - s) + t;
+    s = f2l + t;
+    t = t - (s - f2l);
+
+    vtype f1 = s + f0h;
+    t = f0h - (f1 - s) + t;
+    s = f1l + t;
+
+    vtype f0 = s + f0l;
+
+    // Strip off unwanted large integer bits
+    f3 = (vtype)0x1.0p+10 * fract((f3 * 0x1.0p-10), &fract_temp);
+    f3 += ((f3 + f2) < (vtype)0.0) ? (vtype)0x1.0p+10 : (vtype)0.0;
+
+    // Compute least significant integer bits
+    t = f3 + f2;
+    vtype di = t - fract(t, &fract_temp);
+    i = convert_itype(di);
+
+    // Shift out remaining integer part
+    f3 -= di;
+    s = f3 + f2; t = f2 - (s - f3); f3 = s; f2 = t;
+    s = f2 + f1; t = f1 - (s - f2); f2 = s; f1 = t;
+    f1 += f0;
+
+    // Subtract 1 if fraction is >= 0.5, and update regn
+#ifdef SINGLEVEC
+    itype g = (f3 >= (vtype)0.5);
+    i += g;
+#else
+    utype g = (as_utype(f3 >= (vtype)0.5) >> 63);
+    i += as_itype(g);
+#endif
+    f3 -= convert_vtype(g);
+
+    // Shift up bits
+    s = f3 + f2; t = f2 -(s - f3); f3 = s; f2 = t + f1;
+
+    // Multiply precise fraction by pi/2 to get radians
+    const vtype p2h = (vtype)(7074237752028440.0 / 0x1.0p+52);
+    const vtype p2t = (vtype)(4967757600021510.0 / 0x1.0p+106);
+
+    vtype rhi = f3 * p2h;
+    vtype rlo = pocl_fma(f2, p2h, pocl_fma(f3, p2t, pocl_fma(f3, p2h, -rhi)));
+
+    *r = rhi + rlo;
+    *rr = rlo - (*r - rhi);
+    *regn = i & (itype)0x3;
+}
+
+
+
+_CL_OVERLOADABLE v2type __pocl_sincos_piby4(vtype x, vtype xx) {
+    // Taylor series for sin(x) is x - x^3/3! + x^5/5! - x^7/7! ...
+    //                      = x * (1 - x^2/3! + x^4/5! - x^6/7! ...
+    //                      = x * f(w)
+    // where w = x*x and f(w) = (1 - w/3! + w^2/5! - w^3/7! ...
+    // We use a minimax approximation of (f(w) - 1) / w
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we add a correction
+    // term g(x,xx) = (1-x*x/2)*xx to the result, where g(x,xx)
+    // is an approximation to cos(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    // Taylor series for cos(x) is 1 - x^2/2! + x^4/4! - x^6/6! ...
+    //                      = f(w)
+    // where w = x*x and f(w) = (1 - w/2! + w^2/4! - w^3/6! ...
+    // We use a minimax approximation of (f(w) - 1 + w/2) / (w*w)
+    // because this produces an expansion in even powers of x.
+    // If xx (the tail of x) is non-zero, we subtract a correction
+    // term g(x,xx) = x*xx to the result, where g(x,xx)
+    // is an approximation to sin(x)*sin(xx) valid because
+    // xx is tiny relative to x.
+
+    const vtype sc1 = (vtype)-0.166666666666666646259241729;
+    const vtype sc2 = (vtype)0.833333333333095043065222816e-2;
+    const vtype sc3 = (vtype)-0.19841269836761125688538679e-3;
+    const vtype sc4 = (vtype)0.275573161037288022676895908448e-5;
+    const vtype sc5 = (vtype)-0.25051132068021699772257377197e-7;
+    const vtype sc6 = (vtype)0.159181443044859136852668200e-9;
+
+    const vtype cc1 = (vtype)0.41666666666666665390037e-1;
+    const vtype cc2 = (vtype)-0.13888888888887398280412e-2;
+    const vtype cc3 = (vtype)0.248015872987670414957399e-4;
+    const vtype cc4 = (vtype)-0.275573172723441909470836e-6;
+    const vtype cc5 = (vtype)0.208761463822329611076335e-8;
+    const vtype cc6 = (vtype)-0.113826398067944859590880e-10;
+
+    vtype x2 = x * x;
+    vtype x3 = x2 * x;
+    vtype r = 0.5 * x2;
+    vtype t = (vtype)1.0 - r;
+
+    vtype sp = pocl_fma(
+                 pocl_fma(
+                   pocl_fma(
+                     pocl_fma(sc6, x2, sc5),
+                     x2, sc4),
+                   x2, sc3),
+                 x2, sc2);
+
+    vtype cp = t + pocl_fma(
+                     pocl_fma(
+                       pocl_fma(
+                         pocl_fma(
+                           pocl_fma(
+                             pocl_fma(cc6, x2, cc5),
+                             x2, cc4),
+                           x2, cc3),
+                         x2, cc2),
+                       x2, cc1),
+                     x2*x2,
+                     pocl_fma(x, xx, ((vtype)1.0 - t) - r));
+
+    v2type ret;
+    ret.lo = x - pocl_fma(-x3, sc1,
+                   pocl_fma(
+                     pocl_fma(-x3, sp, 0.5*xx),
+                   x2,
+                 -xx));
+    ret.hi = cp;
+
+    return ret;
+}
diff --git a/lib/kernel/libclc/sincos_helpers_fp64.h b/lib/kernel/libclc/sincos_helpers_fp64.h
new file mode 100644
index 0000000..cd289b2
--- /dev/null
+++ b/lib/kernel/libclc/sincos_helpers_fp64.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE void __pocl_remainder_piby2_medium(vtype x, vtype *r, vtype *rr, itype *regn);
+_CL_OVERLOADABLE void __pocl_remainder_piby2_large(vtype x, vtype *r, vtype *rr, itype *regn);
+_CL_OVERLOADABLE v2type __pocl_sincos_piby4(vtype x, vtype xx);
+
+#endif
diff --git a/lib/CL/pocl_mem_management.h b/lib/kernel/libclc/singlevec.h
similarity index 61%
copy from lib/CL/pocl_mem_management.h
copy to lib/kernel/libclc/singlevec.h
index c69e65b..ac6245f 100644
--- a/lib/CL/pocl_mem_management.h
+++ b/lib/kernel/libclc/singlevec.h
@@ -1,17 +1,17 @@
-/* pocl_cl.h - local runtime library declarations.
+/* OpenCL built-in library: singlevec.h
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2014 Ville Korhonen
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,26 +21,32 @@
    THE SOFTWARE.
 */
 
-#include "pocl_cl.h"
-
-#ifdef __GNUC__
-#pragma GCC visibility push(hidden)
-#endif
-
-void pocl_init_mem_manager (void);
-
-cl_event pocl_mem_manager_new_event (void);
+/* These macros help dealing with differences between true-false representation
+ * between clang extended vectors (-1, 0) and scalar expressions (1, 0). */
 
-void pocl_mem_manager_free_event (cl_event event);
+#undef SV_ANY
+#undef SV_NOT
+#undef SV_AND
+#undef SV_OR
+#undef SV_ODD32
+#undef SV_ODD64
 
-_cl_command_node* pocl_mem_manager_new_command (void);
+#ifdef SINGLEVEC
 
-void pocl_mem_manager_free_command (_cl_command_node *cmd_ptr);
+#define SV_ANY(BOOL) (BOOL)
+#define SV_NOT(x) (!(x))
+#define SV_OR ||
+#define SV_AND &&
+#define SV_ODD32(x) (x & 1)
+#define SV_ODD64(x) (x & 1)
 
-event_node* pocl_mem_manager_new_event_node ();
+#else
 
-void pocl_mem_manager_free_event_node (event_node *ed);
+#define SV_ANY(BOOL) (any(BOOL))
+#define SV_NOT(x) (~(x))
+#define SV_OR |
+#define SV_AND &
+#define SV_ODD32(x) (x << 31)
+#define SV_ODD64(x) (x << 63)
 
-#ifdef __GNUC__
-#pragma GCC visibility pop
 #endif
diff --git a/lib/kernel/libclc/sinh_fp32.cl b/lib/kernel/libclc/sinh_fp32.cl
new file mode 100644
index 0000000..5a50f5e
--- /dev/null
+++ b/lib/kernel/libclc/sinh_fp32.cl
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype sinh(vtype x)
+{
+    // After dealing with special cases the computation is split into regions as follows.
+    // abs(x) >= max_sinh_arg:
+    // sinh(x) = sign(x)*Inf
+    // abs(x) >= small_threshold:
+    // sinh(x) = sign(x)*exp(abs(x))/2 computed using the splitexp and scaleDouble functions as for exp_amd().
+    // abs(x) < small_threshold:
+    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
+    // sinh(x) is then sign(x)*z.
+
+    const vtype max_sinh_arg = (vtype)0x1.65a9fap+6f;
+    const vtype small_threshold = (vtype)0x1.0a2b24p+3f;
+
+    utype ux = as_utype(x);
+    utype aux = ux & (utype)EXSIGNBIT_SP32;
+    utype xs = ux ^ aux;
+    vtype y = as_vtype(aux);
+
+    // We find the integer part y0 of y and the increment dy = y - y0. We then compute
+    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are tabulated above.
+
+    vtype indv = trunc(y);
+    utype indi = convert_utype(indv);
+    indi = (indi > (utype)36) ? (utype)0 : indi;
+
+    vtype dy = y - indv;
+    vtype dy2 = dy * dy;
+
+    vtype sdy = pocl_fma(dy2,
+                  pocl_fma(dy2,
+                    pocl_fma(dy2,
+                      pocl_fma(dy2,
+                        pocl_fma(dy2,
+                          pocl_fma(dy2,
+                     (vtype)0.7746188980094184251527126e-12f,
+                     (vtype)0.160576793121939886190847e-9f),
+                   (vtype)0.250521176994133472333666e-7f),
+                 (vtype)0.275573191913636406057211e-5f),
+               (vtype)0.198412698413242405162014e-3f),
+             (vtype)0.833333333333329931873097e-2f),
+           (vtype)0.166666666666666667013899e0f);
+    sdy = pocl_fma(sdy, dy*dy2, dy);
+
+    vtype cdy = pocl_fma(dy2,
+                  pocl_fma(dy2,
+                    pocl_fma(dy2,
+                      pocl_fma(dy2,
+                        pocl_fma(dy2,
+                          pocl_fma(dy2,
+                     (vtype)0.1163921388172173692062032e-10f,
+                     (vtype)0.208744349831471353536305e-8f),
+                   (vtype)0.275573350756016588011357e-6f),
+                 (vtype)0.248015872460622433115785e-4f),
+               (vtype)0.138888888889814854814536e-2f),
+             (vtype)0.416666666666660876512776e-1f),
+           (vtype)0.500000000000000005911074e0f);
+    cdy = pocl_fma(cdy, dy2, (vtype)1.0f);
+
+    v2type tv = USE_VTABLE(sinhcosh_tbl, indi);
+    vtype z = pocl_fma(tv.hi, sdy, tv.lo * cdy);
+    z = as_vtype(xs | as_utype(z));
+
+    // When y is large enough so that the negative exponential is negligible,
+    // so sinh(y) is approximated by sign(x)*exp(y)/2.
+    vtype t = exp(y - (vtype)0x1.62e500p-1f);
+    vtype zsmall = pocl_fma((vtype)0x1.a0210ep-18f, t, t);
+    zsmall = as_vtype(xs | as_utype(zsmall));
+    z = (y >= small_threshold) ? zsmall : z;
+
+    // Corner cases
+    vtype zinf = as_vtype((utype)PINFBITPATT_SP32 | xs);
+    z = (y >= max_sinh_arg) ? zinf : z;
+    z = as_vtype(
+          as_utype(aux > (utype)PINFBITPATT_SP32)
+          | as_utype((aux < (utype)0x38800000U) ? x : z) );
+
+    return z;
+}
diff --git a/lib/kernel/libclc/sinh_fp64.cl b/lib/kernel/libclc/sinh_fp64.cl
new file mode 100644
index 0000000..19af426
--- /dev/null
+++ b/lib/kernel/libclc/sinh_fp64.cl
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype sinh(vtype x)
+{
+    // After dealing with special cases the computation is split into
+    // regions as follows:
+    //
+    // abs(x) >= max_sinh_arg:
+    // sinh(x) = sign(x)*Inf
+    //
+    // abs(x) >= small_threshold:
+    // sinh(x) = sign(x)*exp(abs(x))/2 computed using the
+    // splitexp and scaleDouble functions as for exp_amd().
+    //
+    // abs(x) < small_threshold:
+    // compute p = exp(y) - 1 and then z = 0.5*(p+(p/(p+1.0)))
+    // sinh(x) is then sign(x)*z.
+
+    const vtype max_sinh_arg = 7.10475860073943977113e+02; // 0x408633ce8fb9f87e
+
+    // This is where exp(-x) is insignificant compared to exp(x) = ln(2^27)
+    const vtype small_threshold = 0x1.2b708872320e2p+4;
+
+    vtype y = fabs(x);
+
+    // In this range we find the integer part y0 of y
+    // and the increment dy = y - y0. We then compute
+    // z = sinh(y) = sinh(y0)cosh(dy) + cosh(y0)sinh(dy)
+    // where sinh(y0) and cosh(y0) are obtained from tables
+
+    vtype indv = trunc(y);
+    itype indi = convert_itype(indv);
+    indi = min((itype)indi, (itype)36U);
+
+    vtype dy = y - indv;
+    vtype dy2 = dy * dy;
+
+    vtype sdy = dy * dy2 *
+            pocl_fma(dy2,
+              pocl_fma(dy2,
+                pocl_fma(dy2,
+                  pocl_fma(dy2,
+                    pocl_fma(dy2,
+                      pocl_fma(dy2,
+                 (vtype)0.7746188980094184251527126e-12,
+                 (vtype)0.160576793121939886190847e-9),
+               (vtype)0.250521176994133472333666e-7),
+             (vtype)0.275573191913636406057211e-5),
+           (vtype)0.198412698413242405162014e-3),
+         (vtype)0.833333333333329931873097e-2),
+       (vtype)0.166666666666666667013899e0);
+
+    vtype cdy = dy2 *
+            pocl_fma(dy2,
+              pocl_fma(dy2,
+                pocl_fma(dy2,
+                  pocl_fma(dy2,
+                    pocl_fma(dy2,
+                      pocl_fma(dy2,
+                 (vtype)0.1163921388172173692062032e-10,
+                 (vtype)0.208744349831471353536305e-8),
+               (vtype)0.275573350756016588011357e-6),
+             (vtype)0.248015872460622433115785e-4),
+           (vtype)0.138888888889814854814536e-2),
+         (vtype)0.416666666666660876512776e-1),
+       (vtype)0.500000000000000005911074e0);
+
+    // At this point sinh(dy) is approximated by dy + sdy.
+    // Shift some significant bits from dy to sdy.
+    vtype sdy1 = as_vtype(as_utype(dy) & 0xfffffffff8000000UL);
+    vtype sdy2 = sdy + (dy - sdy1);
+
+    v2type tv = USE_VTABLE(cosh_tbl, convert_uinttype(indi));
+    vtype cl = tv.lo;
+    vtype ct = tv.hi;
+
+    tv = USE_VTABLE(sinh_tbl, convert_uinttype(indi));
+    vtype sl = tv.lo;
+    vtype st = tv.hi;
+
+
+    vtype z = pocl_fma(cl, sdy1,
+                pocl_fma(sl, cdy,
+                  pocl_fma(cl, sdy2,
+                    pocl_fma(ct, sdy1,
+                      pocl_fma(st, cdy, ct*sdy2)) + st))) + sl;
+
+    // Other cases
+    z = (y < 0x1.0p-28) | isnan(x) | isinf(x) ? y : z;
+
+    vtype t = exp(y - 0x1.62e42fefa3800p-1);
+    t = pocl_fma(t, (vtype)-0x1.ef35793c76641p-45, t);
+    z = (y >= small_threshold) ? t : z;
+    z = (y >= max_sinh_arg) ? as_vtype((utype)PINFBITPATT_DP64) : z;
+
+    return copysign(z, x);
+}
diff --git a/lib/kernel/libclc/sinpi_fp32.cl b/lib/kernel/libclc/sinpi_fp32.cl
new file mode 100644
index 0000000..a44c857
--- /dev/null
+++ b/lib/kernel/libclc/sinpi_fp32.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype sinpi(vtype x)
+{
+    itype ix = as_itype(x);
+    itype xsgn = ix & (itype)SIGNBIT_SP32;
+    ix ^= xsgn;
+    vtype ax = as_vtype(ix);
+    vtype iaxv = trunc(ax);
+    itype iaxi = convert_itype(iaxv);
+    vtype r = ax - iaxv;
+    itype xodd = xsgn ^ ((iaxi & (itype)0x1) << 31);
+
+    // Initialize with return for +-Inf and NaN
+    itype ir = (itype)QNANBITPATT_SP32;
+
+    // 2^23 <= |x| < Inf, the result is always integer
+    ir = (ix < (itype)(EXPBITS_SP32)) ? xsgn : ir;
+
+    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+    // r < 1.0
+    vtype a = ((vtype)1.0f - r);
+    itype e = (itype)0;
+
+    // r <= 0.75
+    itype c = (r <= (vtype)0.75f);
+    a = c ? (r - (vtype)0.5f) : a;
+    e = c ? (itype)(-1) : e;
+
+    // r < 0.5
+    c = (r < (vtype)0.5f);
+    a = c ? ((vtype)0.5f - r) : a;
+
+    // r <= 0.25
+    c = (r <= 0.25f);
+    a = c ? r : a;
+    e = c ? (itype)0 : e;
+
+    v2type t = __pocl_sincosf_piby4(a * M_PI_F);
+    itype jr = xodd ^ as_itype(e ? t.hi : t.lo);
+
+    ir = (ix < (itype)0x4b000000) ? jr : ir;
+
+    return as_vtype(ir);
+}
diff --git a/lib/kernel/libclc/sinpi_fp64.cl b/lib/kernel/libclc/sinpi_fp64.cl
new file mode 100644
index 0000000..8dc960a
--- /dev/null
+++ b/lib/kernel/libclc/sinpi_fp64.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype sinpi(vtype x)
+{
+    itype ix = as_itype(x);
+    itype xsgn = ix & (itype)SIGNBIT_DP64;
+    ix ^= xsgn;
+    vtype ax = as_vtype(ix);
+    vtype iaxv = trunc(ax);
+    itype iaxi = convert_itype(iaxv);
+    vtype r = ax - iaxv;
+    itype xodd = xsgn ^ ((iaxi & (itype)1) << 63);
+
+    // Initialize with return for +-Inf and NaN
+    itype ir = (itype)QNANBITPATT_DP64;
+
+    // 2^23 <= |x| < Inf, the result is always integer
+    ir = (ix < (itype)EXPBITS_DP64) ? xsgn : ir;
+
+    // 0x1.0p-7 <= |x| < 2^23, result depends on which 0.25 interval
+
+    // r < 1.0
+    vtype a = (vtype)1.0 - r;
+    itype e = (itype)0;
+
+    // r <= 0.75
+    itype c = (r <= (vtype)0.75);
+    a = c ? (r - (vtype)0.5) : a;
+    e = c ? (itype)(-1) : e;
+
+    // r < 0.5
+    c = (r < (vtype)0.5);
+    a = c ? ((vtype)0.5 - r) : a;
+
+    // r <= 0.25
+    c = (r <= 0.25);
+    a = c ? r : a;
+    e = c ? (itype)0 : e;
+
+    v2type sc = __pocl_sincos_piby4(a * M_PI, (vtype)0.0);
+    itype jr = xodd ^ as_itype(e ? sc.hi : sc.lo);
+
+    ir = (ax < (vtype)0x1.0p+52) ? jr : ir;
+
+    return as_vtype(ir);
+}
diff --git a/lib/kernel/libclc/tan_fp32.cl b/lib/kernel/libclc/tan_fp32.cl
new file mode 100644
index 0000000..653d55a
--- /dev/null
+++ b/lib/kernel/libclc/tan_fp32.cl
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+_CL_OVERLOADABLE vtype tan(vtype x) {
+  vtype sinx, cosx;
+  sinx = sincos(x, &cosx);
+  return sinx / cosx;
+}
diff --git a/lib/kernel/libclc/tan_fp64.cl b/lib/kernel/libclc/tan_fp64.cl
new file mode 100644
index 0000000..606dd81
--- /dev/null
+++ b/lib/kernel/libclc/tan_fp64.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype tan(vtype x) {
+
+    /*
+    * TODO this does not work, for some reason
+    *
+    *  vtype sinx, cosx;
+    *  sinx = sincos(x, (private vtype *)&cosx);
+    *  return sinx / cosx;
+    */
+    vtype y = fabs(x);
+
+    vtype r, rr, r2, rr2;
+    itype regn, regn2;
+
+    __pocl_remainder_piby2_medium(y, &r, &rr, &regn);
+    itype cond = (y >= (vtype)0x1.0p+47);
+    if (SV_ANY(cond)) {
+        __pocl_remainder_piby2_large(y, &r2, &rr2, &regn2);
+        regn = cond ? regn2 : regn;
+        r = cond ? r2 : r;
+        rr = cond ? rr2 : rr;
+    }
+    v2type sc = __pocl_sincos_piby4(r, rr);
+
+    cond = (regn << 63);
+
+    vtype ss = sc.lo;
+    vtype cc = sc.hi;
+
+    itype s = cond ? as_itype(cc) : as_itype(ss);
+
+    ss = -sc.lo;
+    itype c = cond ? as_itype(ss) : as_itype(cc);
+
+    itype sgn = ((regn >> 1) << 63);
+    s ^= sgn;
+    c ^= sgn;
+    s ^= (as_itype(x) & (itype)SIGNBIT_DP64);
+
+    vtype ret = as_vtype(s) / as_vtype(c);
+    vtype nans = as_vtype( (as_utype(x) & (utype)SIGNBIT_DP64)
+    | ((utype)QNANBITPATT_DP64) );
+
+    return (isnan(x) | isinf(x)) ? nans : ret;
+}
diff --git a/lib/kernel/libclc/tanh_fp32.cl b/lib/kernel/libclc/tanh_fp32.cl
new file mode 100644
index 0000000..d6578f4
--- /dev/null
+++ b/lib/kernel/libclc/tanh_fp32.cl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+_CL_OVERLOADABLE vtype tanh(vtype x)
+{
+    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+    // to the following three formulae:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // but computationally, some formulae are better on some ranges.
+
+#ifdef MAX_PRECISION
+    // 1.29 ULP
+    const vtype large_threshold1 = (vtype)0x1.039346p+3;  //8.1117277
+    const vtype retval_threshold1 = (vtype)0x1.fffffap-1;
+    const vtype large_threshold2 = (vtype)0x1.0a101p+3;  // 8.3144608
+    const vtype retval_threshold2 = (vtype)0x1.fffffcp-1;
+    const vtype large_threshold3 = (vtype)0x1.15273p+3;  // 8.661034
+    const vtype retval_threshold3 = (vtype)0x1.fffffep-1;
+#else
+    // 3.0 ULP
+    const vtype large_threshold1 = (vtype)0x1.0a2b24p+3f;
+    const vtype retval_threshold1 = (vtype)0x1.fffffep-1;
+#endif
+    utype ux = as_utype(x);
+    utype aux = ux & (utype)EXSIGNBIT_SP32;
+    utype xs = ux ^ aux;
+
+    vtype y = as_vtype(aux);
+    vtype y2 = y*y;
+
+    vtype a1 = pocl_fma(y2,
+                   pocl_fma(y2,
+                     (vtype)0.4891631088530669873e-4f,
+                     (vtype)-0.14628356048797849e-2f),
+                   (vtype)-0.28192806108402678f);
+    vtype b1 = pocl_fma(y2,
+                 (vtype)0.3427017942262751343f,
+                 (vtype)0.845784192581041099f);
+
+    vtype a2 = pocl_fma(y2,
+                   pocl_fma(y2,
+                     (vtype)0.3827534993599483396e-4f,
+                     (vtype)-0.12325644183611929e-2f),
+                   (vtype)-0.24069858695196524f);
+    vtype b2 = pocl_fma(y2,
+                 (vtype)0.292529068698052819f,
+                 (vtype)0.72209738473684982f);
+    itype c = (y < (vtype)0.9f);
+    vtype a = c ? a1 : a2;
+    vtype b = c ? b1 : b2;
+    vtype zlo = pocl_fma(MATH_DIVIDE(a, b), y*y2, y);
+
+    vtype p = exp(2.0f * y) + (vtype)1.0f;
+    vtype zhi = (vtype)1.0F - MATH_DIVIDE((vtype)2.0F, p);
+
+    vtype z = (y <= (vtype)1.0f) ? zlo : zhi;
+
+    // Edge cases
+    z = (y > large_threshold1) ? retval_threshold1 : z;
+#ifdef MAX_PRECISION
+    z = (y > large_threshold2) ? retval_threshold2 : z;
+    z = (y > large_threshold3) ? retval_threshold3 : z;
+#endif
+    z = as_vtype(xs | as_utype(z));
+
+    return z;
+}
diff --git a/lib/kernel/libclc/tanh_fp64.cl b/lib/kernel/libclc/tanh_fp64.cl
new file mode 100644
index 0000000..4d37aae
--- /dev/null
+++ b/lib/kernel/libclc/tanh_fp64.cl
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+_CL_OVERLOADABLE vtype tanh(vtype x)
+{
+    // The definition of tanh(x) is sinh(x)/cosh(x), which is also equivalent
+    // to the following three formulae:
+    // 1.  (exp(x) - exp(-x))/(exp(x) + exp(-x))
+    // 2.  (1 - (2/(exp(2*x) + 1 )))
+    // 3.  (exp(2*x) - 1)/(exp(2*x) + 1)
+    // but computationally, some formulae are better on some ranges.
+
+    // The point at which e^-x is insignificant compared to e^x = ln(2^27)
+    const vtype large_threshold = (vtype)0x1.2b708872320e2p+4;
+
+    utype ux = as_utype(x);
+    utype ax = ux & (utype)EXSIGNBIT_DP64;
+    utype sx = ux ^ ax;
+    vtype y = as_vtype(ax);
+    vtype y2 = y * y;
+
+    // y < 0.9
+    vtype znl = pocl_fma(y2,
+                     pocl_fma(y2,
+                       pocl_fma(y2,
+                         (vtype)-0.142077926378834722618091e-7,
+                         (vtype)-0.200047621071909498730453e-3),
+                       (vtype)-0.176016349003044679402273e-1),
+                     (vtype)-0.274030424656179760118928e0);
+
+    vtype zdl = pocl_fma(y2,
+                     pocl_fma(y2,
+                       pocl_fma(y2,
+                         (vtype)0.2091140262529164482568557e-3,
+                         (vtype)0.201562166026937652780575e-1),
+                       (vtype)0.381641414288328849317962e0),
+                     (vtype)0.822091273968539282568011e0);
+
+    // 0.9 <= y <= 1
+    vtype znm = pocl_fma(y2,
+                     pocl_fma(y2,
+                         pocl_fma(y2,
+                         (vtype)-0.115475878996143396378318e-7,
+                         (vtype)-0.165597043903549960486816e-3),
+                       (vtype)-0.146173047288731678404066e-1),
+                     (vtype)-0.227793870659088295252442e0);
+
+    vtype zdm = pocl_fma(y2,
+                     pocl_fma(y2,
+                       pocl_fma(y2,
+                         (vtype)0.173076050126225961768710e-3,
+                         (vtype)0.167358775461896562588695e-1),
+                       (vtype)0.317204558977294374244770e0),
+                     (vtype)0.683381611977295894959554e0);
+
+    itype c = (y < (vtype)0.9);
+    vtype zn = c ? znl : znm;
+    vtype zd = c ? zdl : zdm;
+    vtype z = y + y*y2 * MATH_DIVIDE(zn, zd);
+
+    // y > 1
+    vtype p = exp(2.0 * y) + (vtype)1.0;
+    vtype zg = (vtype)1.0 - ((vtype)2.0 / p);
+
+    z = (y > (vtype)1.0) ? zg : z;
+
+    // Other cases
+    z = (y < (vtype)0x1.0p-28) ? x : z;
+    z = (ax > (utype)PINFBITPATT_DP64) ? x : z;
+
+    z = (y > large_threshold) ? (vtype)1.0 : z;
+
+    return as_vtype(sx | as_utype(z));
+}
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/libclc/tanpi_fp32.cl
similarity index 76%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/libclc/tanpi_fp32.cl
index 3c75ca1..1e85909 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/libclc/tanpi_fp32.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: tanpi_fp32.cl
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,12 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+_CL_OVERLOADABLE vtype tanpi(vtype x) {
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+  vtype sinpix = sinpi(x);
+  vtype cospix = cospi(x);
+  /* -0.0 -> 0.0 */
+  cospix = (as_itype(cospix) == (itype)SIGNBIT_SP32) ? (vtype)0.0f : cospix;
+  return (sinpix / cospix);
+}
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/libclc/tanpi_fp64.cl
similarity index 76%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/libclc/tanpi_fp64.cl
index 3c75ca1..14bd9d5 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/libclc/tanpi_fp64.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: tanpi_fp32.cl
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,12 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+_CL_OVERLOADABLE vtype tanpi(vtype x) {
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+  vtype sinpix = sinpi(x);
+  vtype cospix = cospi(x);
+  /* -0.0 -> 0.0 */
+  cospix = (as_itype(cospix) == (itype)SIGNBIT_DP64) ? (vtype)0.0f : cospix;
+  return (sinpix / cospix);
+}
diff --git a/lib/kernel/libclc/vtables.h b/lib/kernel/libclc/vtables.h
new file mode 100644
index 0000000..41bae72
--- /dev/null
+++ b/lib/kernel/libclc/vtables.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifndef __VTABLES_H__
+#define __VTABLES_H__
+
+#define VTABLE_SPACE static constant
+
+#define VTABLE_MANGLE(NAME) __pocl_v_##NAME
+
+#define DECLARE_VTABLE(TYPE,NAME,LENGTH) \
+    VTABLE_SPACE TYPE NAME [ LENGTH ]
+
+#define SS(x) x
+#define SGFY(x, y) x ## y
+
+#define VTABLE_FUNCTION_DECL(TYPE, NAME) \
+    _CL_OVERLOADABLE TYPE VTABLE_MANGLE(NAME)(uint idx); \
+    _CL_OVERLOADABLE SGFY(TYPE, 2) VTABLE_MANGLE(NAME)(uint2 idx); \
+    _CL_OVERLOADABLE SGFY(TYPE, 3) VTABLE_MANGLE(NAME)(uint3 idx); \
+    _CL_OVERLOADABLE SGFY(TYPE, 4) VTABLE_MANGLE(NAME)(uint4 idx); \
+    _CL_OVERLOADABLE SGFY(TYPE, 8) VTABLE_MANGLE(NAME)(uint8 idx); \
+    _CL_OVERLOADABLE SGFY(TYPE, 16) VTABLE_MANGLE(NAME)(uint16 idx);
+
+#define USE_VTABLE(NAME, IDX) \
+    VTABLE_MANGLE(NAME)(IDX)
+
+
+
+
+
+
+VTABLE_FUNCTION_DECL(v4uint, pibits_tbl);
+
+VTABLE_FUNCTION_DECL(float, log_inv_tbl);
+VTABLE_FUNCTION_DECL(float, exp_tbl);
+
+VTABLE_FUNCTION_DECL(v2float, loge_tbl);
+VTABLE_FUNCTION_DECL(v2float, log2_tbl);
+VTABLE_FUNCTION_DECL(v2float, sinhcosh_tbl);
+VTABLE_FUNCTION_DECL(v2float, cbrt_tbl);
+VTABLE_FUNCTION_DECL(v2float, exp_tbl_ep);
+
+
+
+
+#ifdef cl_khr_fp64
+
+VTABLE_FUNCTION_DECL(double, cbrt_inv_tbl);
+
+VTABLE_FUNCTION_DECL(v2double, ln_tbl);
+VTABLE_FUNCTION_DECL(v2double, atan_jby256_tbl);
+VTABLE_FUNCTION_DECL(v2double, two_to_jby64_ep_tbl);
+VTABLE_FUNCTION_DECL(v2double, sinh_tbl);
+VTABLE_FUNCTION_DECL(v2double, cosh_tbl);
+VTABLE_FUNCTION_DECL(v2double, cbrt_dbl_tbl);
+VTABLE_FUNCTION_DECL(v2double, cbrt_rem_tbl);
+
+#endif
+
+
+
+
+#endif // __VTABLES_H__
diff --git a/lib/kernel/libclc/vtables_fp32.cl b/lib/kernel/libclc/vtables_fp32.cl
new file mode 100644
index 0000000..89428fd
--- /dev/null
+++ b/lib/kernel/libclc/vtables_fp32.cl
@@ -0,0 +1,759 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+#include "misc.h"
+#include "vtables.h"
+#include "vtables_macros.h"
+
+DECLARE_VTABLE(float2, LOGE_TBL, 129) = {
+    (float2)(0x0.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.fe0000p-8f, 0x1.535882p-23f),
+    (float2)(0x1.fc0000p-7f, 0x1.5161f8p-20f),
+    (float2)(0x1.7b8000p-6f, 0x1.1b07d4p-18f),
+    (float2)(0x1.f82000p-6f, 0x1.361cf0p-19f),
+    (float2)(0x1.39e000p-5f, 0x1.0f73fcp-18f),
+    (float2)(0x1.774000p-5f, 0x1.63d8cap-19f),
+    (float2)(0x1.b42000p-5f, 0x1.bae232p-18f),
+    (float2)(0x1.f0a000p-5f, 0x1.86008ap-20f),
+    (float2)(0x1.164000p-4f, 0x1.36eea2p-16f),
+    (float2)(0x1.340000p-4f, 0x1.d7961ap-16f),
+    (float2)(0x1.51a000p-4f, 0x1.073f06p-16f),
+    (float2)(0x1.6f0000p-4f, 0x1.a515cap-17f),
+    (float2)(0x1.8c2000p-4f, 0x1.45d630p-16f),
+    (float2)(0x1.a92000p-4f, 0x1.b4e92ap-18f),
+    (float2)(0x1.c5e000p-4f, 0x1.523d6ep-18f),
+    (float2)(0x1.e26000p-4f, 0x1.076e2ap-16f),
+    (float2)(0x1.fec000p-4f, 0x1.2263b6p-17f),
+    (float2)(0x1.0d6000p-3f, 0x1.7e7cd0p-15f),
+    (float2)(0x1.1b6000p-3f, 0x1.2ad52ep-15f),
+    (float2)(0x1.294000p-3f, 0x1.52f81ep-15f),
+    (float2)(0x1.370000p-3f, 0x1.fc201ep-15f),
+    (float2)(0x1.44c000p-3f, 0x1.2b6ccap-15f),
+    (float2)(0x1.526000p-3f, 0x1.cbc742p-16f),
+    (float2)(0x1.5fe000p-3f, 0x1.3070a6p-15f),
+    (float2)(0x1.6d6000p-3f, 0x1.fce33ap-20f),
+    (float2)(0x1.7aa000p-3f, 0x1.890210p-15f),
+    (float2)(0x1.87e000p-3f, 0x1.a06520p-15f),
+    (float2)(0x1.952000p-3f, 0x1.6a73d0p-17f),
+    (float2)(0x1.a22000p-3f, 0x1.bc1fe2p-15f),
+    (float2)(0x1.af2000p-3f, 0x1.c94e80p-15f),
+    (float2)(0x1.bc2000p-3f, 0x1.0ce85ap-16f),
+    (float2)(0x1.c8e000p-3f, 0x1.f7c79ap-15f),
+    (float2)(0x1.d5c000p-3f, 0x1.0b5a7cp-18f),
+    (float2)(0x1.e26000p-3f, 0x1.076e2ap-15f),
+    (float2)(0x1.ef0000p-3f, 0x1.5b97b8p-16f),
+    (float2)(0x1.fb8000p-3f, 0x1.186d5ep-15f),
+    (float2)(0x1.040000p-2f, 0x1.2ca5a6p-17f),
+    (float2)(0x1.0a2000p-2f, 0x1.24e272p-14f),
+    (float2)(0x1.104000p-2f, 0x1.8bf9aep-14f),
+    (float2)(0x1.166000p-2f, 0x1.5cabaap-14f),
+    (float2)(0x1.1c8000p-2f, 0x1.3182d2p-15f),
+    (float2)(0x1.228000p-2f, 0x1.41fbcep-14f),
+    (float2)(0x1.288000p-2f, 0x1.5a13dep-14f),
+    (float2)(0x1.2e8000p-2f, 0x1.c575c2p-15f),
+    (float2)(0x1.346000p-2f, 0x1.dd9a98p-14f),
+    (float2)(0x1.3a6000p-2f, 0x1.3155a4p-16f),
+    (float2)(0x1.404000p-2f, 0x1.843434p-17f),
+    (float2)(0x1.460000p-2f, 0x1.8bc21cp-14f),
+    (float2)(0x1.4be000p-2f, 0x1.7e55dcp-16f),
+    (float2)(0x1.51a000p-2f, 0x1.5b0e5ap-15f),
+    (float2)(0x1.576000p-2f, 0x1.dc5d14p-16f),
+    (float2)(0x1.5d0000p-2f, 0x1.bdbf58p-14f),
+    (float2)(0x1.62c000p-2f, 0x1.05e572p-15f),
+    (float2)(0x1.686000p-2f, 0x1.903d36p-15f),
+    (float2)(0x1.6e0000p-2f, 0x1.1d5456p-15f),
+    (float2)(0x1.738000p-2f, 0x1.d7f6bap-14f),
+    (float2)(0x1.792000p-2f, 0x1.4abfbap-15f),
+    (float2)(0x1.7ea000p-2f, 0x1.f07704p-15f),
+    (float2)(0x1.842000p-2f, 0x1.a3b43cp-15f),
+    (float2)(0x1.89a000p-2f, 0x1.9c360ap-17f),
+    (float2)(0x1.8f0000p-2f, 0x1.1e8736p-14f),
+    (float2)(0x1.946000p-2f, 0x1.941c20p-14f),
+    (float2)(0x1.99c000p-2f, 0x1.958116p-14f),
+    (float2)(0x1.9f2000p-2f, 0x1.23ecbep-14f),
+    (float2)(0x1.a48000p-2f, 0x1.024396p-16f),
+    (float2)(0x1.a9c000p-2f, 0x1.d93534p-15f),
+    (float2)(0x1.af0000p-2f, 0x1.293246p-14f),
+    (float2)(0x1.b44000p-2f, 0x1.eef798p-15f),
+    (float2)(0x1.b98000p-2f, 0x1.625a4cp-16f),
+    (float2)(0x1.bea000p-2f, 0x1.4d9da6p-14f),
+    (float2)(0x1.c3c000p-2f, 0x1.d7a7ccp-14f),
+    (float2)(0x1.c8e000p-2f, 0x1.f7c79ap-14f),
+    (float2)(0x1.ce0000p-2f, 0x1.af0b84p-14f),
+    (float2)(0x1.d32000p-2f, 0x1.fcfc00p-15f),
+    (float2)(0x1.d82000p-2f, 0x1.e7258ap-14f),
+    (float2)(0x1.dd4000p-2f, 0x1.a81306p-16f),
+    (float2)(0x1.e24000p-2f, 0x1.1034f8p-15f),
+    (float2)(0x1.e74000p-2f, 0x1.09875ap-16f),
+    (float2)(0x1.ec2000p-2f, 0x1.99d246p-14f),
+    (float2)(0x1.f12000p-2f, 0x1.1ebf5ep-15f),
+    (float2)(0x1.f60000p-2f, 0x1.23fa70p-14f),
+    (float2)(0x1.fae000p-2f, 0x1.588f78p-14f),
+    (float2)(0x1.ffc000p-2f, 0x1.2e0856p-14f),
+    (float2)(0x1.024000p-1f, 0x1.52a5a4p-13f),
+    (float2)(0x1.04a000p-1f, 0x1.df9da8p-13f),
+    (float2)(0x1.072000p-1f, 0x1.f2e0e6p-16f),
+    (float2)(0x1.098000p-1f, 0x1.bd3d5cp-15f),
+    (float2)(0x1.0be000p-1f, 0x1.cb9094p-15f),
+    (float2)(0x1.0e4000p-1f, 0x1.261746p-15f),
+    (float2)(0x1.108000p-1f, 0x1.f39e2cp-13f),
+    (float2)(0x1.12e000p-1f, 0x1.719592p-13f),
+    (float2)(0x1.154000p-1f, 0x1.87a5e8p-14f),
+    (float2)(0x1.178000p-1f, 0x1.eabbd8p-13f),
+    (float2)(0x1.19e000p-1f, 0x1.cd68cep-14f),
+    (float2)(0x1.1c2000p-1f, 0x1.b81f70p-13f),
+    (float2)(0x1.1e8000p-1f, 0x1.7d79c0p-15f),
+    (float2)(0x1.20c000p-1f, 0x1.b9a324p-14f),
+    (float2)(0x1.230000p-1f, 0x1.30d7bep-13f),
+    (float2)(0x1.254000p-1f, 0x1.5bce98p-13f),
+    (float2)(0x1.278000p-1f, 0x1.5e1288p-13f),
+    (float2)(0x1.29c000p-1f, 0x1.37fec2p-13f),
+    (float2)(0x1.2c0000p-1f, 0x1.d3da88p-14f),
+    (float2)(0x1.2e4000p-1f, 0x1.d0db90p-15f),
+    (float2)(0x1.306000p-1f, 0x1.d7334ep-13f),
+    (float2)(0x1.32a000p-1f, 0x1.133912p-13f),
+    (float2)(0x1.34e000p-1f, 0x1.44ece6p-16f),
+    (float2)(0x1.370000p-1f, 0x1.17b546p-13f),
+    (float2)(0x1.392000p-1f, 0x1.e0d356p-13f),
+    (float2)(0x1.3b6000p-1f, 0x1.0893fep-14f),
+    (float2)(0x1.3d8000p-1f, 0x1.026a70p-13f),
+    (float2)(0x1.3fa000p-1f, 0x1.5b84d0p-13f),
+    (float2)(0x1.41c000p-1f, 0x1.8fe846p-13f),
+    (float2)(0x1.43e000p-1f, 0x1.9fe2f8p-13f),
+    (float2)(0x1.460000p-1f, 0x1.8bc21cp-13f),
+    (float2)(0x1.482000p-1f, 0x1.53d1eap-13f),
+    (float2)(0x1.4a4000p-1f, 0x1.f0bb60p-14f),
+    (float2)(0x1.4c6000p-1f, 0x1.e6bf32p-15f),
+    (float2)(0x1.4e6000p-1f, 0x1.d811b6p-13f),
+    (float2)(0x1.508000p-1f, 0x1.13cc00p-13f),
+    (float2)(0x1.52a000p-1f, 0x1.6932dep-16f),
+    (float2)(0x1.54a000p-1f, 0x1.246798p-13f),
+    (float2)(0x1.56a000p-1f, 0x1.f9d5b2p-13f),
+    (float2)(0x1.58c000p-1f, 0x1.5b6b9ap-14f),
+    (float2)(0x1.5ac000p-1f, 0x1.404c34p-13f),
+    (float2)(0x1.5cc000p-1f, 0x1.b1dc6cp-13f),
+    (float2)(0x1.5ee000p-1f, 0x1.54920ap-20f),
+    (float2)(0x1.60e000p-1f, 0x1.97a23cp-16f),
+    (float2)(0x1.62e000p-1f, 0x1.0bfbe8p-15f),
+};
+
+DECLARE_VTABLE(float, LOG_INV_TBL, 129) = {
+    0x1.000000p+1f,
+    0x1.fc07f0p+0f,
+    0x1.f81f82p+0f,
+    0x1.f4465ap+0f,
+    0x1.f07c20p+0f,
+    0x1.ecc07cp+0f,
+    0x1.e9131ap+0f,
+    0x1.e573acp+0f,
+    0x1.e1e1e2p+0f,
+    0x1.de5d6ep+0f,
+    0x1.dae608p+0f,
+    0x1.d77b66p+0f,
+    0x1.d41d42p+0f,
+    0x1.d0cb58p+0f,
+    0x1.cd8568p+0f,
+    0x1.ca4b30p+0f,
+    0x1.c71c72p+0f,
+    0x1.c3f8f0p+0f,
+    0x1.c0e070p+0f,
+    0x1.bdd2b8p+0f,
+    0x1.bacf92p+0f,
+    0x1.b7d6c4p+0f,
+    0x1.b4e81cp+0f,
+    0x1.b20364p+0f,
+    0x1.af286cp+0f,
+    0x1.ac5702p+0f,
+    0x1.a98ef6p+0f,
+    0x1.a6d01ap+0f,
+    0x1.a41a42p+0f,
+    0x1.a16d40p+0f,
+    0x1.9ec8eap+0f,
+    0x1.9c2d14p+0f,
+    0x1.99999ap+0f,
+    0x1.970e50p+0f,
+    0x1.948b10p+0f,
+    0x1.920fb4p+0f,
+    0x1.8f9c18p+0f,
+    0x1.8d3018p+0f,
+    0x1.8acb90p+0f,
+    0x1.886e60p+0f,
+    0x1.861862p+0f,
+    0x1.83c978p+0f,
+    0x1.818182p+0f,
+    0x1.7f4060p+0f,
+    0x1.7d05f4p+0f,
+    0x1.7ad220p+0f,
+    0x1.78a4c8p+0f,
+    0x1.767dcep+0f,
+    0x1.745d18p+0f,
+    0x1.724288p+0f,
+    0x1.702e06p+0f,
+    0x1.6e1f76p+0f,
+    0x1.6c16c2p+0f,
+    0x1.6a13cep+0f,
+    0x1.681682p+0f,
+    0x1.661ec6p+0f,
+    0x1.642c86p+0f,
+    0x1.623fa8p+0f,
+    0x1.605816p+0f,
+    0x1.5e75bcp+0f,
+    0x1.5c9882p+0f,
+    0x1.5ac056p+0f,
+    0x1.58ed24p+0f,
+    0x1.571ed4p+0f,
+    0x1.555556p+0f,
+    0x1.539094p+0f,
+    0x1.51d07ep+0f,
+    0x1.501502p+0f,
+    0x1.4e5e0ap+0f,
+    0x1.4cab88p+0f,
+    0x1.4afd6ap+0f,
+    0x1.49539ep+0f,
+    0x1.47ae14p+0f,
+    0x1.460cbcp+0f,
+    0x1.446f86p+0f,
+    0x1.42d662p+0f,
+    0x1.414142p+0f,
+    0x1.3fb014p+0f,
+    0x1.3e22ccp+0f,
+    0x1.3c995ap+0f,
+    0x1.3b13b2p+0f,
+    0x1.3991c2p+0f,
+    0x1.381382p+0f,
+    0x1.3698e0p+0f,
+    0x1.3521d0p+0f,
+    0x1.33ae46p+0f,
+    0x1.323e34p+0f,
+    0x1.30d190p+0f,
+    0x1.2f684cp+0f,
+    0x1.2e025cp+0f,
+    0x1.2c9fb4p+0f,
+    0x1.2b404ap+0f,
+    0x1.29e412p+0f,
+    0x1.288b02p+0f,
+    0x1.27350cp+0f,
+    0x1.25e228p+0f,
+    0x1.24924ap+0f,
+    0x1.234568p+0f,
+    0x1.21fb78p+0f,
+    0x1.20b470p+0f,
+    0x1.1f7048p+0f,
+    0x1.1e2ef4p+0f,
+    0x1.1cf06ap+0f,
+    0x1.1bb4a4p+0f,
+    0x1.1a7b96p+0f,
+    0x1.194538p+0f,
+    0x1.181182p+0f,
+    0x1.16e068p+0f,
+    0x1.15b1e6p+0f,
+    0x1.1485f0p+0f,
+    0x1.135c82p+0f,
+    0x1.12358ep+0f,
+    0x1.111112p+0f,
+    0x1.0fef02p+0f,
+    0x1.0ecf56p+0f,
+    0x1.0db20ap+0f,
+    0x1.0c9714p+0f,
+    0x1.0b7e6ep+0f,
+    0x1.0a6810p+0f,
+    0x1.0953f4p+0f,
+    0x1.084210p+0f,
+    0x1.073260p+0f,
+    0x1.0624dep+0f,
+    0x1.051980p+0f,
+    0x1.041042p+0f,
+    0x1.03091cp+0f,
+    0x1.020408p+0f,
+    0x1.010102p+0f,
+    0x1.000000p+0f,
+};
+
+DECLARE_VTABLE(float2, LOG2_TBL, 129) = {
+    (float2)(0x0.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.6f8000p-7f, 0x1.942dbap-17f),
+    (float2)(0x1.6e0000p-6f, 0x1.e5a170p-16f),
+    (float2)(0x1.118000p-5f, 0x1.347544p-15f),
+    (float2)(0x1.6b8000p-5f, 0x1.69bac6p-16f),
+    (float2)(0x1.c48000p-5f, 0x1.7eae42p-15f),
+    (float2)(0x1.0e8000p-4f, 0x1.9c4fd0p-15f),
+    (float2)(0x1.3a8000p-4f, 0x1.17ee92p-15f),
+    (float2)(0x1.660000p-4f, 0x1.fb7d64p-15f),
+    (float2)(0x1.918000p-4f, 0x1.42dc8cp-17f),
+    (float2)(0x1.bc8000p-4f, 0x1.0902b6p-18f),
+    (float2)(0x1.e70000p-4f, 0x1.7608bep-15f),
+    (float2)(0x1.088000p-3f, 0x1.162336p-13f),
+    (float2)(0x1.1d8000p-3f, 0x1.3465d4p-13f),
+    (float2)(0x1.328000p-3f, 0x1.74f13cp-14f),
+    (float2)(0x1.470000p-3f, 0x1.aa7e60p-13f),
+    (float2)(0x1.5c0000p-3f, 0x1.a39fbcp-19f),
+    (float2)(0x1.700000p-3f, 0x1.d0b53ap-13f),
+    (float2)(0x1.848000p-3f, 0x1.0af40ap-13f),
+    (float2)(0x1.988000p-3f, 0x1.b741dep-13f),
+    (float2)(0x1.ac8000p-3f, 0x1.d78b6cp-13f),
+    (float2)(0x1.c08000p-3f, 0x1.6db376p-13f),
+    (float2)(0x1.d48000p-3f, 0x1.ee4c32p-15f),
+    (float2)(0x1.e80000p-3f, 0x1.02f9d2p-13f),
+    (float2)(0x1.fb8000p-3f, 0x1.05ae40p-13f),
+    (float2)(0x1.078000p-2f, 0x1.0adbb0p-14f),
+    (float2)(0x1.110000p-2f, 0x1.83ed68p-13f),
+    (float2)(0x1.1a8000p-2f, 0x1.016ca4p-12f),
+    (float2)(0x1.240000p-2f, 0x1.01eac2p-12f),
+    (float2)(0x1.2d8000p-2f, 0x1.887e26p-13f),
+    (float2)(0x1.370000p-2f, 0x1.24cea4p-14f),
+    (float2)(0x1.400000p-2f, 0x1.918ec6p-12f),
+    (float2)(0x1.498000p-2f, 0x1.3c25e6p-13f),
+    (float2)(0x1.528000p-2f, 0x1.6f7f12p-12f),
+    (float2)(0x1.5c0000p-2f, 0x1.a39fbcp-18f),
+    (float2)(0x1.650000p-2f, 0x1.8fe466p-14f),
+    (float2)(0x1.6e0000p-2f, 0x1.10e6cep-13f),
+    (float2)(0x1.770000p-2f, 0x1.d2ba7ep-14f),
+    (float2)(0x1.800000p-2f, 0x1.4ac62cp-15f),
+    (float2)(0x1.888000p-2f, 0x1.a71cb8p-12f),
+    (float2)(0x1.918000p-2f, 0x1.dd448ep-13f),
+    (float2)(0x1.9a8000p-2f, 0x1.1c8f10p-21f),
+    (float2)(0x1.a30000p-2f, 0x1.bb053ep-13f),
+    (float2)(0x1.ab8000p-2f, 0x1.861e5ep-12f),
+    (float2)(0x1.b40000p-2f, 0x1.fafdcep-12f),
+    (float2)(0x1.bd0000p-2f, 0x1.e5d3cep-15f),
+    (float2)(0x1.c58000p-2f, 0x1.2fad28p-14f),
+    (float2)(0x1.ce0000p-2f, 0x1.492474p-15f),
+    (float2)(0x1.d60000p-2f, 0x1.d4f80cp-12f),
+    (float2)(0x1.de8000p-2f, 0x1.4ff510p-12f),
+    (float2)(0x1.e70000p-2f, 0x1.3550f2p-13f),
+    (float2)(0x1.ef0000p-2f, 0x1.b59ccap-12f),
+    (float2)(0x1.f78000p-2f, 0x1.42b464p-13f),
+    (float2)(0x1.ff8000p-2f, 0x1.5e66a0p-12f),
+    (float2)(0x1.038000p-1f, 0x1.f6a2e4p-11f),
+    (float2)(0x1.080000p-1f, 0x1.39e4fep-14f),
+    (float2)(0x1.0c0000p-1f, 0x1.0500d6p-13f),
+    (float2)(0x1.100000p-1f, 0x1.13b152p-13f),
+    (float2)(0x1.140000p-1f, 0x1.93f542p-14f),
+    (float2)(0x1.180000p-1f, 0x1.467b94p-16f),
+    (float2)(0x1.1b8000p-1f, 0x1.cc47a4p-11f),
+    (float2)(0x1.1f8000p-1f, 0x1.78f4c2p-11f),
+    (float2)(0x1.238000p-1f, 0x1.107508p-11f),
+    (float2)(0x1.278000p-1f, 0x1.2602c2p-12f),
+    (float2)(0x1.2b8000p-1f, 0x1.a39fbcp-20f),
+    (float2)(0x1.2f0000p-1f, 0x1.5a1d7ap-11f),
+    (float2)(0x1.330000p-1f, 0x1.3e355ap-12f),
+    (float2)(0x1.368000p-1f, 0x1.cffedap-11f),
+    (float2)(0x1.3a8000p-1f, 0x1.d9fd50p-12f),
+    (float2)(0x1.3e0000p-1f, 0x1.f64de6p-11f),
+    (float2)(0x1.420000p-1f, 0x1.d83f4cp-12f),
+    (float2)(0x1.458000p-1f, 0x1.cea628p-11f),
+    (float2)(0x1.498000p-1f, 0x1.3c25e6p-12f),
+    (float2)(0x1.4d0000p-1f, 0x1.5a96ccp-11f),
+    (float2)(0x1.510000p-1f, 0x1.18708ap-17f),
+    (float2)(0x1.548000p-1f, 0x1.374652p-12f),
+    (float2)(0x1.580000p-1f, 0x1.2089a6p-11f),
+    (float2)(0x1.5b8000p-1f, 0x1.93432cp-11f),
+    (float2)(0x1.5f0000p-1f, 0x1.f3fd06p-11f),
+    (float2)(0x1.630000p-1f, 0x1.0b8f54p-13f),
+    (float2)(0x1.668000p-1f, 0x1.004722p-12f),
+    (float2)(0x1.6a0000p-1f, 0x1.57cf2cp-12f),
+    (float2)(0x1.6d8000p-1f, 0x1.8cb53ap-12f),
+    (float2)(0x1.710000p-1f, 0x1.9f4d8ap-12f),
+    (float2)(0x1.748000p-1f, 0x1.8feb26p-12f),
+    (float2)(0x1.780000p-1f, 0x1.5edfeep-12f),
+    (float2)(0x1.7b8000p-1f, 0x1.0c7c9ap-12f),
+    (float2)(0x1.7f0000p-1f, 0x1.322182p-13f),
+    (float2)(0x1.828000p-1f, 0x1.3ab7cep-18f),
+    (float2)(0x1.858000p-1f, 0x1.a82c2cp-11f),
+    (float2)(0x1.890000p-1f, 0x1.3dd2c0p-11f),
+    (float2)(0x1.8c8000p-1f, 0x1.871da4p-12f),
+    (float2)(0x1.900000p-1f, 0x1.cc2c00p-14f),
+    (float2)(0x1.930000p-1f, 0x1.9fdb68p-11f),
+    (float2)(0x1.968000p-1f, 0x1.ed6956p-12f),
+    (float2)(0x1.9a0000p-1f, 0x1.f1a760p-14f),
+    (float2)(0x1.9d0000p-1f, 0x1.767f54p-11f),
+    (float2)(0x1.a08000p-1f, 0x1.3f6d26p-12f),
+    (float2)(0x1.a38000p-1f, 0x1.b9fce2p-11f),
+    (float2)(0x1.a70000p-1f, 0x1.8ae816p-12f),
+    (float2)(0x1.aa0000p-1f, 0x1.c23d60p-11f),
+    (float2)(0x1.ad8000p-1f, 0x1.60f388p-12f),
+    (float2)(0x1.b08000p-1f, 0x1.9049aep-11f),
+    (float2)(0x1.b40000p-1f, 0x1.8734a8p-13f),
+    (float2)(0x1.b70000p-1f, 0x1.2523d4p-11f),
+    (float2)(0x1.ba0000p-1f, 0x1.da6ce6p-11f),
+    (float2)(0x1.bd8000p-1f, 0x1.038e62p-12f),
+    (float2)(0x1.c08000p-1f, 0x1.1b511ep-11f),
+    (float2)(0x1.c38000p-1f, 0x1.a728b8p-11f),
+    (float2)(0x1.c70000p-1f, 0x1.2b5d22p-14f),
+    (float2)(0x1.ca0000p-1f, 0x1.2c6e54p-12f),
+    (float2)(0x1.cd0000p-1f, 0x1.f35064p-12f),
+    (float2)(0x1.d00000p-1f, 0x1.4fdb48p-11f),
+    (float2)(0x1.d30000p-1f, 0x1.98ec9ep-11f),
+    (float2)(0x1.d60000p-1f, 0x1.d4f80cp-11f),
+    (float2)(0x1.d98000p-1f, 0x1.0643d6p-17f),
+    (float2)(0x1.dc8000p-1f, 0x1.33567ep-14f),
+    (float2)(0x1.df8000p-1f, 0x1.e0410cp-14f),
+    (float2)(0x1.e28000p-1f, 0x1.142e0ep-13f),
+    (float2)(0x1.e58000p-1f, 0x1.063c88p-13f),
+    (float2)(0x1.e88000p-1f, 0x1.8d66c4p-14f),
+    (float2)(0x1.eb8000p-1f, 0x1.57e32ap-15f),
+    (float2)(0x1.ee0000p-1f, 0x1.ed1c6cp-11f),
+    (float2)(0x1.f10000p-1f, 0x1.b8a076p-11f),
+    (float2)(0x1.f40000p-1f, 0x1.7822f2p-11f),
+    (float2)(0x1.f70000p-1f, 0x1.2bbc3ap-11f),
+    (float2)(0x1.fa0000p-1f, 0x1.a708bap-12f),
+    (float2)(0x1.fd0000p-1f, 0x1.be4c7ep-13f),
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f)
+};
+
+DECLARE_VTABLE(uchar, PIBITS_TBL, ) = {
+    224, 241, 27, 193, 12, 88, 33, 116, 53, 126, 196, 126, 237, 175,
+    169, 75, 74, 41, 222, 231, 28, 244, 236, 197, 151, 175, 31,
+    235, 158, 212, 181, 168, 127, 121, 154, 253, 24, 61, 221, 38,
+    44, 159, 60, 251, 217, 180, 125, 180, 41, 104, 45, 70, 188,
+    188, 63, 96, 22, 120, 255, 95, 226, 127, 236, 160, 228, 247,
+    46, 126, 17, 114, 210, 231, 76, 13, 230, 88, 71, 230, 4, 249,
+    125, 209, 154, 192, 113, 166, 19, 18, 237, 186, 212, 215, 8,
+    162, 251, 156, 166, 196, 114, 172, 119, 248, 115, 72, 70, 39,
+    168, 187, 36, 25, 128, 75, 55, 9, 233, 184, 145, 220, 134, 21,
+    239, 122, 175, 142, 69, 249, 7, 65, 14, 241, 100, 86, 138, 109,
+    3, 119, 211, 212, 71, 95, 157, 240, 167, 84, 16, 57, 185, 13,
+    230, 139, 2, 0, 0, 0, 0, 0, 0, 0
+};
+
+// Tabulated values of sinh(i) and cosh(i) for i = 0,...,36.
+DECLARE_VTABLE(float2, SINHCOSH_TBL, 37) = {
+    (float2)(0x0.000000p+0f, 0x1.000000p+0f),
+    (float2)(0x1.2cd9fcp+0f, 0x1.8b0756p+0f),
+    (float2)(0x1.d03cf6p+1f, 0x1.e18fa0p+1f),
+    (float2)(0x1.40926ep+3f, 0x1.422a4ap+3f),
+    (float2)(0x1.b4a380p+4f, 0x1.b4ee86p+4f),
+    (float2)(0x1.28d016p+6f, 0x1.28d6fcp+6f),
+    (float2)(0x1.936d22p+7f, 0x1.936e68p+7f),
+    (float2)(0x1.122876p+9f, 0x1.122894p+9f),
+    (float2)(0x1.749ea6p+10f, 0x1.749eaap+10f),
+    (float2)(0x1.fa7158p+11f, 0x1.fa7158p+11f),
+    (float2)(0x1.5829dcp+13f, 0x1.5829dep+13f),
+    (float2)(0x1.d3c448p+14f, 0x1.d3c448p+14f),
+    (float2)(0x1.3de166p+16f, 0x1.3de166p+16f),
+    (float2)(0x1.b00b5ap+17f, 0x1.b00b5ap+17f),
+    (float2)(0x1.259ac4p+19f, 0x1.259ac4p+19f),
+    (float2)(0x1.8f0ccap+20f, 0x1.8f0ccap+20f),
+    (float2)(0x1.0f2ebep+22f, 0x1.0f2ebep+22f),
+    (float2)(0x1.709348p+23f, 0x1.709348p+23f),
+    (float2)(0x1.f4f220p+24f, 0x1.f4f220p+24f),
+    (float2)(0x1.546d90p+26f, 0x1.546d90p+26f),
+    (float2)(0x1.ceb088p+27f, 0x1.ceb088p+27f),
+    (float2)(0x1.3a6e20p+29f, 0x1.3a6e20p+29f),
+    (float2)(0x1.ab5adcp+30f, 0x1.ab5adcp+30f),
+    (float2)(0x1.226af4p+32f, 0x1.226af4p+32f),
+    (float2)(0x1.8ab7fcp+33f, 0x1.8ab7fcp+33f),
+    (float2)(0x1.0c3d3ap+35f, 0x1.0c3d3ap+35f),
+    (float2)(0x1.6c9326p+36f, 0x1.6c9326p+36f),
+    (float2)(0x1.ef8230p+37f, 0x1.ef8230p+37f),
+    (float2)(0x1.50bba4p+39f, 0x1.50bba4p+39f),
+    (float2)(0x1.c9aae4p+40f, 0x1.c9aae4p+40f),
+    (float2)(0x1.370470p+42f, 0x1.370470p+42f),
+    (float2)(0x1.a6b766p+43f, 0x1.a6b766p+43f),
+    (float2)(0x1.1f43fcp+45f, 0x1.1f43fcp+45f),
+    (float2)(0x1.866f34p+46f, 0x1.866f34p+46f),
+    (float2)(0x1.0953e2p+48f, 0x1.0953e2p+48f),
+    (float2)(0x1.689e22p+49f, 0x1.689e22p+49f),
+    (float2)(0x1.ea215ap+50f, 0x1.ea215ap+50f)
+};
+
+DECLARE_VTABLE(float2, CBRT_TBL, 129) = {
+    (float2)(0x1.000000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.008000p+0f, 0x1.51cb0ap-11f),
+    (float2)(0x1.014000p+0f, 0x1.39221ep-12f),
+    (float2)(0x1.01c000p+0f, 0x1.e06908p-11f),
+    (float2)(0x1.028000p+0f, 0x1.1d6978p-11f),
+    (float2)(0x1.034000p+0f, 0x1.4ea1bep-13f),
+    (float2)(0x1.03c000p+0f, 0x1.833b8ep-11f),
+    (float2)(0x1.048000p+0f, 0x1.587002p-12f),
+    (float2)(0x1.050000p+0f, 0x1.ceb290p-11f),
+    (float2)(0x1.05c000p+0f, 0x1.d57f34p-12f),
+    (float2)(0x1.068000p+0f, 0x1.cc53acp-21f),
+    (float2)(0x1.070000p+0f, 0x1.0fe098p-11f),
+    (float2)(0x1.07c000p+0f, 0x1.91b586p-15f),
+    (float2)(0x1.084000p+0f, 0x1.1c362ep-11f),
+    (float2)(0x1.090000p+0f, 0x1.94398ep-15f),
+    (float2)(0x1.098000p+0f, 0x1.1055bcp-11f),
+    (float2)(0x1.0a4000p+0f, 0x1.7e63cap-19f),
+    (float2)(0x1.0ac000p+0f, 0x1.d99e1ap-12f),
+    (float2)(0x1.0b4000p+0f, 0x1.d258dep-11f),
+    (float2)(0x1.0c0000p+0f, 0x1.645962p-12f),
+    (float2)(0x1.0c8000p+0f, 0x1.8c5b0ep-11f),
+    (float2)(0x1.0d4000p+0f, 0x1.83d0c8p-13f),
+    (float2)(0x1.0dc000p+0f, 0x1.300812p-11f),
+    (float2)(0x1.0e4000p+0f, 0x1.f9a65ap-11f),
+    (float2)(0x1.0f0000p+0f, 0x1.7bbcd8p-12f),
+    (float2)(0x1.0f8000p+0f, 0x1.7cbf68p-11f),
+    (float2)(0x1.104000p+0f, 0x1.b2c166p-14f),
+    (float2)(0x1.10c000p+0f, 0x1.d56ea4p-12f),
+    (float2)(0x1.114000p+0f, 0x1.99eb32p-11f),
+    (float2)(0x1.120000p+0f, 0x1.1007a2p-13f),
+    (float2)(0x1.128000p+0f, 0x1.d212aap-12f),
+    (float2)(0x1.130000p+0f, 0x1.890f18p-11f),
+    (float2)(0x1.13c000p+0f, 0x1.2104e2p-14f),
+    (float2)(0x1.144000p+0f, 0x1.74961ep-12f),
+    (float2)(0x1.14c000p+0f, 0x1.4b9b66p-11f),
+    (float2)(0x1.154000p+0f, 0x1.d81e66p-11f),
+    (float2)(0x1.160000p+0f, 0x1.7f825cp-13f),
+    (float2)(0x1.168000p+0f, 0x1.c5dca2p-12f),
+    (float2)(0x1.170000p+0f, 0x1.6153bap-11f),
+    (float2)(0x1.178000p+0f, 0x1.db1cc2p-11f),
+    (float2)(0x1.184000p+0f, 0x1.4154b0p-13f),
+    (float2)(0x1.18c000p+0f, 0x1.821114p-12f),
+    (float2)(0x1.194000p+0f, 0x1.2d4240p-11f),
+    (float2)(0x1.19c000p+0f, 0x1.950d82p-11f),
+    (float2)(0x1.1a4000p+0f, 0x1.f8755cp-11f),
+    (float2)(0x1.1b0000p+0f, 0x1.5e12a4p-13f),
+    (float2)(0x1.1b8000p+0f, 0x1.648c38p-12f),
+    (float2)(0x1.1c0000p+0f, 0x1.08c43ep-11f),
+    (float2)(0x1.1c8000p+0f, 0x1.5b0970p-11f),
+    (float2)(0x1.1d0000p+0f, 0x1.a91fe8p-11f),
+    (float2)(0x1.1d8000p+0f, 0x1.f311b6p-11f),
+    (float2)(0x1.1e4000p+0f, 0x1.c74618p-14f),
+    (float2)(0x1.1ec000p+0f, 0x1.eabb54p-13f),
+    (float2)(0x1.1f4000p+0f, 0x1.70db14p-12f),
+    (float2)(0x1.1fc000p+0f, 0x1.e45cbcp-12f),
+    (float2)(0x1.204000p+0f, 0x1.27faa6p-11f),
+    (float2)(0x1.20c000p+0f, 0x1.59db98p-11f),
+    (float2)(0x1.214000p+0f, 0x1.87da46p-11f),
+    (float2)(0x1.21c000p+0f, 0x1.b1ffa0p-11f),
+    (float2)(0x1.224000p+0f, 0x1.d85478p-11f),
+    (float2)(0x1.22c000p+0f, 0x1.fae17ep-11f),
+    (float2)(0x1.238000p+0f, 0x1.9af40cp-15f),
+    (float2)(0x1.240000p+0f, 0x1.a6319ep-14f),
+    (float2)(0x1.248000p+0f, 0x1.30baa6p-13f),
+    (float2)(0x1.250000p+0f, 0x1.7fc362p-13f),
+    (float2)(0x1.258000p+0f, 0x1.c05362p-13f),
+    (float2)(0x1.260000p+0f, 0x1.f28a98p-13f),
+    (float2)(0x1.268000p+0f, 0x1.0b4442p-12f),
+    (float2)(0x1.270000p+0f, 0x1.16361ap-12f),
+    (float2)(0x1.278000p+0f, 0x1.1a2a2ap-12f),
+    (float2)(0x1.280000p+0f, 0x1.172f8ep-12f),
+    (float2)(0x1.288000p+0f, 0x1.0d5530p-12f),
+    (float2)(0x1.290000p+0f, 0x1.f9538ep-13f),
+    (float2)(0x1.298000p+0f, 0x1.ca77b0p-13f),
+    (float2)(0x1.2a0000p+0f, 0x1.8e336ap-13f),
+    (float2)(0x1.2a8000p+0f, 0x1.44a304p-13f),
+    (float2)(0x1.2b0000p+0f, 0x1.dbc4c8p-14f),
+    (float2)(0x1.2b8000p+0f, 0x1.141a2ap-14f),
+    (float2)(0x1.2c0000p+0f, 0x1.93e44cp-17f),
+    (float2)(0x1.2c4000p+0f, 0x1.e6e432p-11f),
+    (float2)(0x1.2cc000p+0f, 0x1.c447c6p-11f),
+    (float2)(0x1.2d4000p+0f, 0x1.9e80d8p-11f),
+    (float2)(0x1.2dc000p+0f, 0x1.7595dcp-11f),
+    (float2)(0x1.2e4000p+0f, 0x1.498d30p-11f),
+    (float2)(0x1.2ec000p+0f, 0x1.1a6d1ep-11f),
+    (float2)(0x1.2f4000p+0f, 0x1.d077bap-12f),
+    (float2)(0x1.2fc000p+0f, 0x1.65ff1ep-12f),
+    (float2)(0x1.304000p+0f, 0x1.eaf912p-13f),
+    (float2)(0x1.30c000p+0f, 0x1.fbefb8p-14f),
+    (float2)(0x1.314000p+0f, 0x1.44905ap-19f),
+    (float2)(0x1.318000p+0f, 0x1.c017e6p-11f),
+    (float2)(0x1.320000p+0f, 0x1.7bfdbep-11f),
+    (float2)(0x1.328000p+0f, 0x1.34fbc6p-11f),
+    (float2)(0x1.330000p+0f, 0x1.d62f48p-12f),
+    (float2)(0x1.338000p+0f, 0x1.3cadc6p-12f),
+    (float2)(0x1.340000p+0f, 0x1.3afc06p-13f),
+    (float2)(0x1.344000p+0f, 0x1.fc556ep-11f),
+    (float2)(0x1.34c000p+0f, 0x1.a71f84p-11f),
+    (float2)(0x1.354000p+0f, 0x1.4f2290p-11f),
+    (float2)(0x1.35c000p+0f, 0x1.e8c79cp-12f),
+    (float2)(0x1.364000p+0f, 0x1.2dd0d8p-12f),
+    (float2)(0x1.36c000p+0f, 0x1.b5ac2ep-14f),
+    (float2)(0x1.370000p+0f, 0x1.d3d02ap-11f),
+    (float2)(0x1.378000p+0f, 0x1.6e3d58p-11f),
+    (float2)(0x1.380000p+0f, 0x1.060200p-11f),
+    (float2)(0x1.388000p+0f, 0x1.364608p-12f),
+    (float2)(0x1.390000p+0f, 0x1.6d29b6p-14f),
+    (float2)(0x1.394000p+0f, 0x1.bd8d5ep-11f),
+    (float2)(0x1.39c000p+0f, 0x1.4ae030p-11f),
+    (float2)(0x1.3a4000p+0f, 0x1.ab44b2p-12f),
+    (float2)(0x1.3ac000p+0f, 0x1.7761cep-13f),
+    (float2)(0x1.3b0000p+0f, 0x1.e38710p-11f),
+    (float2)(0x1.3b8000p+0f, 0x1.66b2b0p-11f),
+    (float2)(0x1.3c0000p+0f, 0x1.cebf96p-12f),
+    (float2)(0x1.3c8000p+0f, 0x1.964b20p-13f),
+    (float2)(0x1.3cc000p+0f, 0x1.e15004p-11f),
+    (float2)(0x1.3d4000p+0f, 0x1.5a9bcep-11f),
+    (float2)(0x1.3dc000p+0f, 0x1.a2f4d8p-12f),
+    (float2)(0x1.3e4000p+0f, 0x1.17c056p-13f),
+    (float2)(0x1.3e8000p+0f, 0x1.b800f8p-11f),
+    (float2)(0x1.3f0000p+0f, 0x1.27b132p-11f),
+    (float2)(0x1.3f8000p+0f, 0x1.2a09b8p-12f),
+    (float2)(0x1.400000p+0f, 0x0.000000p+0f),
+    (float2)(0x1.404000p+0f, 0x1.68a69cp-11f),
+    (float2)(0x1.40c000p+0f, 0x1.9df950p-12f),
+    (float2)(0x1.414000p+0f, 0x1.983050p-14f),
+    (float2)(0x1.418000p+0f, 0x1.94c6a4p-11f),
+    (float2)(0x1.420000p+0f, 0x1.e88494p-12f),
+    (float2)(0x1.428000p+0f, 0x1.45f31ap-13f)
+};
+
+DECLARE_VTABLE(float, EXP_TBL, 65) = {
+    0x1.000000p+0f,
+    0x1.02c9a4p+0f,
+    0x1.059b0ep+0f,
+    0x1.087452p+0f,
+    0x1.0b5586p+0f,
+    0x1.0e3ec4p+0f,
+    0x1.11301ep+0f,
+    0x1.1429aap+0f,
+    0x1.172b84p+0f,
+    0x1.1a35bep+0f,
+    0x1.1d4874p+0f,
+    0x1.2063b8p+0f,
+    0x1.2387a6p+0f,
+    0x1.26b456p+0f,
+    0x1.29e9e0p+0f,
+    0x1.2d285ap+0f,
+    0x1.306fe0p+0f,
+    0x1.33c08cp+0f,
+    0x1.371a74p+0f,
+    0x1.3a7db4p+0f,
+    0x1.3dea64p+0f,
+    0x1.4160a2p+0f,
+    0x1.44e086p+0f,
+    0x1.486a2cp+0f,
+    0x1.4bfdaep+0f,
+    0x1.4f9b28p+0f,
+    0x1.5342b6p+0f,
+    0x1.56f474p+0f,
+    0x1.5ab07ep+0f,
+    0x1.5e76f2p+0f,
+    0x1.6247ecp+0f,
+    0x1.662388p+0f,
+    0x1.6a09e6p+0f,
+    0x1.6dfb24p+0f,
+    0x1.71f75ep+0f,
+    0x1.75feb6p+0f,
+    0x1.7a1148p+0f,
+    0x1.7e2f34p+0f,
+    0x1.82589ap+0f,
+    0x1.868d9ap+0f,
+    0x1.8ace54p+0f,
+    0x1.8f1aeap+0f,
+    0x1.93737cp+0f,
+    0x1.97d82ap+0f,
+    0x1.9c4918p+0f,
+    0x1.a0c668p+0f,
+    0x1.a5503cp+0f,
+    0x1.a9e6b6p+0f,
+    0x1.ae89fap+0f,
+    0x1.b33a2cp+0f,
+    0x1.b7f770p+0f,
+    0x1.bcc1eap+0f,
+    0x1.c199bep+0f,
+    0x1.c67f12p+0f,
+    0x1.cb720ep+0f,
+    0x1.d072d4p+0f,
+    0x1.d5818ep+0f,
+    0x1.da9e60p+0f,
+    0x1.dfc974p+0f,
+    0x1.e502eep+0f,
+    0x1.ea4afap+0f,
+    0x1.efa1bep+0f,
+    0x1.f50766p+0f,
+    0x1.fa7c18p+0f,
+    0x1.000000p+1f,
+};
+
+DECLARE_VTABLE(float2, EXP_TBL_EP, 65) = {
+    (float2) (0x1.000000p+0f, 0x0.000000p+0f),
+    (float2) (0x1.02c000p+0f, 0x1.347ceep-13f),
+    (float2) (0x1.058000p+0f, 0x1.b0d314p-12f),
+    (float2) (0x1.084000p+0f, 0x1.a28c3ap-11f),
+    (float2) (0x1.0b4000p+0f, 0x1.586cf8p-12f),
+    (float2) (0x1.0e0000p+0f, 0x1.f61968p-11f),
+    (float2) (0x1.110000p+0f, 0x1.80e808p-11f),
+    (float2) (0x1.140000p+0f, 0x1.4d5754p-11f),
+    (float2) (0x1.170000p+0f, 0x1.5c1e3ep-11f),
+    (float2) (0x1.1a0000p+0f, 0x1.adf5b6p-11f),
+    (float2) (0x1.1d4000p+0f, 0x1.0e62d0p-13f),
+    (float2) (0x1.204000p+0f, 0x1.1dc430p-11f),
+    (float2) (0x1.238000p+0f, 0x1.e9b9d4p-14f),
+    (float2) (0x1.268000p+0f, 0x1.a2b2f0p-11f),
+    (float2) (0x1.29c000p+0f, 0x1.4efa8ep-11f),
+    (float2) (0x1.2d0000p+0f, 0x1.42d372p-11f),
+    (float2) (0x1.304000p+0f, 0x1.7f0518p-11f),
+    (float2) (0x1.33c000p+0f, 0x1.164c82p-17f),
+    (float2) (0x1.370000p+0f, 0x1.a7373ap-12f),
+    (float2) (0x1.3a4000p+0f, 0x1.ed9a72p-11f),
+    (float2) (0x1.3dc000p+0f, 0x1.532608p-11f),
+    (float2) (0x1.414000p+0f, 0x1.0510fap-11f),
+    (float2) (0x1.44c000p+0f, 0x1.043030p-11f),
+    (float2) (0x1.484000p+0f, 0x1.515ae0p-11f),
+    (float2) (0x1.4bc000p+0f, 0x1.ed6a9ap-11f),
+    (float2) (0x1.4f8000p+0f, 0x1.b2769cp-12f),
+    (float2) (0x1.534000p+0f, 0x1.5ab4eap-15f),
+    (float2) (0x1.56c000p+0f, 0x1.a39b5ap-11f),
+    (float2) (0x1.5a8000p+0f, 0x1.83eea4p-11f),
+    (float2) (0x1.5e4000p+0f, 0x1.b78ad6p-11f),
+    (float2) (0x1.624000p+0f, 0x1.fac0e8p-14f),
+    (float2) (0x1.660000p+0f, 0x1.1c412ap-11f),
+    (float2) (0x1.6a0000p+0f, 0x1.3cccfep-13f),
+    (float2) (0x1.6dc000p+0f, 0x1.d91e32p-11f),
+    (float2) (0x1.71c000p+0f, 0x1.baf476p-11f),
+    (float2) (0x1.75c000p+0f, 0x1.f5ab20p-11f),
+    (float2) (0x1.7a0000p+0f, 0x1.1473eap-12f),
+    (float2) (0x1.7e0000p+0f, 0x1.799b66p-11f),
+    (float2) (0x1.824000p+0f, 0x1.89994cp-12f),
+    (float2) (0x1.868000p+0f, 0x1.b33688p-13f),
+    (float2) (0x1.8ac000p+0f, 0x1.ca8454p-13f),
+    (float2) (0x1.8f0000p+0f, 0x1.ae9914p-12f),
+    (float2) (0x1.934000p+0f, 0x1.9bd866p-11f),
+    (float2) (0x1.97c000p+0f, 0x1.829fdep-12f),
+    (float2) (0x1.9c4000p+0f, 0x1.230546p-13f),
+    (float2) (0x1.a0c000p+0f, 0x1.99ed76p-14f),
+    (float2) (0x1.a54000p+0f, 0x1.03b23ep-12f),
+    (float2) (0x1.a9c000p+0f, 0x1.35aabcp-11f),
+    (float2) (0x1.ae8000p+0f, 0x1.3f32b4p-13f),
+    (float2) (0x1.b30000p+0f, 0x1.d15c26p-11f),
+    (float2) (0x1.b7c000p+0f, 0x1.bb797cp-11f),
+    (float2) (0x1.bcc000p+0f, 0x1.e904bcp-16f),
+    (float2) (0x1.c18000p+0f, 0x1.9bdd84p-12f),
+    (float2) (0x1.c64000p+0f, 0x1.f8972ap-11f),
+    (float2) (0x1.cb4000p+0f, 0x1.906e76p-11f),
+    (float2) (0x1.d04000p+0f, 0x1.96a502p-11f),
+    (float2) (0x1.d58000p+0f, 0x1.8dcfbap-16f),
+    (float2) (0x1.da8000p+0f, 0x1.e603dap-12f),
+    (float2) (0x1.dfc000p+0f, 0x1.2e66f6p-13f),
+    (float2) (0x1.e50000p+0f, 0x1.773c58p-15f),
+    (float2) (0x1.ea4000p+0f, 0x1.5f4548p-13f),
+    (float2) (0x1.ef8000p+0f, 0x1.0df730p-11f),
+    (float2) (0x1.f50000p+0f, 0x1.d96db8p-14f),
+    (float2) (0x1.fa4000p+0f, 0x1.e0c0cep-11f),
+    (float2) (0x1.000000p+1f, 0x0.000000p+0f),
+};
+
+VTABLE_FUNCTION4(PIBITS_TBL, pibits_tbl);
+
+VTABLE_FUNCTION(float, LOG_INV_TBL, log_inv_tbl);
+VTABLE_FUNCTION(float, EXP_TBL, exp_tbl);
+
+VTABLE_FUNCTION2(v2float, LOGE_TBL, loge_tbl);
+VTABLE_FUNCTION2(v2float, LOG2_TBL, log2_tbl);
+VTABLE_FUNCTION2(v2float, SINHCOSH_TBL, sinhcosh_tbl);
+VTABLE_FUNCTION2(v2float, CBRT_TBL, cbrt_tbl);
+VTABLE_FUNCTION2(v2float, EXP_TBL_EP, exp_tbl_ep);
diff --git a/lib/kernel/libclc/vtables_fp64.cl b/lib/kernel/libclc/vtables_fp64.cl
new file mode 100644
index 0000000..4414290
--- /dev/null
+++ b/lib/kernel/libclc/vtables_fp64.cl
@@ -0,0 +1,1039 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Copyright (c) 2017 Michal Babej / Tampere University of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "misc.h"
+#include "vtables.h"
+#include "vtables_macros.h"
+
+#ifdef cl_khr_fp64
+
+DECLARE_VTABLE(double2, LN_TBL, 65) = {
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.fc0a800000000p-7, 0x1.61f807c79f3dbp-28),
+    (double2)(0x1.f829800000000p-6, 0x1.873c1980267c8p-25),
+    (double2)(0x1.7745800000000p-5, 0x1.ec65b9f88c69ep-26),
+    (double2)(0x1.f0a3000000000p-5, 0x1.8022c54cc2f99p-26),
+    (double2)(0x1.341d700000000p-4, 0x1.2c37a3a125330p-25),
+    (double2)(0x1.6f0d200000000p-4, 0x1.15cad69737c93p-25),
+    (double2)(0x1.a926d00000000p-4, 0x1.d256ab1b285e9p-27),
+    (double2)(0x1.e270700000000p-4, 0x1.b8abcb97a7aa2p-26),
+    (double2)(0x1.0d77e00000000p-3, 0x1.f34239659a5dcp-25),
+    (double2)(0x1.2955280000000p-3, 0x1.e07fd48d30177p-25),
+    (double2)(0x1.44d2b00000000p-3, 0x1.b32df4799f4f6p-25),
+    (double2)(0x1.5ff3000000000p-3, 0x1.c29e4f4f21cf8p-25),
+    (double2)(0x1.7ab8900000000p-3, 0x1.086c848df1b59p-30),
+    (double2)(0x1.9525a80000000p-3, 0x1.cf456b4764130p-27),
+    (double2)(0x1.af3c900000000p-3, 0x1.3a02ffcb63398p-25),
+    (double2)(0x1.c8ff780000000p-3, 0x1.1e6a6886b0976p-25),
+    (double2)(0x1.e270700000000p-3, 0x1.b8abcb97a7aa2p-25),
+    (double2)(0x1.fb91800000000p-3, 0x1.b578f8aa35552p-25),
+    (double2)(0x1.0a324c0000000p-2, 0x1.139c871afb9fcp-25),
+    (double2)(0x1.1675c80000000p-2, 0x1.5d5d30701ce64p-25),
+    (double2)(0x1.22941c0000000p-2, 0x1.de7bcb2d12142p-25),
+    (double2)(0x1.2e8e280000000p-2, 0x1.d708e984e1664p-25),
+    (double2)(0x1.3a64c40000000p-2, 0x1.56945e9c72f36p-26),
+    (double2)(0x1.4618bc0000000p-2, 0x1.0e2f613e85bdap-29),
+    (double2)(0x1.51aad80000000p-2, 0x1.cb7e0b42724f6p-28),
+    (double2)(0x1.5d1bd80000000p-2, 0x1.fac04e52846c7p-25),
+    (double2)(0x1.686c800000000p-2, 0x1.e9b14aec442bep-26),
+    (double2)(0x1.739d7c0000000p-2, 0x1.b5de8034e7126p-25),
+    (double2)(0x1.7eaf800000000p-2, 0x1.dc157e1b259d3p-25),
+    (double2)(0x1.89a3380000000p-2, 0x1.b05096ad69c62p-28),
+    (double2)(0x1.9479400000000p-2, 0x1.c2116faba4cddp-26),
+    (double2)(0x1.9f323c0000000p-2, 0x1.65fcc25f95b47p-25),
+    (double2)(0x1.a9cec80000000p-2, 0x1.a9a08498d4850p-26),
+    (double2)(0x1.b44f740000000p-2, 0x1.de647b1465f77p-25),
+    (double2)(0x1.beb4d80000000p-2, 0x1.da71b7bf7861dp-26),
+    (double2)(0x1.c8ff7c0000000p-2, 0x1.e6a6886b09760p-28),
+    (double2)(0x1.d32fe40000000p-2, 0x1.f0075eab0ef64p-25),
+    (double2)(0x1.dd46a00000000p-2, 0x1.3071282fb989bp-28),
+    (double2)(0x1.e744240000000p-2, 0x1.0eb43c3f1bed2p-25),
+    (double2)(0x1.f128f40000000p-2, 0x1.faf06ecb35c84p-26),
+    (double2)(0x1.faf5880000000p-2, 0x1.ef1e63db35f68p-27),
+    (double2)(0x1.02552a0000000p-1, 0x1.69743fb1a71a5p-27),
+    (double2)(0x1.0723e40000000p-1, 0x1.c1cdf404e5796p-25),
+    (double2)(0x1.0be72e0000000p-1, 0x1.094aa0ada625ep-27),
+    (double2)(0x1.109f380000000p-1, 0x1.e2d4c96fde3ecp-25),
+    (double2)(0x1.154c3c0000000p-1, 0x1.2f4d5e9a98f34p-25),
+    (double2)(0x1.19ee6a0000000p-1, 0x1.467c96ecc5cbep-25),
+    (double2)(0x1.1e85f40000000p-1, 0x1.e7040d03dec5ap-25),
+    (double2)(0x1.23130c0000000p-1, 0x1.7bebf4282de36p-25),
+    (double2)(0x1.2795e00000000p-1, 0x1.289b11aeb783fp-25),
+    (double2)(0x1.2c0e9e0000000p-1, 0x1.a891d1772f538p-26),
+    (double2)(0x1.307d720000000p-1, 0x1.34f10be1fb591p-25),
+    (double2)(0x1.34e2880000000p-1, 0x1.d9ce1d316eb93p-25),
+    (double2)(0x1.393e0c0000000p-1, 0x1.3562a19a9c442p-25),
+    (double2)(0x1.3d90260000000p-1, 0x1.4e2adf548084cp-26),
+    (double2)(0x1.41d8fe0000000p-1, 0x1.08ce55cc8c97ap-26),
+    (double2)(0x1.4618bc0000000p-1, 0x1.0e2f613e85bdap-28),
+    (double2)(0x1.4a4f840000000p-1, 0x1.db03ebb0227bfp-25),
+    (double2)(0x1.4e7d800000000p-1, 0x1.1b75bb09cb098p-25),
+    (double2)(0x1.52a2d20000000p-1, 0x1.96f16abb9df22p-27),
+    (double2)(0x1.56bf9c0000000p-1, 0x1.5b3f399411c62p-25),
+    (double2)(0x1.5ad4040000000p-1, 0x1.86b3e59f65355p-26),
+    (double2)(0x1.5ee02a0000000p-1, 0x1.2482ceae1ac12p-26),
+    (double2)(0x1.62e42e0000000p-1, 0x1.efa39ef35793cp-25),
+};
+
+
+
+// Arrays atan_jby256_lead and atan_jby256_tail contain
+// leading and trailing parts respectively of precomputed
+// values of atan(j/256), for j = 16, 17, ..., 256.
+// atan_jby256_lead contains the first 21 bits of precision,
+// and atan_jby256_tail contains a further 53 bits precision.
+
+DECLARE_VTABLE(double2, ATAN_JBY256_TBL, 241) = {
+    (double2)(0x1.ff55b00000000p-5, 0x1.6e59fbd38db2cp-26),
+    (double2)(0x1.0f99e00000000p-4, 0x1.4e3aa54dedf96p-25),
+    (double2)(0x1.1f86d00000000p-4, 0x1.7e105ab1bda88p-25),
+    (double2)(0x1.2f71900000000p-4, 0x1.8c5254d013fd0p-27),
+    (double2)(0x1.3f59f00000000p-4, 0x1.cf8ab3ad62670p-29),
+    (double2)(0x1.4f3fd00000000p-4, 0x1.9dca4bec80468p-26),
+    (double2)(0x1.5f23200000000p-4, 0x1.3f4b5ec98a8dap-26),
+    (double2)(0x1.6f03b00000000p-4, 0x1.b9d49619d81fep-25),
+    (double2)(0x1.7ee1800000000p-4, 0x1.3017887460934p-27),
+    (double2)(0x1.8ebc500000000p-4, 0x1.11e3eca0b9944p-26),
+    (double2)(0x1.9e94100000000p-4, 0x1.4f3f73c5a332ep-26),
+    (double2)(0x1.ae68a00000000p-4, 0x1.c71c8ae0e00a6p-26),
+    (double2)(0x1.be39e00000000p-4, 0x1.7cde0f86fbdc7p-25),
+    (double2)(0x1.ce07c00000000p-4, 0x1.70f328c889c72p-26),
+    (double2)(0x1.ddd2100000000p-4, 0x1.c07ae9b994efep-26),
+    (double2)(0x1.ed98c00000000p-4, 0x1.0c8021d7b1698p-27),
+    (double2)(0x1.fd5ba00000000p-4, 0x1.35585edb8cb22p-25),
+    (double2)(0x1.068d500000000p-3, 0x1.0842567b30e96p-24),
+    (double2)(0x1.0e6ad00000000p-3, 0x1.99e811031472ep-24),
+    (double2)(0x1.1646500000000p-3, 0x1.041821416bceep-25),
+    (double2)(0x1.1e1fa00000000p-3, 0x1.f6086e4dc96f4p-24),
+    (double2)(0x1.25f6e00000000p-3, 0x1.71a535c5f1b58p-27),
+    (double2)(0x1.2dcbd00000000p-3, 0x1.65f743fe63ca1p-24),
+    (double2)(0x1.359e800000000p-3, 0x1.dbd733472d014p-24),
+    (double2)(0x1.3d6ee00000000p-3, 0x1.d18cc4d8b0d1dp-24),
+    (double2)(0x1.453ce00000000p-3, 0x1.8c12553c8fb29p-24),
+    (double2)(0x1.4d08700000000p-3, 0x1.53b49e2e8f991p-24),
+    (double2)(0x1.54d1800000000p-3, 0x1.7422ae148c141p-24),
+    (double2)(0x1.5c98100000000p-3, 0x1.e3ec269df56a8p-27),
+    (double2)(0x1.645bf00000000p-3, 0x1.ff6754e7e0ac9p-24),
+    (double2)(0x1.6c1d400000000p-3, 0x1.131267b1b5aadp-24),
+    (double2)(0x1.73dbd00000000p-3, 0x1.d14fa403a94bcp-24),
+    (double2)(0x1.7b97b00000000p-3, 0x1.2f396c089a3d8p-25),
+    (double2)(0x1.8350b00000000p-3, 0x1.c731d78fa95bbp-24),
+    (double2)(0x1.8b06e00000000p-3, 0x1.c50f385177399p-24),
+    (double2)(0x1.92ba300000000p-3, 0x1.f41409c6f2c20p-25),
+    (double2)(0x1.9a6a800000000p-3, 0x1.d2d90c4c39ec0p-24),
+    (double2)(0x1.a217e00000000p-3, 0x1.80420696f2106p-25),
+    (double2)(0x1.a9c2300000000p-3, 0x1.b40327943a2e8p-27),
+    (double2)(0x1.b169600000000p-3, 0x1.5d35e02f3d2a2p-25),
+    (double2)(0x1.b90d700000000p-3, 0x1.4a498288117b0p-25),
+    (double2)(0x1.c0ae500000000p-3, 0x1.35da119afb324p-25),
+    (double2)(0x1.c84bf00000000p-3, 0x1.14e85cdb9a908p-24),
+    (double2)(0x1.cfe6500000000p-3, 0x1.38754e5547b9ap-25),
+    (double2)(0x1.d77d500000000p-3, 0x1.be40ae6ce3246p-24),
+    (double2)(0x1.df11000000000p-3, 0x1.0c993b3bea7e7p-24),
+    (double2)(0x1.e6a1400000000p-3, 0x1.1d2dd89ac3359p-24),
+    (double2)(0x1.ee2e100000000p-3, 0x1.1476603332c46p-25),
+    (double2)(0x1.f5b7500000000p-3, 0x1.f25901bac55b7p-24),
+    (double2)(0x1.fd3d100000000p-3, 0x1.f881b7c826e28p-24),
+    (double2)(0x1.025fa00000000p-2, 0x1.441996d698d20p-24),
+    (double2)(0x1.061ee00000000p-2, 0x1.407ac521ea089p-23),
+    (double2)(0x1.09dc500000000p-2, 0x1.2fb0c6c4b1723p-23),
+    (double2)(0x1.0d97e00000000p-2, 0x1.ca135966a3e18p-23),
+    (double2)(0x1.1151a00000000p-2, 0x1.b1218e4d646e4p-25),
+    (double2)(0x1.1509700000000p-2, 0x1.d4e72a350d288p-25),
+    (double2)(0x1.18bf500000000p-2, 0x1.4617e2f04c329p-23),
+    (double2)(0x1.1c73500000000p-2, 0x1.096ec41e82650p-25),
+    (double2)(0x1.2025500000000p-2, 0x1.9f91f25773e6ep-24),
+    (double2)(0x1.23d5600000000p-2, 0x1.59c0820f1d674p-25),
+    (double2)(0x1.2783700000000p-2, 0x1.02bf7a2df1064p-25),
+    (double2)(0x1.2b2f700000000p-2, 0x1.fb36bfc40508fp-23),
+    (double2)(0x1.2ed9800000000p-2, 0x1.ea08f3f8dc892p-24),
+    (double2)(0x1.3281800000000p-2, 0x1.3ed6254656a0ep-24),
+    (double2)(0x1.3627700000000p-2, 0x1.b83f5e5e69c58p-25),
+    (double2)(0x1.39cb400000000p-2, 0x1.d6ec2af768592p-23),
+    (double2)(0x1.3d6d100000000p-2, 0x1.493889a226f94p-25),
+    (double2)(0x1.410cb00000000p-2, 0x1.5ad8fa65279bap-23),
+    (double2)(0x1.44aa400000000p-2, 0x1.b615784d45434p-25),
+    (double2)(0x1.4845a00000000p-2, 0x1.09a184368f145p-23),
+    (double2)(0x1.4bdee00000000p-2, 0x1.61a2439b0d91cp-24),
+    (double2)(0x1.4f75f00000000p-2, 0x1.ce1a65e39a978p-24),
+    (double2)(0x1.530ad00000000p-2, 0x1.32a39a93b6a66p-23),
+    (double2)(0x1.569d800000000p-2, 0x1.1c3699af804e7p-23),
+    (double2)(0x1.5a2e000000000p-2, 0x1.75e0f4e44ede8p-26),
+    (double2)(0x1.5dbc300000000p-2, 0x1.f77ced1a7a83bp-23),
+    (double2)(0x1.6148400000000p-2, 0x1.84e7f0cb1b500p-29),
+    (double2)(0x1.64d1f00000000p-2, 0x1.ec6b838b02dfep-23),
+    (double2)(0x1.6859700000000p-2, 0x1.3ebf4dfbeda87p-23),
+    (double2)(0x1.6bdea00000000p-2, 0x1.9397aed9cb475p-23),
+    (double2)(0x1.6f61900000000p-2, 0x1.07937bc239c54p-24),
+    (double2)(0x1.72e2200000000p-2, 0x1.aa754553131b6p-23),
+    (double2)(0x1.7660700000000p-2, 0x1.4a05d407c45dcp-24),
+    (double2)(0x1.79dc600000000p-2, 0x1.132231a206dd0p-23),
+    (double2)(0x1.7d56000000000p-2, 0x1.2d8ecfdd69c88p-24),
+    (double2)(0x1.80cd400000000p-2, 0x1.a852c74218606p-24),
+    (double2)(0x1.8442200000000p-2, 0x1.71bf2baeebb50p-23),
+    (double2)(0x1.87b4b00000000p-2, 0x1.83d7db7491820p-27),
+    (double2)(0x1.8b24d00000000p-2, 0x1.ca50d92b6da14p-25),
+    (double2)(0x1.8e92900000000p-2, 0x1.6f5cde8530298p-26),
+    (double2)(0x1.91fde00000000p-2, 0x1.f343198910740p-24),
+    (double2)(0x1.9566d00000000p-2, 0x1.0e8d241ccd80ap-24),
+    (double2)(0x1.98cd500000000p-2, 0x1.1535ac619e6c8p-24),
+    (double2)(0x1.9c31600000000p-2, 0x1.7316041c36cd2p-24),
+    (double2)(0x1.9f93000000000p-2, 0x1.985a000637d8ep-24),
+    (double2)(0x1.a2f2300000000p-2, 0x1.f2f29858c0a68p-25),
+    (double2)(0x1.a64ee00000000p-2, 0x1.879847f96d909p-23),
+    (double2)(0x1.a9a9200000000p-2, 0x1.ab3d319e12e42p-23),
+    (double2)(0x1.ad00f00000000p-2, 0x1.5088162dfc4c2p-24),
+    (double2)(0x1.b056400000000p-2, 0x1.05749a1cd9d8cp-25),
+    (double2)(0x1.b3a9100000000p-2, 0x1.da65c6c6b8618p-26),
+    (double2)(0x1.b6f9600000000p-2, 0x1.739bf7df1ad64p-25),
+    (double2)(0x1.ba47300000000p-2, 0x1.bc31252aa3340p-25),
+    (double2)(0x1.bd92800000000p-2, 0x1.e528191ad3aa8p-26),
+    (double2)(0x1.c0db400000000p-2, 0x1.929d93df19f18p-23),
+    (double2)(0x1.c421900000000p-2, 0x1.ff11eb693a080p-26),
+    (double2)(0x1.c765500000000p-2, 0x1.55ae3f145a3a0p-27),
+    (double2)(0x1.caa6800000000p-2, 0x1.cbcd8c6c0ca82p-24),
+    (double2)(0x1.cde5300000000p-2, 0x1.0cb04d425d304p-24),
+    (double2)(0x1.d121500000000p-2, 0x1.9adfcab5be678p-24),
+    (double2)(0x1.d45ae00000000p-2, 0x1.93d90c5662508p-23),
+    (double2)(0x1.d791f00000000p-2, 0x1.68489bd35ff40p-24),
+    (double2)(0x1.dac6700000000p-2, 0x1.586ed3da2b7e0p-28),
+    (double2)(0x1.ddf8500000000p-2, 0x1.7604d2e850eeep-23),
+    (double2)(0x1.e127b00000000p-2, 0x1.ac1d12bfb53d8p-24),
+    (double2)(0x1.e454800000000p-2, 0x1.9b3d468274740p-28),
+    (double2)(0x1.e77eb00000000p-2, 0x1.fc5d68d10e53cp-24),
+    (double2)(0x1.eaa6500000000p-2, 0x1.8f9e51884becbp-23),
+    (double2)(0x1.edcb600000000p-2, 0x1.a87f0869c06d1p-23),
+    (double2)(0x1.f0ede00000000p-2, 0x1.31e7279f685fap-23),
+    (double2)(0x1.f40dd00000000p-2, 0x1.6a8282f9719b0p-27),
+    (double2)(0x1.f72b200000000p-2, 0x1.0d2724a8a44e0p-25),
+    (double2)(0x1.fa45d00000000p-2, 0x1.a60524b11ad4ep-23),
+    (double2)(0x1.fd5e000000000p-2, 0x1.75fdf832750f0p-26),
+    (double2)(0x1.0039c00000000p-1, 0x1.cf06902e4cd36p-23),
+    (double2)(0x1.01c3400000000p-1, 0x1.e82422d4f6d10p-25),
+    (double2)(0x1.034b700000000p-1, 0x1.24a091063e6c0p-26),
+    (double2)(0x1.04d2500000000p-1, 0x1.8a1a172dc6f38p-24),
+    (double2)(0x1.0657e00000000p-1, 0x1.29b6619f8a92dp-22),
+    (double2)(0x1.07dc300000000p-1, 0x1.9274d9c1b70c8p-24),
+    (double2)(0x1.095f300000000p-1, 0x1.0c34b1fbb7930p-26),
+    (double2)(0x1.0ae0e00000000p-1, 0x1.639866c20eb50p-25),
+    (double2)(0x1.0c61400000000p-1, 0x1.6d6d0f6832e9ep-23),
+    (double2)(0x1.0de0500000000p-1, 0x1.af54def99f25ep-22),
+    (double2)(0x1.0f5e200000000p-1, 0x1.16cfc52a00262p-22),
+    (double2)(0x1.10daa00000000p-1, 0x1.dcc1e83569c32p-23),
+    (double2)(0x1.1255d00000000p-1, 0x1.37f7a551ed425p-22),
+    (double2)(0x1.13cfb00000000p-1, 0x1.f6360adc98887p-22),
+    (double2)(0x1.1548500000000p-1, 0x1.2c6ec8d35a2c1p-22),
+    (double2)(0x1.16bfa00000000p-1, 0x1.bd44df84cb036p-23),
+    (double2)(0x1.1835a00000000p-1, 0x1.117cf826e310ep-22),
+    (double2)(0x1.19aa500000000p-1, 0x1.ca533f332cfc9p-22),
+    (double2)(0x1.1b1dc00000000p-1, 0x1.0f208509dbc2ep-22),
+    (double2)(0x1.1c8fe00000000p-1, 0x1.cd07d93c945dep-23),
+    (double2)(0x1.1e00b00000000p-1, 0x1.57bdfd67e6d72p-22),
+    (double2)(0x1.1f70400000000p-1, 0x1.aab89c516c658p-24),
+    (double2)(0x1.20de800000000p-1, 0x1.3e823b1a1b8a0p-25),
+    (double2)(0x1.224b700000000p-1, 0x1.307464a9d6d3cp-23),
+    (double2)(0x1.23b7100000000p-1, 0x1.c5993cd438843p-22),
+    (double2)(0x1.2521700000000p-1, 0x1.ba2fca02ab554p-22),
+    (double2)(0x1.268a900000000p-1, 0x1.01a5b6983a268p-23),
+    (double2)(0x1.27f2600000000p-1, 0x1.273d1b350efc8p-25),
+    (double2)(0x1.2958e00000000p-1, 0x1.64c238c37b0c6p-23),
+    (double2)(0x1.2abe200000000p-1, 0x1.aded07370a300p-25),
+    (double2)(0x1.2c22100000000p-1, 0x1.78091197eb47ep-23),
+    (double2)(0x1.2d84c00000000p-1, 0x1.4b0f245e0dabcp-24),
+    (double2)(0x1.2ee6200000000p-1, 0x1.080d9794e2eafp-22),
+    (double2)(0x1.3046400000000p-1, 0x1.d4ec242b60c76p-23),
+    (double2)(0x1.31a5200000000p-1, 0x1.221d2f940caa0p-27),
+    (double2)(0x1.3302b00000000p-1, 0x1.cdbc42b2bba5cp-24),
+    (double2)(0x1.345f000000000p-1, 0x1.cce37bb440840p-25),
+    (double2)(0x1.35ba000000000p-1, 0x1.6c1d999cf1dd0p-22),
+    (double2)(0x1.3713d00000000p-1, 0x1.bed8a07eb0870p-26),
+    (double2)(0x1.386c500000000p-1, 0x1.69ed88f490e3cp-24),
+    (double2)(0x1.39c3900000000p-1, 0x1.cd41719b73ef0p-25),
+    (double2)(0x1.3b19800000000p-1, 0x1.cbc4ac95b41b7p-22),
+    (double2)(0x1.3c6e400000000p-1, 0x1.238f1b890f5d7p-22),
+    (double2)(0x1.3dc1c00000000p-1, 0x1.50c4282259cc4p-24),
+    (double2)(0x1.3f13f00000000p-1, 0x1.713d2de87b3e2p-22),
+    (double2)(0x1.4064f00000000p-1, 0x1.1d5a7d2255276p-23),
+    (double2)(0x1.41b4a00000000p-1, 0x1.c0dfd48227ac1p-22),
+    (double2)(0x1.4303200000000p-1, 0x1.1c964dab76753p-22),
+    (double2)(0x1.4450600000000p-1, 0x1.6de56d5704496p-23),
+    (double2)(0x1.459c600000000p-1, 0x1.4aeb71fd19968p-23),
+    (double2)(0x1.46e7200000000p-1, 0x1.fbf91c57b1918p-23),
+    (double2)(0x1.4830a00000000p-1, 0x1.d6bef7fbe5d9ap-22),
+    (double2)(0x1.4978f00000000p-1, 0x1.464d3dc249066p-22),
+    (double2)(0x1.4ac0000000000p-1, 0x1.638e2ec4d9073p-22),
+    (double2)(0x1.4c05e00000000p-1, 0x1.16f4a7247ea7cp-24),
+    (double2)(0x1.4d4a800000000p-1, 0x1.1a0a740f1d440p-28),
+    (double2)(0x1.4e8de00000000p-1, 0x1.6edbb0114a33cp-23),
+    (double2)(0x1.4fd0100000000p-1, 0x1.dbee8bf1d513cp-24),
+    (double2)(0x1.5111000000000p-1, 0x1.5b8bdb0248f73p-22),
+    (double2)(0x1.5250c00000000p-1, 0x1.7de3d3f5eac64p-22),
+    (double2)(0x1.538f500000000p-1, 0x1.ee24187ae448ap-23),
+    (double2)(0x1.54cca00000000p-1, 0x1.e06c591ec5192p-22),
+    (double2)(0x1.5608d00000000p-1, 0x1.4e3861a332738p-24),
+    (double2)(0x1.5743c00000000p-1, 0x1.a9599dcc2bfe4p-24),
+    (double2)(0x1.587d800000000p-1, 0x1.f732fbad43468p-25),
+    (double2)(0x1.59b6000000000p-1, 0x1.eb9f573b727d9p-22),
+    (double2)(0x1.5aed600000000p-1, 0x1.8b212a2eb9897p-22),
+    (double2)(0x1.5c23900000000p-1, 0x1.384884c167215p-22),
+    (double2)(0x1.5d58900000000p-1, 0x1.0e2d363020051p-22),
+    (double2)(0x1.5e8c600000000p-1, 0x1.2820879fbd022p-22),
+    (double2)(0x1.5fbf000000000p-1, 0x1.a1ab9893e4b30p-22),
+    (double2)(0x1.60f0800000000p-1, 0x1.2d1b817a24478p-23),
+    (double2)(0x1.6220d00000000p-1, 0x1.15d7b8ded4878p-25),
+    (double2)(0x1.634ff00000000p-1, 0x1.8968f9db3a5e4p-24),
+    (double2)(0x1.647de00000000p-1, 0x1.71c4171fe135fp-22),
+    (double2)(0x1.65aab00000000p-1, 0x1.6d80f605d0d8cp-22),
+    (double2)(0x1.66d6600000000p-1, 0x1.c91f043691590p-24),
+    (double2)(0x1.6800e00000000p-1, 0x1.39f8a15fce2b2p-23),
+    (double2)(0x1.692a400000000p-1, 0x1.55beda9d94b80p-27),
+    (double2)(0x1.6a52700000000p-1, 0x1.b12c15d60949ap-23),
+    (double2)(0x1.6b79800000000p-1, 0x1.24167b312bfe3p-22),
+    (double2)(0x1.6c9f700000000p-1, 0x1.0ab8633070277p-22),
+    (double2)(0x1.6dc4400000000p-1, 0x1.54554ebbc80eep-23),
+    (double2)(0x1.6ee7f00000000p-1, 0x1.0204aef5a4bb8p-25),
+    (double2)(0x1.700a700000000p-1, 0x1.8af08c679cf2cp-22),
+    (double2)(0x1.712be00000000p-1, 0x1.0852a330ae6c8p-22),
+    (double2)(0x1.724c300000000p-1, 0x1.6d3eb9ec32916p-23),
+    (double2)(0x1.736b600000000p-1, 0x1.685cb7fcbbafep-23),
+    (double2)(0x1.7489700000000p-1, 0x1.1f751c1e0bd95p-22),
+    (double2)(0x1.75a6700000000p-1, 0x1.705b1b0f72560p-26),
+    (double2)(0x1.76c2400000000p-1, 0x1.b98d8d808ca92p-22),
+    (double2)(0x1.77dd100000000p-1, 0x1.2ea22c75cc980p-25),
+    (double2)(0x1.78f6b00000000p-1, 0x1.7aba62bca0350p-22),
+    (double2)(0x1.7a0f400000000p-1, 0x1.d73833442278cp-22),
+    (double2)(0x1.7b26c00000000p-1, 0x1.5a5ca1fb18bf9p-22),
+    (double2)(0x1.7c3d300000000p-1, 0x1.1a6092b6ecf28p-25),
+    (double2)(0x1.7d52800000000p-1, 0x1.44fd049aac104p-24),
+    (double2)(0x1.7e66c00000000p-1, 0x1.c114fd8df5180p-29),
+    (double2)(0x1.7f79e00000000p-1, 0x1.5972f130feae5p-22),
+    (double2)(0x1.808c000000000p-1, 0x1.ca034a55fe198p-24),
+    (double2)(0x1.819d000000000p-1, 0x1.6e2b149990227p-22),
+    (double2)(0x1.82ad000000000p-1, 0x1.b00000294592cp-24),
+    (double2)(0x1.83bbe00000000p-1, 0x1.8b9bdc442620ep-22),
+    (double2)(0x1.84c9c00000000p-1, 0x1.d94fdfabf3e4ep-23),
+    (double2)(0x1.85d6900000000p-1, 0x1.5db30b145ad9ap-23),
+    (double2)(0x1.86e2500000000p-1, 0x1.e3e1eb95022b0p-23),
+    (double2)(0x1.87ed000000000p-1, 0x1.d5b8b45442bd6p-22),
+    (double2)(0x1.88f6b00000000p-1, 0x1.7a046231ecd2ep-22),
+    (double2)(0x1.89ff500000000p-1, 0x1.feafe3ef55232p-22),
+    (double2)(0x1.8b06f00000000p-1, 0x1.839e7bfd78267p-22),
+    (double2)(0x1.8c0d900000000p-1, 0x1.45cf49d6fa900p-25),
+    (double2)(0x1.8d13200000000p-1, 0x1.be3132b27f380p-27),
+    (double2)(0x1.8e17a00000000p-1, 0x1.533980bb84f9fp-22),
+    (double2)(0x1.8f1b300000000p-1, 0x1.889e2ce3ba390p-26),
+    (double2)(0x1.901db00000000p-1, 0x1.f7778c3ad0cc8p-24),
+    (double2)(0x1.911f300000000p-1, 0x1.46660cec4eba2p-23),
+    (double2)(0x1.921fb00000000p-1, 0x1.5110b4611a626p-23),
+};
+
+DECLARE_VTABLE(double2, TWO_TO_JBY64_EP, 64) = {
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.02c9a30000000p+0, 0x1.cef00c1dcdef9p-25),
+    (double2)(0x1.059b0d0000000p+0, 0x1.8ac2ba1d73e2ap-27),
+    (double2)(0x1.0874510000000p+0, 0x1.0eb37901186bep-25),
+    (double2)(0x1.0b55860000000p+0, 0x1.9f3121ec53172p-25),
+    (double2)(0x1.0e3ec30000000p+0, 0x1.69e8d10103a17p-27),
+    (double2)(0x1.11301d0000000p+0, 0x1.25b50a4ebbf1ap-32),
+    (double2)(0x1.1429aa0000000p+0, 0x1.d525bbf668203p-25),
+    (double2)(0x1.172b830000000p+0, 0x1.8faa2f5b9bef9p-25),
+    (double2)(0x1.1a35be0000000p+0, 0x1.6df96ea796d31p-25),
+    (double2)(0x1.1d48730000000p+0, 0x1.68b9aa7805b80p-28),
+    (double2)(0x1.2063b80000000p+0, 0x1.0c519ac771dd6p-25),
+    (double2)(0x1.2387a60000000p+0, 0x1.ceac470cd83f5p-25),
+    (double2)(0x1.26b4560000000p+0, 0x1.789f37495e99cp-26),
+    (double2)(0x1.29e9df0000000p+0, 0x1.47f7b84b09745p-26),
+    (double2)(0x1.2d285a0000000p+0, 0x1.b900c2d002475p-26),
+    (double2)(0x1.306fe00000000p+0, 0x1.4636e2a5bd1abp-25),
+    (double2)(0x1.33c08b0000000p+0, 0x1.320b7fa64e430p-27),
+    (double2)(0x1.371a730000000p+0, 0x1.ceaa72a9c5154p-26),
+    (double2)(0x1.3a7db30000000p+0, 0x1.3967fdba86f24p-26),
+    (double2)(0x1.3dea640000000p+0, 0x1.82468446b6824p-25),
+    (double2)(0x1.4160a20000000p+0, 0x1.f72e29f84325bp-28),
+    (double2)(0x1.44e0860000000p+0, 0x1.8624b40c4dbd0p-30),
+    (double2)(0x1.486a2b0000000p+0, 0x1.704f3404f068ep-26),
+    (double2)(0x1.4bfdad0000000p+0, 0x1.4d8a89c750e5ep-26),
+    (double2)(0x1.4f9b270000000p+0, 0x1.a74b29ab4cf62p-26),
+    (double2)(0x1.5342b50000000p+0, 0x1.a753e077c2a0fp-26),
+    (double2)(0x1.56f4730000000p+0, 0x1.ad49f699bb2c0p-26),
+    (double2)(0x1.5ab07d0000000p+0, 0x1.a90a852b19260p-25),
+    (double2)(0x1.5e76f10000000p+0, 0x1.6b48521ba6f93p-26),
+    (double2)(0x1.6247eb0000000p+0, 0x1.d2ac258f87d03p-31),
+    (double2)(0x1.6623880000000p+0, 0x1.2a91124893ecfp-27),
+    (double2)(0x1.6a09e60000000p+0, 0x1.9fcef32422cbep-26),
+    (double2)(0x1.6dfb230000000p+0, 0x1.8ca345de441c5p-25),
+    (double2)(0x1.71f75e0000000p+0, 0x1.1d8bee7ba46e1p-25),
+    (double2)(0x1.75feb50000000p+0, 0x1.9099f22fdba6ap-26),
+    (double2)(0x1.7a11470000000p+0, 0x1.f580c36bea881p-27),
+    (double2)(0x1.7e2f330000000p+0, 0x1.b3d398841740ap-26),
+    (double2)(0x1.8258990000000p+0, 0x1.2999c25159f11p-25),
+    (double2)(0x1.868d990000000p+0, 0x1.68925d901c83bp-25),
+    (double2)(0x1.8ace540000000p+0, 0x1.15506dadd3e2ap-27),
+    (double2)(0x1.8f1ae90000000p+0, 0x1.22aee6c57304ep-25),
+    (double2)(0x1.93737b0000000p+0, 0x1.9b8bc9e8a0387p-29),
+    (double2)(0x1.97d8290000000p+0, 0x1.fbc9c9f173d24p-25),
+    (double2)(0x1.9c49180000000p+0, 0x1.51f8480e3e235p-27),
+    (double2)(0x1.a0c6670000000p+0, 0x1.6bbcac96535b5p-25),
+    (double2)(0x1.a5503b0000000p+0, 0x1.1f12ae45a1224p-27),
+    (double2)(0x1.a9e6b50000000p+0, 0x1.5e7f6fd0fac90p-26),
+    (double2)(0x1.ae89f90000000p+0, 0x1.2b5a75abd0e69p-25),
+    (double2)(0x1.b33a2b0000000p+0, 0x1.09e2bf5ed7fa1p-25),
+    (double2)(0x1.b7f76f0000000p+0, 0x1.7daf237553d84p-27),
+    (double2)(0x1.bcc1e90000000p+0, 0x1.2f074891ee83dp-30),
+    (double2)(0x1.c199bd0000000p+0, 0x1.b0aa538444196p-25),
+    (double2)(0x1.c67f120000000p+0, 0x1.cafa29694426fp-25),
+    (double2)(0x1.cb720d0000000p+0, 0x1.9df20d22a0797p-25),
+    (double2)(0x1.d072d40000000p+0, 0x1.40f12f71a1e45p-25),
+    (double2)(0x1.d5818d0000000p+0, 0x1.9f7490e4bb40bp-25),
+    (double2)(0x1.da9e600000000p+0, 0x1.ed9942b84600dp-27),
+    (double2)(0x1.dfc9730000000p+0, 0x1.bdcdaf5cb4656p-27),
+    (double2)(0x1.e502ee0000000p+0, 0x1.e2cffd89cf44cp-26),
+    (double2)(0x1.ea4afa0000000p+0, 0x1.52486cc2c7b9dp-27),
+    (double2)(0x1.efa1be0000000p+0, 0x1.cc2b44eee3fa4p-25),
+    (double2)(0x1.f507650000000p+0, 0x1.6dc8a80ce9f09p-25),
+    (double2)(0x1.fa7c180000000p+0, 0x1.9e90d82e90a7ep-28)
+};
+
+
+DECLARE_VTABLE(double2, SINH_TBL, 37) = {
+    (double2)(0x0.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.2cd9fc0000000p+0, 0x1.13ae6096a0092p-26),
+    (double2)(0x1.d03cf60000000p+1, 0x1.db70cfb79a640p-26),
+    (double2)(0x1.40926e0000000p+3, 0x1.c2526b66dc067p-23),
+    (double2)(0x1.b4a3800000000p+4, 0x1.b81b18647f380p-23),
+    (double2)(0x1.28d0160000000p+6, 0x1.bc1cdd1e1eb08p-20),
+    (double2)(0x1.936d228000000p+7, 0x1.d9f201534fb09p-19),
+    (double2)(0x1.1228768000000p+9, 0x1.d1c064a4e9954p-18),
+    (double2)(0x1.749ea50000000p+10, 0x1.4eca65d06ea74p-18),
+    (double2)(0x1.fa71570000000p+11, 0x1.0c259bcc0ecc5p-15),
+    (double2)(0x1.5829dc8000000p+13, 0x1.b5a6647cf9016p-13),
+    (double2)(0x1.d3c4488000000p+14, 0x1.9691adefb0870p-15),
+    (double2)(0x1.3de1650000000p+16, 0x1.3410fc29cde38p-10),
+    (double2)(0x1.b00b590000000p+17, 0x1.6a31a50b6fb3cp-11),
+    (double2)(0x1.259ac48000000p+19, 0x1.7defc71805c40p-10),
+    (double2)(0x1.8f0cca8000000p+20, 0x1.eb49fd80e0babp-6),
+    (double2)(0x1.0f2ebd0000000p+22, 0x1.4fffc7bcd5920p-7),
+    (double2)(0x1.7093488000000p+23, 0x1.03a93b6c63435p-3),
+    (double2)(0x1.f4f2208000000p+24, 0x1.1940bb255fd1cp-4),
+    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e14260b50p-2),
+    (double2)(0x1.ceb0888000000p+27, 0x1.b47401fc9f2a2p+0),
+    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55634f1p+3),
+    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff8194ddcp+2),
+    (double2)(0x1.226af30000000p+32, 0x1.d8fee052ba63ap+5),
+    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde3f6p+7),
+    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557d1ap+8),
+    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9dc4p+8),
+    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abap+11),
+    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
+    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6363p+14),
+    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15),
+    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
+    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
+    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
+    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
+    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
+    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22),
+};
+
+DECLARE_VTABLE(double2, COSH_TBL, 37) = {
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.8b07550000000p+0, 0x1.d9f5504c2bd28p-28),
+    (double2)(0x1.e18fa08000000p+1, 0x1.7cb66f0a4c9fdp-25),
+    (double2)(0x1.422a490000000p+3, 0x1.f58617928e588p-23),
+    (double2)(0x1.b4ee858000000p+4, 0x1.bc7d000c38d48p-25),
+    (double2)(0x1.28d6fc8000000p+6, 0x1.f7f9d4e329998p-21),
+    (double2)(0x1.936e678000000p+7, 0x1.6e6e464885269p-19),
+    (double2)(0x1.1228948000000p+9, 0x1.ba3a8b946c154p-19),
+    (double2)(0x1.749eaa8000000p+10, 0x1.3f4e76110d5a4p-18),
+    (double2)(0x1.fa71580000000p+11, 0x1.17622515a3e2bp-15),
+    (double2)(0x1.5829dd0000000p+13, 0x1.4dc4b528af3d0p-17),
+    (double2)(0x1.d3c4488000000p+14, 0x1.1156278615e10p-14),
+    (double2)(0x1.3de1650000000p+16, 0x1.35ad50ed821f5p-10),
+    (double2)(0x1.b00b590000000p+17, 0x1.6b61055f2935cp-11),
+    (double2)(0x1.259ac48000000p+19, 0x1.7e2794a601240p-10),
+    (double2)(0x1.8f0cca8000000p+20, 0x1.eb4b45f6aadd3p-6),
+    (double2)(0x1.0f2ebd0000000p+22, 0x1.5000b967b3698p-7),
+    (double2)(0x1.7093488000000p+23, 0x1.03a940fadc092p-3),
+    (double2)(0x1.f4f2208000000p+24, 0x1.1940bf3bf874cp-4),
+    (double2)(0x1.546d8f8000000p+26, 0x1.ed26e1a2a2110p-2),
+    (double2)(0x1.ceb0888000000p+27, 0x1.b4740205796d6p+0),
+    (double2)(0x1.3a6e1f8000000p+29, 0x1.67bb3f55cb85dp+3),
+    (double2)(0x1.ab5adb8000000p+30, 0x1.c435ff81e18acp+2),
+    (double2)(0x1.226af30000000p+32, 0x1.d8fee052bdea4p+5),
+    (double2)(0x1.8ab7fb0000000p+33, 0x1.51d7edccde926p+7),
+    (double2)(0x1.0c3d390000000p+35, 0x1.04b1644557e0ep+8),
+    (double2)(0x1.6c93268000000p+36, 0x1.6a6b5ca0a9e1cp+8),
+    (double2)(0x1.ef822f0000000p+37, 0x1.fd9cc72249abep+11),
+    (double2)(0x1.50bba30000000p+39, 0x1.e58de693edab5p+13),
+    (double2)(0x1.c9aae40000000p+40, 0x1.8c70158ac6364p+14),
+    (double2)(0x1.3704708000000p+42, 0x1.7614764f43e20p+15),
+    (double2)(0x1.a6b7658000000p+43, 0x1.6337db36fc718p+17),
+    (double2)(0x1.1f43fc8000000p+45, 0x1.12d98b1f611e2p+19),
+    (double2)(0x1.866f348000000p+46, 0x1.392bc108b37ccp+19),
+    (double2)(0x1.0953e28000000p+48, 0x1.ce87bdc3473dcp+22),
+    (double2)(0x1.689e220000000p+49, 0x1.bc8d5ae99ad14p+21),
+    (double2)(0x1.ea215a0000000p+50, 0x1.d20d76744835cp+22)
+};
+
+DECLARE_VTABLE(double, CBRT_INV_TBL, 257) = {
+    0x1.0000000000000p+1,
+    0x1.fe01fe01fe020p+0,
+    0x1.fc07f01fc07f0p+0,
+    0x1.fa11caa01fa12p+0,
+    0x1.f81f81f81f820p+0,
+    0x1.f6310aca0dbb5p+0,
+    0x1.f44659e4a4271p+0,
+    0x1.f25f644230ab5p+0,
+    0x1.f07c1f07c1f08p+0,
+    0x1.ee9c7f8458e02p+0,
+    0x1.ecc07b301ecc0p+0,
+    0x1.eae807aba01ebp+0,
+    0x1.e9131abf0b767p+0,
+    0x1.e741aa59750e4p+0,
+    0x1.e573ac901e574p+0,
+    0x1.e3a9179dc1a73p+0,
+    0x1.e1e1e1e1e1e1ep+0,
+    0x1.e01e01e01e01ep+0,
+    0x1.de5d6e3f8868ap+0,
+    0x1.dca01dca01dcap+0,
+    0x1.dae6076b981dbp+0,
+    0x1.d92f2231e7f8ap+0,
+    0x1.d77b654b82c34p+0,
+    0x1.d5cac807572b2p+0,
+    0x1.d41d41d41d41dp+0,
+    0x1.d272ca3fc5b1ap+0,
+    0x1.d0cb58f6ec074p+0,
+    0x1.cf26e5c44bfc6p+0,
+    0x1.cd85689039b0bp+0,
+    0x1.cbe6d9601cbe7p+0,
+    0x1.ca4b3055ee191p+0,
+    0x1.c8b265afb8a42p+0,
+    0x1.c71c71c71c71cp+0,
+    0x1.c5894d10d4986p+0,
+    0x1.c3f8f01c3f8f0p+0,
+    0x1.c26b5392ea01cp+0,
+    0x1.c0e070381c0e0p+0,
+    0x1.bf583ee868d8bp+0,
+    0x1.bdd2b899406f7p+0,
+    0x1.bc4fd65883e7bp+0,
+    0x1.bacf914c1bad0p+0,
+    0x1.b951e2b18ff23p+0,
+    0x1.b7d6c3dda338bp+0,
+    0x1.b65e2e3beee05p+0,
+    0x1.b4e81b4e81b4fp+0,
+    0x1.b37484ad806cep+0,
+    0x1.b2036406c80d9p+0,
+    0x1.b094b31d922a4p+0,
+    0x1.af286bca1af28p+0,
+    0x1.adbe87f94905ep+0,
+    0x1.ac5701ac5701bp+0,
+    0x1.aaf1d2f87ebfdp+0,
+    0x1.a98ef606a63bep+0,
+    0x1.a82e65130e159p+0,
+    0x1.a6d01a6d01a6dp+0,
+    0x1.a574107688a4ap+0,
+    0x1.a41a41a41a41ap+0,
+    0x1.a2c2a87c51ca0p+0,
+    0x1.a16d3f97a4b02p+0,
+    0x1.a01a01a01a01ap+0,
+    0x1.9ec8e951033d9p+0,
+    0x1.9d79f176b682dp+0,
+    0x1.9c2d14ee4a102p+0,
+    0x1.9ae24ea5510dap+0,
+    0x1.999999999999ap+0,
+    0x1.9852f0d8ec0ffp+0,
+    0x1.970e4f80cb872p+0,
+    0x1.95cbb0be377aep+0,
+    0x1.948b0fcd6e9e0p+0,
+    0x1.934c67f9b2ce6p+0,
+    0x1.920fb49d0e229p+0,
+    0x1.90d4f120190d5p+0,
+    0x1.8f9c18f9c18fap+0,
+    0x1.8e6527af1373fp+0,
+    0x1.8d3018d3018d3p+0,
+    0x1.8bfce8062ff3ap+0,
+    0x1.8acb90f6bf3aap+0,
+    0x1.899c0f601899cp+0,
+    0x1.886e5f0abb04ap+0,
+    0x1.87427bcc092b9p+0,
+    0x1.8618618618618p+0,
+    0x1.84f00c2780614p+0,
+    0x1.83c977ab2beddp+0,
+    0x1.82a4a0182a4a0p+0,
+    0x1.8181818181818p+0,
+    0x1.8060180601806p+0,
+    0x1.7f405fd017f40p+0,
+    0x1.7e225515a4f1dp+0,
+    0x1.7d05f417d05f4p+0,
+    0x1.7beb3922e017cp+0,
+    0x1.7ad2208e0ecc3p+0,
+    0x1.79baa6bb6398bp+0,
+    0x1.78a4c8178a4c8p+0,
+    0x1.77908119ac60dp+0,
+    0x1.767dce434a9b1p+0,
+    0x1.756cac201756dp+0,
+    0x1.745d1745d1746p+0,
+    0x1.734f0c541fe8dp+0,
+    0x1.724287f46debcp+0,
+    0x1.713786d9c7c09p+0,
+    0x1.702e05c0b8170p+0,
+    0x1.6f26016f26017p+0,
+    0x1.6e1f76b4337c7p+0,
+    0x1.6d1a62681c861p+0,
+    0x1.6c16c16c16c17p+0,
+    0x1.6b1490aa31a3dp+0,
+    0x1.6a13cd1537290p+0,
+    0x1.691473a88d0c0p+0,
+    0x1.6816816816817p+0,
+    0x1.6719f3601671ap+0,
+    0x1.661ec6a5122f9p+0,
+    0x1.6524f853b4aa3p+0,
+    0x1.642c8590b2164p+0,
+    0x1.63356b88ac0dep+0,
+    0x1.623fa77016240p+0,
+    0x1.614b36831ae94p+0,
+    0x1.6058160581606p+0,
+    0x1.5f66434292dfcp+0,
+    0x1.5e75bb8d015e7p+0,
+    0x1.5d867c3ece2a5p+0,
+    0x1.5c9882b931057p+0,
+    0x1.5babcc647fa91p+0,
+    0x1.5ac056b015ac0p+0,
+    0x1.59d61f123ccaap+0,
+    0x1.58ed2308158edp+0,
+    0x1.5805601580560p+0,
+    0x1.571ed3c506b3ap+0,
+    0x1.56397ba7c52e2p+0,
+    0x1.5555555555555p+0,
+    0x1.54725e6bb82fep+0,
+    0x1.5390948f40febp+0,
+    0x1.52aff56a8054bp+0,
+    0x1.51d07eae2f815p+0,
+    0x1.50f22e111c4c5p+0,
+    0x1.5015015015015p+0,
+    0x1.4f38f62dd4c9bp+0,
+    0x1.4e5e0a72f0539p+0,
+    0x1.4d843bedc2c4cp+0,
+    0x1.4cab88725af6ep+0,
+    0x1.4bd3edda68fe1p+0,
+    0x1.4afd6a052bf5bp+0,
+    0x1.4a27fad76014ap+0,
+    0x1.49539e3b2d067p+0,
+    0x1.4880522014880p+0,
+    0x1.47ae147ae147bp+0,
+    0x1.46dce34596066p+0,
+    0x1.460cbc7f5cf9ap+0,
+    0x1.453d9e2c776cap+0,
+    0x1.446f86562d9fbp+0,
+    0x1.43a2730abee4dp+0,
+    0x1.42d6625d51f87p+0,
+    0x1.420b5265e5951p+0,
+    0x1.4141414141414p+0,
+    0x1.40782d10e6566p+0,
+    0x1.3fb013fb013fbp+0,
+    0x1.3ee8f42a5af07p+0,
+    0x1.3e22cbce4a902p+0,
+    0x1.3d5d991aa75c6p+0,
+    0x1.3c995a47babe7p+0,
+    0x1.3bd60d9232955p+0,
+    0x1.3b13b13b13b14p+0,
+    0x1.3a524387ac822p+0,
+    0x1.3991c2c187f63p+0,
+    0x1.38d22d366088ep+0,
+    0x1.3813813813814p+0,
+    0x1.3755bd1c945eep+0,
+    0x1.3698df3de0748p+0,
+    0x1.35dce5f9f2af8p+0,
+    0x1.3521cfb2b78c1p+0,
+    0x1.34679ace01346p+0,
+    0x1.33ae45b57bcb2p+0,
+    0x1.32f5ced6a1dfap+0,
+    0x1.323e34a2b10bfp+0,
+    0x1.3187758e9ebb6p+0,
+    0x1.30d190130d190p+0,
+    0x1.301c82ac40260p+0,
+    0x1.2f684bda12f68p+0,
+    0x1.2eb4ea1fed14bp+0,
+    0x1.2e025c04b8097p+0,
+    0x1.2d50a012d50a0p+0,
+    0x1.2c9fb4d812ca0p+0,
+    0x1.2bef98e5a3711p+0,
+    0x1.2b404ad012b40p+0,
+    0x1.2a91c92f3c105p+0,
+    0x1.29e4129e4129ep+0,
+    0x1.293725bb804a5p+0,
+    0x1.288b01288b013p+0,
+    0x1.27dfa38a1ce4dp+0,
+    0x1.27350b8812735p+0,
+    0x1.268b37cd60127p+0,
+    0x1.25e22708092f1p+0,
+    0x1.2539d7e9177b2p+0,
+    0x1.2492492492492p+0,
+    0x1.23eb79717605bp+0,
+    0x1.23456789abcdfp+0,
+    0x1.22a0122a0122ap+0,
+    0x1.21fb78121fb78p+0,
+    0x1.21579804855e6p+0,
+    0x1.20b470c67c0d9p+0,
+    0x1.2012012012012p+0,
+    0x1.1f7047dc11f70p+0,
+    0x1.1ecf43c7fb84cp+0,
+    0x1.1e2ef3b3fb874p+0,
+    0x1.1d8f5672e4abdp+0,
+    0x1.1cf06ada2811dp+0,
+    0x1.1c522fc1ce059p+0,
+    0x1.1bb4a4046ed29p+0,
+    0x1.1b17c67f2bae3p+0,
+    0x1.1a7b9611a7b96p+0,
+    0x1.19e0119e0119ep+0,
+    0x1.19453808ca29cp+0,
+    0x1.18ab083902bdbp+0,
+    0x1.1811811811812p+0,
+    0x1.1778a191bd684p+0,
+    0x1.16e0689427379p+0,
+    0x1.1648d50fc3201p+0,
+    0x1.15b1e5f75270dp+0,
+    0x1.151b9a3fdd5c9p+0,
+    0x1.1485f0e0acd3bp+0,
+    0x1.13f0e8d344724p+0,
+    0x1.135c81135c811p+0,
+    0x1.12c8b89edc0acp+0,
+    0x1.12358e75d3033p+0,
+    0x1.11a3019a74826p+0,
+    0x1.1111111111111p+0,
+    0x1.107fbbe011080p+0,
+    0x1.0fef010fef011p+0,
+    0x1.0f5edfab325a2p+0,
+    0x1.0ecf56be69c90p+0,
+    0x1.0e40655826011p+0,
+    0x1.0db20a88f4696p+0,
+    0x1.0d24456359e3ap+0,
+    0x1.0c9714fbcda3bp+0,
+    0x1.0c0a7868b4171p+0,
+    0x1.0b7e6ec259dc8p+0,
+    0x1.0af2f722eecb5p+0,
+    0x1.0a6810a6810a7p+0,
+    0x1.09ddba6af8360p+0,
+    0x1.0953f39010954p+0,
+    0x1.08cabb37565e2p+0,
+    0x1.0842108421084p+0,
+    0x1.07b9f29b8eae2p+0,
+    0x1.073260a47f7c6p+0,
+    0x1.06ab59c7912fbp+0,
+    0x1.0624dd2f1a9fcp+0,
+    0x1.059eea0727586p+0,
+    0x1.05197f7d73404p+0,
+    0x1.04949cc1664c5p+0,
+    0x1.0410410410410p+0,
+    0x1.038c6b78247fcp+0,
+    0x1.03091b51f5e1ap+0,
+    0x1.02864fc7729e9p+0,
+    0x1.0204081020408p+0,
+    0x1.0182436517a37p+0,
+    0x1.0101010101010p+0,
+    0x1.0080402010080p+0,
+    0x1.0000000000000p+0
+};
+
+DECLARE_VTABLE(double2, CBRT_DBL_TBL, 257) = {
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.0055380000000p+0, 0x1.e6a24c81e4294p-25),
+    (double2)(0x1.00aa390000000p+0, 0x1.8548511e3a785p-26),
+    (double2)(0x1.00ff010000000p+0, 0x1.4eb9336ec07f6p-25),
+    (double2)(0x1.0153920000000p+0, 0x1.0ea64b8b750e1p-27),
+    (double2)(0x1.01a7eb0000000p+0, 0x1.61637cff8a53cp-27),
+    (double2)(0x1.01fc0d0000000p+0, 0x1.0733bf7bd1943p-27),
+    (double2)(0x1.024ff80000000p+0, 0x1.666911345ccedp-26),
+    (double2)(0x1.02a3ad0000000p+0, 0x1.77b7a3f592f14p-27),
+    (double2)(0x1.02f72b0000000p+0, 0x1.f18d3dd1a5402p-25),
+    (double2)(0x1.034a750000000p+0, 0x1.be2f5a58ee9a4p-29),
+    (double2)(0x1.039d880000000p+0, 0x1.8901f8f085fa7p-25),
+    (double2)(0x1.03f0670000000p+0, 0x1.c68b8cd5b5d69p-26),
+    (double2)(0x1.0443110000000p+0, 0x1.a6b0e8624be42p-26),
+    (double2)(0x1.0495870000000p+0, 0x1.c4b22b06f68e7p-36),
+    (double2)(0x1.04e7c80000000p+0, 0x1.0f3f0afcabe9bp-25),
+    (double2)(0x1.0539d60000000p+0, 0x1.48495bca4e1b7p-26),
+    (double2)(0x1.058bb00000000p+0, 0x1.6107f1abdfdc3p-25),
+    (double2)(0x1.05dd570000000p+0, 0x1.e67261878288ap-25),
+    (double2)(0x1.062ecc0000000p+0, 0x1.a6bc155286f1ep-26),
+    (double2)(0x1.06800e0000000p+0, 0x1.8a759c64a85f2p-26),
+    (double2)(0x1.06d11e0000000p+0, 0x1.5fce70a4a8d09p-27),
+    (double2)(0x1.0721fc0000000p+0, 0x1.2f9cbf373fe1dp-28),
+    (double2)(0x1.0772a80000000p+0, 0x1.90564ce4ac359p-26),
+    (double2)(0x1.07c3230000000p+0, 0x1.ac29ce761b02fp-26),
+    (double2)(0x1.08136d0000000p+0, 0x1.cb752f497381cp-26),
+    (double2)(0x1.0863860000000p+0, 0x1.8bb9e1cfb35e0p-25),
+    (double2)(0x1.08b36f0000000p+0, 0x1.5b4917099de90p-25),
+    (double2)(0x1.0903280000000p+0, 0x1.cc77ac9c65ef2p-26),
+    (double2)(0x1.0952b10000000p+0, 0x1.7a0f3e7be3dbap-26),
+    (double2)(0x1.09a20a0000000p+0, 0x1.6ec851ee0c16fp-25),
+    (double2)(0x1.09f1340000000p+0, 0x1.89449bf2946dap-25),
+    (double2)(0x1.0a402f0000000p+0, 0x1.98f25301ba223p-25),
+    (double2)(0x1.0a8efc0000000p+0, 0x1.47d5ec651f549p-28),
+    (double2)(0x1.0add990000000p+0, 0x1.c33ec9a86007ap-25),
+    (double2)(0x1.0b2c090000000p+0, 0x1.e0b6653e92649p-26),
+    (double2)(0x1.0b7a4b0000000p+0, 0x1.bd64ac09d755fp-28),
+    (double2)(0x1.0bc85f0000000p+0, 0x1.f537506f78167p-29),
+    (double2)(0x1.0c16450000000p+0, 0x1.2c382d1b3735ep-25),
+    (double2)(0x1.0c63fe0000000p+0, 0x1.e20ed659f99e1p-25),
+    (double2)(0x1.0cb18b0000000p+0, 0x1.86b633a9c182ap-26),
+    (double2)(0x1.0cfeeb0000000p+0, 0x1.45cfd5a65e777p-27),
+    (double2)(0x1.0d4c1e0000000p+0, 0x1.0c8770f58bca4p-25),
+    (double2)(0x1.0d99250000000p+0, 0x1.739e44b0933c5p-25),
+    (double2)(0x1.0de6010000000p+0, 0x1.27dc3d9ce7bd8p-31),
+    (double2)(0x1.0e32b00000000p+0, 0x1.3c53c7c5a7b64p-25),
+    (double2)(0x1.0e7f340000000p+0, 0x1.9669683830cecp-25),
+    (double2)(0x1.0ecb8d0000000p+0, 0x1.8d772c39bdcc4p-25),
+    (double2)(0x1.0f17bb0000000p+0, 0x1.9b0008bcf6d7bp-25),
+    (double2)(0x1.0f63bf0000000p+0, 0x1.bbb305825ce4fp-28),
+    (double2)(0x1.0faf970000000p+0, 0x1.da3f4af13a406p-25),
+    (double2)(0x1.0ffb460000000p+0, 0x1.f36b96f74ce86p-26),
+    (double2)(0x1.1046cb0000000p+0, 0x1.65c002303f790p-30),
+    (double2)(0x1.1092250000000p+0, 0x1.82f84095ba7d5p-25),
+    (double2)(0x1.10dd560000000p+0, 0x1.d46433541b2c6p-25),
+    (double2)(0x1.11285e0000000p+0, 0x1.71c3d56e93a89p-25),
+    (double2)(0x1.11733d0000000p+0, 0x1.98dcef4e40012p-26),
+    (double2)(0x1.11bdf30000000p+0, 0x1.530ebef17fe03p-27),
+    (double2)(0x1.1208800000000p+0, 0x1.e8b8fa3715066p-27),
+    (double2)(0x1.1252e40000000p+0, 0x1.ab26eb3b211dcp-25),
+    (double2)(0x1.129d210000000p+0, 0x1.54dd4dc906307p-27),
+    (double2)(0x1.12e7350000000p+0, 0x1.c9f962387984ep-26),
+    (double2)(0x1.1331210000000p+0, 0x1.c62a959afec09p-25),
+    (double2)(0x1.137ae60000000p+0, 0x1.638d9ac6a866ap-25),
+    (double2)(0x1.13c4840000000p+0, 0x1.38704eca8a22dp-28),
+    (double2)(0x1.140dfa0000000p+0, 0x1.e6c9e1db14f8fp-27),
+    (double2)(0x1.1457490000000p+0, 0x1.8744b7f9c9eaap-26),
+    (double2)(0x1.14a0710000000p+0, 0x1.6c2893486373bp-25),
+    (double2)(0x1.14e9730000000p+0, 0x1.b36bce31699b7p-26),
+    (double2)(0x1.15324e0000000p+0, 0x1.71e3813d200c7p-25),
+    (double2)(0x1.157b030000000p+0, 0x1.99755ab40aa88p-25),
+    (double2)(0x1.15c3920000000p+0, 0x1.b45ca0e4bcfc0p-25),
+    (double2)(0x1.160bfc0000000p+0, 0x1.2dd090d869c5dp-28),
+    (double2)(0x1.16543f0000000p+0, 0x1.4fe0516b917dap-25),
+    (double2)(0x1.169c5d0000000p+0, 0x1.94563226317a2p-25),
+    (double2)(0x1.16e4560000000p+0, 0x1.53d8fafc2c851p-25),
+    (double2)(0x1.172c2a0000000p+0, 0x1.dcbd41fbd41a3p-26),
+    (double2)(0x1.1773d90000000p+0, 0x1.862ff5285f59cp-26),
+    (double2)(0x1.17bb630000000p+0, 0x1.3072ea97a1e1cp-25),
+    (double2)(0x1.1802c90000000p+0, 0x1.2839075184805p-26),
+    (double2)(0x1.184a0a0000000p+0, 0x1.4b0323e9eff42p-25),
+    (double2)(0x1.1891270000000p+0, 0x1.b158893c45484p-25),
+    (double2)(0x1.18d8210000000p+0, 0x1.149ef0fc35826p-28),
+    (double2)(0x1.191ef60000000p+0, 0x1.f2e77ea96acaap-26),
+    (double2)(0x1.1965a80000000p+0, 0x1.200074c471a95p-26),
+    (double2)(0x1.19ac360000000p+0, 0x1.3f8cc517f6f04p-25),
+    (double2)(0x1.19f2a10000000p+0, 0x1.60ba2e311bb55p-25),
+    (double2)(0x1.1a38e90000000p+0, 0x1.4b788730bbec3p-25),
+    (double2)(0x1.1a7f0e0000000p+0, 0x1.57090795ee20cp-25),
+    (double2)(0x1.1ac5100000000p+0, 0x1.d9ffe983670b1p-25),
+    (double2)(0x1.1b0af00000000p+0, 0x1.2a463ff61bfdap-25),
+    (double2)(0x1.1b50ad0000000p+0, 0x1.9d1bc6a5e65cfp-25),
+    (double2)(0x1.1b96480000000p+0, 0x1.8718abaa9e922p-25),
+    (double2)(0x1.1bdbc10000000p+0, 0x1.3c2f52ffa342ep-25),
+    (double2)(0x1.1c21180000000p+0, 0x1.0fae13ff42c80p-25),
+    (double2)(0x1.1c664d0000000p+0, 0x1.5440f0ef00d57p-25),
+    (double2)(0x1.1cab610000000p+0, 0x1.6fcd22d4e3c1ep-27),
+    (double2)(0x1.1cf0530000000p+0, 0x1.e0c60b409e863p-27),
+    (double2)(0x1.1d35230000000p+0, 0x1.f9cab5a5f0333p-25),
+    (double2)(0x1.1d79d30000000p+0, 0x1.30f24744c333dp-25),
+    (double2)(0x1.1dbe620000000p+0, 0x1.b50622a76b2fep-27),
+    (double2)(0x1.1e02cf0000000p+0, 0x1.fdb94ba595375p-25),
+    (double2)(0x1.1e471d0000000p+0, 0x1.861b9b945a171p-28),
+    (double2)(0x1.1e8b490000000p+0, 0x1.54348015188c4p-25),
+    (double2)(0x1.1ecf550000000p+0, 0x1.b54d149865523p-25),
+    (double2)(0x1.1f13410000000p+0, 0x1.a0bb783d9de33p-25),
+    (double2)(0x1.1f570d0000000p+0, 0x1.629d12b1a2157p-25),
+    (double2)(0x1.1f9ab90000000p+0, 0x1.467fe35d179dfp-25),
+    (double2)(0x1.1fde450000000p+0, 0x1.9763f3e26c8f7p-25),
+    (double2)(0x1.2021b20000000p+0, 0x1.3f798bb9f7679p-26),
+    (double2)(0x1.2064ff0000000p+0, 0x1.52e577e855898p-26),
+    (double2)(0x1.20a82c0000000p+0, 0x1.fde47e5502c3ap-25),
+    (double2)(0x1.20eb3b0000000p+0, 0x1.cbd0b548d96a0p-26),
+    (double2)(0x1.212e2a0000000p+0, 0x1.a9cd9f7be8de8p-25),
+    (double2)(0x1.2170fb0000000p+0, 0x1.22bbe704886dep-26),
+    (double2)(0x1.21b3ac0000000p+0, 0x1.e3dea8317f020p-25),
+    (double2)(0x1.21f63f0000000p+0, 0x1.e812085ac8855p-25),
+    (double2)(0x1.2238b40000000p+0, 0x1.c87144f24cb07p-26),
+    (double2)(0x1.227b0a0000000p+0, 0x1.1e128ee311fa2p-25),
+    (double2)(0x1.22bd420000000p+0, 0x1.b5c163d61a2d3p-26),
+    (double2)(0x1.22ff5c0000000p+0, 0x1.7d97e7fb90633p-27),
+    (double2)(0x1.2341570000000p+0, 0x1.efe899d50f6a7p-25),
+    (double2)(0x1.2383350000000p+0, 0x1.d0333eb75de5ap-25),
+    (double2)(0x1.23c4f60000000p+0, 0x1.0e590be73a573p-27),
+    (double2)(0x1.2406980000000p+0, 0x1.8ce8dcac3cdd2p-25),
+    (double2)(0x1.24481d0000000p+0, 0x1.ee8a48954064bp-25),
+    (double2)(0x1.2489850000000p+0, 0x1.aa62f18461e09p-25),
+    (double2)(0x1.24cad00000000p+0, 0x1.01e5940986a15p-25),
+    (double2)(0x1.250bfe0000000p+0, 0x1.b082f4f9b8d4cp-28),
+    (double2)(0x1.254d0e0000000p+0, 0x1.876e0e5527f5ap-25),
+    (double2)(0x1.258e020000000p+0, 0x1.3617080831e6bp-25),
+    (double2)(0x1.25ced90000000p+0, 0x1.81b26e34aa4a2p-25),
+    (double2)(0x1.260f940000000p+0, 0x1.52ee66dfab0c1p-26),
+    (double2)(0x1.2650320000000p+0, 0x1.d85a5329e8819p-26),
+    (double2)(0x1.2690b40000000p+0, 0x1.105c1b646b5d1p-26),
+    (double2)(0x1.26d1190000000p+0, 0x1.bb6690c1a379cp-25),
+    (double2)(0x1.2711630000000p+0, 0x1.86aeba73ce3a9p-26),
+    (double2)(0x1.2751900000000p+0, 0x1.dd16198294dd4p-25),
+    (double2)(0x1.2791a20000000p+0, 0x1.454e675775e83p-25),
+    (double2)(0x1.27d1980000000p+0, 0x1.3842e026197eap-25),
+    (double2)(0x1.2811720000000p+0, 0x1.f1ce0e70c44d2p-25),
+    (double2)(0x1.2851310000000p+0, 0x1.ad636441a5627p-25),
+    (double2)(0x1.2890d50000000p+0, 0x1.4c205d7212abbp-26),
+    (double2)(0x1.28d05d0000000p+0, 0x1.167c86c116419p-25),
+    (double2)(0x1.290fca0000000p+0, 0x1.38ec3ef16e294p-25),
+    (double2)(0x1.294f1c0000000p+0, 0x1.473fceace9321p-25),
+    (double2)(0x1.298e530000000p+0, 0x1.7af53a836dba7p-25),
+    (double2)(0x1.29cd700000000p+0, 0x1.a51f3c383b652p-30),
+    (double2)(0x1.2a0c710000000p+0, 0x1.3696da190822dp-25),
+    (double2)(0x1.2a4b580000000p+0, 0x1.2f9adec77074bp-25),
+    (double2)(0x1.2a8a250000000p+0, 0x1.8190fd5bee55fp-28),
+    (double2)(0x1.2ac8d70000000p+0, 0x1.bfee8fac68e55p-27),
+    (double2)(0x1.2b076f0000000p+0, 0x1.31c9d6bc5f68ap-28),
+    (double2)(0x1.2b45ec0000000p+0, 0x1.89d0523737edfp-25),
+    (double2)(0x1.2b84500000000p+0, 0x1.a295943bf47bbp-26),
+    (double2)(0x1.2bc29a0000000p+0, 0x1.96be32e5b3207p-28),
+    (double2)(0x1.2c00c90000000p+0, 0x1.e44c7d909fa0ep-25),
+    (double2)(0x1.2c3ee00000000p+0, 0x1.b2505da94d9eap-29),
+    (double2)(0x1.2c7cdc0000000p+0, 0x1.0c851f46c9c98p-25),
+    (double2)(0x1.2cbabf0000000p+0, 0x1.da71f7d9aa3b7p-26),
+    (double2)(0x1.2cf8880000000p+0, 0x1.f1b605d019ef1p-25),
+    (double2)(0x1.2d36390000000p+0, 0x1.386e8a2189563p-27),
+    (double2)(0x1.2d73d00000000p+0, 0x1.b19fa5d306ba7p-28),
+    (double2)(0x1.2db14d0000000p+0, 0x1.dd749b67aef76p-25),
+    (double2)(0x1.2deeb20000000p+0, 0x1.76ff6f1dc04b0p-25),
+    (double2)(0x1.2e2bfe0000000p+0, 0x1.35a33d0b232a6p-25),
+    (double2)(0x1.2e69310000000p+0, 0x1.4bdc80024a4e1p-25),
+    (double2)(0x1.2ea64b0000000p+0, 0x1.ebd61770fd723p-25),
+    (double2)(0x1.2ee34d0000000p+0, 0x1.4769fc537264dp-25),
+    (double2)(0x1.2f20360000000p+0, 0x1.9021f429f3b98p-25),
+    (double2)(0x1.2f5d070000000p+0, 0x1.ee7083efbd606p-26),
+    (double2)(0x1.2f99bf0000000p+0, 0x1.ad985552a6b1ap-25),
+    (double2)(0x1.2fd65f0000000p+0, 0x1.e3df778772160p-25),
+    (double2)(0x1.3012e70000000p+0, 0x1.ca5d76ddc9b34p-25),
+    (double2)(0x1.304f570000000p+0, 0x1.91154ffdbaf74p-25),
+    (double2)(0x1.308baf0000000p+0, 0x1.67bdd57fb306ap-25),
+    (double2)(0x1.30c7ef0000000p+0, 0x1.7dc255ac40886p-25),
+    (double2)(0x1.3104180000000p+0, 0x1.219f38e8afafep-32),
+    (double2)(0x1.3140280000000p+0, 0x1.2416bf9669a04p-25),
+    (double2)(0x1.317c210000000p+0, 0x1.11c96b2b3987fp-25),
+    (double2)(0x1.31b8020000000p+0, 0x1.f99ed447e1177p-25),
+    (double2)(0x1.31f3cd0000000p+0, 0x1.3245826328a11p-30),
+    (double2)(0x1.322f7f0000000p+0, 0x1.6f56dd1e645f8p-25),
+    (double2)(0x1.326b1b0000000p+0, 0x1.6164946945535p-27),
+    (double2)(0x1.32a69f0000000p+0, 0x1.e37d59d190028p-26),
+    (double2)(0x1.32e20c0000000p+0, 0x1.68671f12bf828p-25),
+    (double2)(0x1.331d620000000p+0, 0x1.e8ecbca6aabbdp-25),
+    (double2)(0x1.3358a20000000p+0, 0x1.3f49e109a5912p-26),
+    (double2)(0x1.3393ca0000000p+0, 0x1.b8a0e11ec3043p-25),
+    (double2)(0x1.33cedc0000000p+0, 0x1.5fae00aed691ap-25),
+    (double2)(0x1.3409d70000000p+0, 0x1.c0569bece3e4ap-25),
+    (double2)(0x1.3444bc0000000p+0, 0x1.05e26744efbfep-25),
+    (double2)(0x1.347f8a0000000p+0, 0x1.5b570a94be5c5p-25),
+    (double2)(0x1.34ba420000000p+0, 0x1.d6f156ea0e063p-26),
+    (double2)(0x1.34f4e30000000p+0, 0x1.e0ca7612fc484p-25),
+    (double2)(0x1.352f6f0000000p+0, 0x1.963c927b25258p-27),
+    (double2)(0x1.3569e40000000p+0, 0x1.47930aa725a5cp-26),
+    (double2)(0x1.35a4430000000p+0, 0x1.8a79fe3af43b3p-26),
+    (double2)(0x1.35de8c0000000p+0, 0x1.e6dc29c41bdafp-26),
+    (double2)(0x1.3618bf0000000p+0, 0x1.57a2e76f863a5p-25),
+    (double2)(0x1.3652dd0000000p+0, 0x1.ae3b61716354dp-29),
+    (double2)(0x1.368ce40000000p+0, 0x1.65fb5df6906b1p-25),
+    (double2)(0x1.36c6d60000000p+0, 0x1.6177d7f588f7bp-25),
+    (double2)(0x1.3700b30000000p+0, 0x1.ad55abd091b67p-28),
+    (double2)(0x1.373a7a0000000p+0, 0x1.55337b2422d76p-30),
+    (double2)(0x1.37742b0000000p+0, 0x1.084ebe86972d5p-25),
+    (double2)(0x1.37adc70000000p+0, 0x1.56395808e1ea3p-25),
+    (double2)(0x1.37e74e0000000p+0, 0x1.1bce21b40fba7p-25),
+    (double2)(0x1.3820c00000000p+0, 0x1.006f94605b515p-26),
+    (double2)(0x1.385a1c0000000p+0, 0x1.aa676aceb1f7dp-25),
+    (double2)(0x1.3893640000000p+0, 0x1.8229f76554ce6p-26),
+    (double2)(0x1.38cc960000000p+0, 0x1.eabfc6cf57330p-25),
+    (double2)(0x1.3905b40000000p+0, 0x1.4daed9c0ce8bcp-25),
+    (double2)(0x1.393ebd0000000p+0, 0x1.0ff1768237141p-25),
+    (double2)(0x1.3977b10000000p+0, 0x1.575f83051b085p-25),
+    (double2)(0x1.39b0910000000p+0, 0x1.2667deb523e29p-27),
+    (double2)(0x1.39e95c0000000p+0, 0x1.816996954f4fdp-30),
+    (double2)(0x1.3a22120000000p+0, 0x1.87cfccf4d9cd4p-26),
+    (double2)(0x1.3a5ab40000000p+0, 0x1.2c5d018198353p-26),
+    (double2)(0x1.3a93410000000p+0, 0x1.a7a898dcc34aap-25),
+    (double2)(0x1.3acbbb0000000p+0, 0x1.cead6dadc36d1p-29),
+    (double2)(0x1.3b04200000000p+0, 0x1.a55759c498bdfp-29),
+    (double2)(0x1.3b3c700000000p+0, 0x1.c414a9ef6de04p-25),
+    (double2)(0x1.3b74ad0000000p+0, 0x1.3e2108a6e58fap-25),
+    (double2)(0x1.3bacd60000000p+0, 0x1.587fd7643d77cp-26),
+    (double2)(0x1.3be4eb0000000p+0, 0x1.901eb1d3ff3dfp-28),
+    (double2)(0x1.3c1ceb0000000p+0, 0x1.f2ccd7c812fc6p-25),
+    (double2)(0x1.3c54d90000000p+0, 0x1.1c8ee70a01049p-29),
+    (double2)(0x1.3c8cb20000000p+0, 0x1.63e8d02831eecp-26),
+    (double2)(0x1.3cc4770000000p+0, 0x1.f61a42a92c7ffp-25),
+    (double2)(0x1.3cfc2a0000000p+0, 0x1.a917399c84d24p-34),
+    (double2)(0x1.3d33c80000000p+0, 0x1.e9197c8eec2f0p-26),
+    (double2)(0x1.3d6b530000000p+0, 0x1.e6f842f5a1378p-26),
+    (double2)(0x1.3da2cb0000000p+0, 0x1.fac242a90a0fcp-29),
+    (double2)(0x1.3dda2f0000000p+0, 0x1.35ed726610227p-26),
+    (double2)(0x1.3e11800000000p+0, 0x1.0e0d64804b15bp-26),
+    (double2)(0x1.3e48be0000000p+0, 0x1.560675daba814p-31),
+    (double2)(0x1.3e7fe80000000p+0, 0x1.37388c8768032p-25),
+    (double2)(0x1.3eb7000000000p+0, 0x1.ee3c89f9e01f5p-28),
+    (double2)(0x1.3eee040000000p+0, 0x1.39f6f0d09747cp-25),
+    (double2)(0x1.3f24f60000000p+0, 0x1.322c327abb8f0p-27),
+    (double2)(0x1.3f5bd40000000p+0, 0x1.961b347c8ac80p-25),
+    (double2)(0x1.3f92a00000000p+0, 0x1.3711fbbd0f118p-25),
+    (double2)(0x1.3fc9590000000p+0, 0x1.4fad8d7718ffbp-25),
+    (double2)(0x1.3fffff0000000p+0, 0x1.fffffffffffffp-25),
+    (double2)(0x1.4036930000000p+0, 0x1.67efa79ec35b4p-25),
+    (double2)(0x1.406d140000000p+0, 0x1.a737687a254a8p-25),
+    (double2)(0x1.40a3830000000p+0, 0x1.bace0f87d924dp-26),
+    (double2)(0x1.40d9df0000000p+0, 0x1.29e37c237e392p-25),
+    (double2)(0x1.4110290000000p+0, 0x1.57ce7ac3f3012p-26),
+    (double2)(0x1.4146600000000p+0, 0x1.82829359f8fbdp-25),
+    (double2)(0x1.417c850000000p+0, 0x1.cc9be42d14676p-25),
+    (double2)(0x1.41b2980000000p+0, 0x1.a8f001c137d0bp-25),
+    (double2)(0x1.41e8990000000p+0, 0x1.36127687dda05p-25),
+    (double2)(0x1.421e880000000p+0, 0x1.24dba322646f0p-26),
+    (double2)(0x1.4254640000000p+0, 0x1.dc43f1ed210b4p-25),
+    (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25)
+};
+
+
+DECLARE_VTABLE(double2, CBRT_REM_TBL, 5) = {
+    (double2)(0x1.428a2f0000000p-1, 0x1.31ae515c447bbp-26),
+    (double2)(0x1.965fea0000000p-1, 0x1.4f5b8f20ac166p-27),
+    (double2)(0x1.0000000000000p+0, 0x0.0000000000000p+0),
+    (double2)(0x1.428a2f0000000p+0, 0x1.31ae515c447bbp-25),
+    (double2)(0x1.965fea0000000p+0, 0x1.4f5b8f20ac166p-26),
+};
+
+#endif // cl_khr_fp64
+
+
+VTABLE_FUNCTION(double, CBRT_INV_TBL, cbrt_inv_tbl);
+
+VTABLE_FUNCTION2(v2double, LN_TBL, ln_tbl);
+VTABLE_FUNCTION2(v2double, ATAN_JBY256_TBL, atan_jby256_tbl);
+VTABLE_FUNCTION2(v2double, TWO_TO_JBY64_EP, two_to_jby64_ep_tbl);
+VTABLE_FUNCTION2(v2double, SINH_TBL, sinh_tbl);
+VTABLE_FUNCTION2(v2double, COSH_TBL, cosh_tbl);
+VTABLE_FUNCTION2(v2double, CBRT_DBL_TBL, cbrt_dbl_tbl);
+VTABLE_FUNCTION2(v2double, CBRT_REM_TBL, cbrt_rem_tbl);
diff --git a/lib/kernel/libclc/vtables_macros.h b/lib/kernel/libclc/vtables_macros.h
new file mode 100644
index 0000000..cabe63d
--- /dev/null
+++ b/lib/kernel/libclc/vtables_macros.h
@@ -0,0 +1,211 @@
+/* OpenCL built-in library: vtables_macros.h
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+
+#define VTABLE_FUNCTION(TYPE,VTABLE,NAME)     \
+     _CL_OVERLOADABLE TYPE VTABLE_MANGLE(NAME)(uint idx) {   \
+        TYPE retval;                        \
+        retval = VTABLE[ idx ];          \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 2 VTABLE_MANGLE(NAME)(uint2 idx) {   \
+        TYPE ## 2 retval;                        \
+        retval.s0 = VTABLE [idx.s0];       \
+        retval.s1 = VTABLE [idx.s1];       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 3 VTABLE_MANGLE(NAME)(uint3 idx) {   \
+        TYPE ## 3 retval;                        \
+        retval.s0 = VTABLE [idx.s0];       \
+        retval.s1 = VTABLE [idx.s1];       \
+        retval.s2 = VTABLE [idx.s2];       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 4 VTABLE_MANGLE(NAME)(uint4 idx) {   \
+        TYPE ## 4 retval;                        \
+        retval.s0 = VTABLE [idx.s0];       \
+        retval.s1 = VTABLE [idx.s1];       \
+        retval.s2 = VTABLE [idx.s2];       \
+        retval.s3 = VTABLE [idx.s3];       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 8 VTABLE_MANGLE(NAME)(uint8 idx) {   \
+        TYPE ## 8 retval;                        \
+        retval.s0 = VTABLE [idx.s0];       \
+        retval.s1 = VTABLE [idx.s1];       \
+        retval.s2 = VTABLE [idx.s2];       \
+        retval.s3 = VTABLE [idx.s3];       \
+        retval.s4 = VTABLE [idx.s4];       \
+        retval.s5 = VTABLE [idx.s5];       \
+        retval.s6 = VTABLE [idx.s6];       \
+        retval.s7 = VTABLE [idx.s7];       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 16 VTABLE_MANGLE(NAME)(uint16 idx) {   \
+        TYPE ## 16 retval;                        \
+        retval.s0 = VTABLE [idx.s0];       \
+        retval.s1 = VTABLE [idx.s1];       \
+        retval.s2 = VTABLE [idx.s2];       \
+        retval.s3 = VTABLE [idx.s3];       \
+        retval.s4 = VTABLE [idx.s4];       \
+        retval.s5 = VTABLE [idx.s5];       \
+        retval.s6 = VTABLE [idx.s6];       \
+        retval.s7 = VTABLE [idx.s7];       \
+        retval.s8 = VTABLE [idx.s8];       \
+        retval.s9 = VTABLE [idx.s9];       \
+        retval.sA = VTABLE [idx.sA];       \
+        retval.sB = VTABLE [idx.sB];       \
+        retval.sC = VTABLE [idx.sC];       \
+        retval.sD = VTABLE [idx.sD];       \
+        retval.sE = VTABLE [idx.sE];       \
+        retval.sF = VTABLE [idx.sF];       \
+        return retval;                      \
+    }
+
+
+
+
+
+#define VTABLE_FUNCTION2(TYPE,VTABLE,NAME)     \
+     _CL_OVERLOADABLE TYPE VTABLE_MANGLE(NAME)(uint idx) {   \
+        TYPE retval;                        \
+        retval.lo = VTABLE[ idx ].lo;             \
+        retval.hi = VTABLE[ idx ].hi;             \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 2 VTABLE_MANGLE(NAME)(uint2 idx) {   \
+        TYPE ## 2 retval;                        \
+        retval.lo.s0 = VTABLE [idx.s0].lo; retval.hi.s0 = VTABLE [idx.s0].hi;       \
+        retval.lo.s1 = VTABLE [idx.s1].lo; retval.hi.s1 = VTABLE [idx.s1].hi;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 3 VTABLE_MANGLE(NAME)(uint3 idx) {   \
+        TYPE ## 3 retval;                        \
+        retval.lo.s0 = VTABLE [idx.s0].lo; retval.hi.s0 = VTABLE [idx.s0].hi;       \
+        retval.lo.s1 = VTABLE [idx.s1].lo; retval.hi.s1 = VTABLE [idx.s1].hi;       \
+        retval.lo.s2 = VTABLE [idx.s2].lo; retval.hi.s2 = VTABLE [idx.s2].hi;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 4 VTABLE_MANGLE(NAME)(uint4 idx) {   \
+        TYPE ## 4 retval;                        \
+        retval.lo.s0 = VTABLE [idx.s0].lo; retval.hi.s0 = VTABLE [idx.s0].hi;       \
+        retval.lo.s1 = VTABLE [idx.s1].lo; retval.hi.s1 = VTABLE [idx.s1].hi;       \
+        retval.lo.s2 = VTABLE [idx.s2].lo; retval.hi.s2 = VTABLE [idx.s2].hi;       \
+        retval.lo.s3 = VTABLE [idx.s3].lo; retval.hi.s3 = VTABLE [idx.s3].hi;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 8 VTABLE_MANGLE(NAME)(uint8 idx) {   \
+        TYPE ## 8 retval;                        \
+        retval.lo.s0 = VTABLE [idx.s0].lo; retval.hi.s0 = VTABLE [idx.s0].hi;       \
+        retval.lo.s1 = VTABLE [idx.s1].lo; retval.hi.s1 = VTABLE [idx.s1].hi;       \
+        retval.lo.s2 = VTABLE [idx.s2].lo; retval.hi.s2 = VTABLE [idx.s2].hi;       \
+        retval.lo.s3 = VTABLE [idx.s3].lo; retval.hi.s3 = VTABLE [idx.s3].hi;       \
+        retval.lo.s4 = VTABLE [idx.s4].lo; retval.hi.s4 = VTABLE [idx.s4].hi;       \
+        retval.lo.s5 = VTABLE [idx.s5].lo; retval.hi.s5 = VTABLE [idx.s5].hi;       \
+        retval.lo.s6 = VTABLE [idx.s6].lo; retval.hi.s6 = VTABLE [idx.s6].hi;       \
+        retval.lo.s7 = VTABLE [idx.s7].lo; retval.hi.s7 = VTABLE [idx.s7].hi;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE TYPE ## 16 VTABLE_MANGLE(NAME)(uint16 idx) {   \
+        TYPE ## 16 retval;                        \
+        retval.lo.s0 = VTABLE [idx.s0].lo; retval.hi.s0 = VTABLE [idx.s0].hi;       \
+        retval.lo.s1 = VTABLE [idx.s1].lo; retval.hi.s1 = VTABLE [idx.s1].hi;       \
+        retval.lo.s2 = VTABLE [idx.s2].lo; retval.hi.s2 = VTABLE [idx.s2].hi;       \
+        retval.lo.s3 = VTABLE [idx.s3].lo; retval.hi.s3 = VTABLE [idx.s3].hi;       \
+        retval.lo.s4 = VTABLE [idx.s4].lo; retval.hi.s4 = VTABLE [idx.s4].hi;       \
+        retval.lo.s5 = VTABLE [idx.s5].lo; retval.hi.s5 = VTABLE [idx.s5].hi;       \
+        retval.lo.s6 = VTABLE [idx.s6].lo; retval.hi.s6 = VTABLE [idx.s6].hi;       \
+        retval.lo.s7 = VTABLE [idx.s7].lo; retval.hi.s7 = VTABLE [idx.s7].hi;       \
+        retval.lo.s8 = VTABLE [idx.s8].lo; retval.hi.s8 = VTABLE [idx.s8].hi;       \
+        retval.lo.s9 = VTABLE [idx.s9].lo; retval.hi.s9 = VTABLE [idx.s9].hi;       \
+        retval.lo.sA = VTABLE [idx.sA].lo; retval.hi.sA = VTABLE [idx.sA].hi;       \
+        retval.lo.sB = VTABLE [idx.sB].lo; retval.hi.sB = VTABLE [idx.sB].hi;       \
+        retval.lo.sC = VTABLE [idx.sC].lo; retval.hi.sC = VTABLE [idx.sC].hi;       \
+        retval.lo.sD = VTABLE [idx.sD].lo; retval.hi.sD = VTABLE [idx.sD].hi;       \
+        retval.lo.sE = VTABLE [idx.sE].lo; retval.hi.sE = VTABLE [idx.sE].hi;       \
+        retval.lo.sF = VTABLE [idx.sF].lo; retval.hi.sF = VTABLE [idx.sF].hi;       \
+        return retval;                      \
+    }
+
+
+
+
+#define VTABLE_FUNCTION4(VTABLE,NAME)     \
+     _CL_OVERLOADABLE v4uint VTABLE_MANGLE(NAME)(uint idx) {   \
+        v4uint retval;                      \
+        retval = *(__constant v4uint *)(VTABLE + idx);             \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE v4uint2 VTABLE_MANGLE(NAME)(uint2 idx) {   \
+        v4uint2 retval; uint4 tmp;                       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s0); retval.s0.s0 = tmp.s0; retval.s1.s0 = tmp.s1; retval.s2.s0 = tmp.s2; retval.s3.s0 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s1); retval.s0.s1 = tmp.s0; retval.s1.s1 = tmp.s1; retval.s2.s1 = tmp.s2; retval.s3.s1 = tmp.s3;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE v4uint3 VTABLE_MANGLE(NAME)(uint3 idx) {   \
+        v4uint3 retval; uint4 tmp;                       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s0); retval.s0.s0 = tmp.s0; retval.s1.s0 = tmp.s1; retval.s2.s0 = tmp.s2; retval.s3.s0 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s1); retval.s0.s1 = tmp.s0; retval.s1.s1 = tmp.s1; retval.s2.s1 = tmp.s2; retval.s3.s1 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s2); retval.s0.s2 = tmp.s0; retval.s1.s2 = tmp.s1; retval.s2.s2 = tmp.s2; retval.s3.s2 = tmp.s3;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE v4uint4 VTABLE_MANGLE(NAME)(uint4 idx) {   \
+        v4uint4 retval; uint4 tmp;                       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s0); retval.s0.s0 = tmp.s0; retval.s1.s0 = tmp.s1; retval.s2.s0 = tmp.s2; retval.s3.s0 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s1); retval.s0.s1 = tmp.s0; retval.s1.s1 = tmp.s1; retval.s2.s1 = tmp.s2; retval.s3.s1 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s2); retval.s0.s2 = tmp.s0; retval.s1.s2 = tmp.s1; retval.s2.s2 = tmp.s2; retval.s3.s2 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s3); retval.s0.s3 = tmp.s0; retval.s1.s3 = tmp.s1; retval.s2.s3 = tmp.s2; retval.s3.s3 = tmp.s3;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE v4uint8 VTABLE_MANGLE(NAME)(uint8 idx) {   \
+        v4uint8 retval; uint4 tmp;                        \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s0); retval.s0.s0 = tmp.s0; retval.s1.s0 = tmp.s1; retval.s2.s0 = tmp.s2; retval.s3.s0 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s1); retval.s0.s1 = tmp.s0; retval.s1.s1 = tmp.s1; retval.s2.s1 = tmp.s2; retval.s3.s1 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s2); retval.s0.s2 = tmp.s0; retval.s1.s2 = tmp.s1; retval.s2.s2 = tmp.s2; retval.s3.s2 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s3); retval.s0.s3 = tmp.s0; retval.s1.s3 = tmp.s1; retval.s2.s3 = tmp.s2; retval.s3.s3 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s4); retval.s0.s4 = tmp.s0; retval.s1.s4 = tmp.s1; retval.s2.s4 = tmp.s2; retval.s3.s4 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s5); retval.s0.s5 = tmp.s0; retval.s1.s5 = tmp.s1; retval.s2.s5 = tmp.s2; retval.s3.s5 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s6); retval.s0.s6 = tmp.s0; retval.s1.s6 = tmp.s1; retval.s2.s6 = tmp.s2; retval.s3.s6 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s7); retval.s0.s7 = tmp.s0; retval.s1.s7 = tmp.s1; retval.s2.s7 = tmp.s2; retval.s3.s7 = tmp.s3;       \
+        return retval;                      \
+    }\
+     _CL_OVERLOADABLE v4uint16 VTABLE_MANGLE(NAME)(uint16 idx) {   \
+        v4uint16 retval; uint4 tmp;                        \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s0); retval.s0.s0 = tmp.s0; retval.s1.s0 = tmp.s1; retval.s2.s0 = tmp.s2; retval.s3.s0 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s1); retval.s0.s1 = tmp.s0; retval.s1.s1 = tmp.s1; retval.s2.s1 = tmp.s2; retval.s3.s1 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s2); retval.s0.s2 = tmp.s0; retval.s1.s2 = tmp.s1; retval.s2.s2 = tmp.s2; retval.s3.s2 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s3); retval.s0.s3 = tmp.s0; retval.s1.s3 = tmp.s1; retval.s2.s3 = tmp.s2; retval.s3.s3 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s4); retval.s0.s4 = tmp.s0; retval.s1.s4 = tmp.s1; retval.s2.s4 = tmp.s2; retval.s3.s4 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s5); retval.s0.s5 = tmp.s0; retval.s1.s5 = tmp.s1; retval.s2.s5 = tmp.s2; retval.s3.s5 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s6); retval.s0.s6 = tmp.s0; retval.s1.s6 = tmp.s1; retval.s2.s6 = tmp.s2; retval.s3.s6 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s7); retval.s0.s7 = tmp.s0; retval.s1.s7 = tmp.s1; retval.s2.s7 = tmp.s2; retval.s3.s7 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s8); retval.s0.s8 = tmp.s0; retval.s1.s8 = tmp.s1; retval.s2.s8 = tmp.s2; retval.s3.s8 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.s9); retval.s0.s9 = tmp.s0; retval.s1.s9 = tmp.s1; retval.s2.s9 = tmp.s2; retval.s3.s9 = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.sA); retval.s0.sA = tmp.s0; retval.s1.sA = tmp.s1; retval.s2.sA = tmp.s2; retval.s3.sA = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.sB); retval.s0.sB = tmp.s0; retval.s1.sB = tmp.s1; retval.s2.sB = tmp.s2; retval.s3.sB = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.sC); retval.s0.sC = tmp.s0; retval.s1.sC = tmp.s1; retval.s2.sC = tmp.s2; retval.s3.sC = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.sD); retval.s0.sD = tmp.s0; retval.s1.sD = tmp.s1; retval.s2.sD = tmp.s2; retval.s3.sD = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.sE); retval.s0.sE = tmp.s0; retval.s1.sE = tmp.s1; retval.s2.sE = tmp.s2; retval.s3.sE = tmp.s3;       \
+        tmp = *(__constant uint4 *)(VTABLE + idx.sF); retval.s0.sF = tmp.s0; retval.s1.sF = tmp.s1; retval.s2.sF = tmp.s2; retval.s3.sF = tmp.s3;       \
+        return retval;                      \
+    }
diff --git a/lib/CL/clRetainDevice.c b/lib/kernel/mem_fence.c
similarity index 74%
copy from lib/CL/clRetainDevice.c
copy to lib/kernel/mem_fence.c
index 5f7e10a..2c6367c 100644
--- a/lib/CL/clRetainDevice.c
+++ b/lib/kernel/mem_fence.c
@@ -1,17 +1,17 @@
-/* OpenCL runtime library: clRetainDevice()
+/* OpenCL built-in library: mem_fence()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2012 Pekka Jääskeläinen / TUT
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,15 +20,21 @@
    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    THE SOFTWARE.
 */
-#include "pocl_cl.h"
 
-CL_API_ENTRY cl_int CL_API_CALL
-POname(clRetainDevice)(cl_device_id device) CL_API_SUFFIX__VERSION_1_2
+
+/* Empty implementation should work on CPU devices. */
+
+void _CL_OVERLOADABLE
+read_mem_fence (cl_mem_fence_flags flags)
 {
-  if (device->parent_device == NULL)
-    return CL_SUCCESS;
+}
 
-  POCL_RETAIN_OBJECT (device);
-  return CL_SUCCESS;
+void _CL_OVERLOADABLE
+write_mem_fence (cl_mem_fence_flags flags)
+{
+}
+
+void _CL_OVERLOADABLE
+mem_fence (cl_mem_fence_flags flags)
+{
 }
-POsym(clRetainDevice)
diff --git a/lib/kernel/pocl_image_rw_utils.h b/lib/kernel/pocl_image_rw_utils.h
index 85cf549..eaf78b9 100644
--- a/lib/kernel/pocl_image_rw_utils.h
+++ b/lib/kernel/pocl_image_rw_utils.h
@@ -25,7 +25,7 @@
 
 /* coordinate initialization */
 #define INITCOORDint(dest, source){             \
-  dest.x = source.x;                            \
+  dest.x = source;                              \
   dest.y = 0;                                   \
   dest.z = 0;                                   \
   dest.w = 0;                                   \
@@ -45,4 +45,28 @@
   dest.w = source.w;                                 \
   }
 
+#define INITCOORDfloat(dest, source)                                          \
+  {                                                                           \
+    dest.x = source;                                                          \
+    dest.y = 0.0f;                                                            \
+    dest.z = 0.0f;                                                            \
+    dest.w = 0.0f;                                                            \
+  }
+
+#define INITCOORDfloat2(dest, source)                                         \
+  {                                                                           \
+    dest.x = source.x;                                                        \
+    dest.y = source.y;                                                        \
+    dest.z = 0.0f;                                                            \
+    dest.w = 0.0f;                                                            \
+  }
+
+#define INITCOORDfloat4(dest, source)                                         \
+  {                                                                           \
+    dest.x = source.x;                                                        \
+    dest.y = source.y;                                                        \
+    dest.z = source.z;                                                        \
+    dest.w = source.w;                                                        \
+  }
+
 #endif
diff --git a/lib/kernel/printf.c b/lib/kernel/printf.c
index 8ea0d12..bd522fb 100644
--- a/lib/kernel/printf.c
+++ b/lib/kernel/printf.c
@@ -28,10 +28,15 @@
 #include <limits.h>
 #include <stdarg.h>
 #include <stdbool.h>
+#include <math.h>
 
 // We implement the OpenCL printf by calling the C99 printf. This is
 // not very efficient, but is easy to implement.
+#if LLVM_OLDER_THAN_5_0
 #define OCL_C_AS __attribute__((address_space(0)))
+#else
+#define OCL_C_AS
+#endif
 int printf(OCL_C_AS const char* restrict fmt, ...);
 int snprintf(OCL_C_AS char* restrict str, size_t size,
              OCL_C_AS const char* restrict fmt, ...);
@@ -106,9 +111,6 @@ float __attribute__((overloadable)) vload_half(size_t offset,
 
 // Note: To simplify implementation, we print double values with %lf,
 // although %f would suffice as well
-#define FLOAT_CONV_half   "h"
-#define FLOAT_CONV_float  ""
-#define FLOAT_CONV_double "l"
 #define FLOAT_GET_half(ptr)   vload_half(0, ptr)
 #define FLOAT_GET_float(ptr)  (*(ptr))
 #define FLOAT_GET_double(ptr) (*(ptr))
@@ -119,7 +121,7 @@ float __attribute__((overloadable)) vload_half(size_t offset,
   {                                                                     \
     DEBUG_PRINTF(("[printf:floats:n=%dd]\n", n));                       \
     char outfmt[1000];                                                  \
-    OCL_C_AS char str[] = "%%%s%s%s%s%s%.0d%s%.0d" FLOAT_CONV_##WIDTH "%c"; \
+    OCL_C_AS char str[] = "%%%s%s%s%s%s%.0d%s%.0d" "%c";                \
     snprintf(outfmt, sizeof outfmt,                                     \
              str,                                                       \
              flags.left ? "-" : "",                                     \
@@ -136,7 +138,10 @@ float __attribute__((overloadable)) vload_half(size_t offset,
     for (int d=0; d<n; ++d) {                                           \
       DEBUG_PRINTF(("[printf:floats:d=%d]\n", d));                      \
       if (d != 0) printf(comma);                                        \
-      printf(outfmt, FLOAT_GET_##WIDTH((OCL_C_AS const WIDTH*)vals+d)); \
+      WIDTH val = (FLOAT_GET_##WIDTH((OCL_C_AS const WIDTH*)vals+d));   \
+      if (isnan (val))                                                  \
+        val = NAN;                                                      \
+      printf(outfmt, (double)val);                                      \
     }                                                                   \
     DEBUG_PRINTF(("[printf:floats:done]\n"));                           \
   }
@@ -169,15 +174,19 @@ void _cl_print_char(flags_t flags, int field_width, int val)
   DEBUG_PRINTF(("[printf:char:done]\n"));
 }
 
-void _cl_print_string(flags_t flags, int field_width, OCL_C_AS const char* val)
+void
+_cl_print_string (flags_t flags, int field_width, int precision,
+                  OCL_C_AS const char *val)
 {
   DEBUG_PRINTF(("[printf:char]\n"));
   char outfmt[1000];
-  char string[] = "%%%s%.0ds";
+  char string[] = "%%%s%.0d%s%.0ds";
   snprintf(outfmt, sizeof outfmt,
            string,
            flags.left ? "-" : "",
-           field_width);
+           field_width,
+           (precision > 0) ? "." : "",
+           (precision > 0) ? precision : 0);
   DEBUG_PRINTF(("[printf:char:outfmt=%s]\n", outfmt));
   printf(outfmt, val);
   DEBUG_PRINTF(("[printf:char:done]\n"));
@@ -433,11 +442,10 @@ int __cl_printf(const OCL_CONSTANT_AS char* restrict format, ...)
           // Output a string
         case 's': {
           if (flags.plus || flags.space || flags.alt || flags.zero) goto error;
-          if (precision != -1) goto error;
           if (vector_length != 1) goto error;
           if (length != 0) goto error;
           OCL_C_AS const char* val = va_arg(ap, OCL_C_AS const char*);
-          _cl_print_string(flags, field_width, val);
+          _cl_print_string (flags, field_width, precision, val);
           break;
         }
           
diff --git a/lib/kernel/printf_constant.c b/lib/kernel/printf_constant.c
index 4e75cf5..c8aa37a 100644
--- a/lib/kernel/printf_constant.c
+++ b/lib/kernel/printf_constant.c
@@ -49,9 +49,16 @@
 
 /* AS 0 is required for the prototypes, otherwise they get assigned
  * the generic AS (#4) */
+
 #define OCL_C_AS __attribute__((address_space(0)))
+
+#ifdef LLVM_OLDER_THAN_5_0
 int vprintf(OCL_C_AS const char *, __builtin_va_list);
 int fflush(OCL_C_AS void *stream);
+#else
+int vprintf(const char *, __builtin_va_list);
+int fflush(void *stream);
+#endif
 
 #undef printf
 #define MAX_FORMAT_STR_SIZE 2048
diff --git a/lib/kernel/read_image.cl b/lib/kernel/read_image.cl
index ef000b7..e6720cf 100644
--- a/lib/kernel/read_image.cl
+++ b/lib/kernel/read_image.cl
@@ -3,6 +3,7 @@
    Copyright (c) 2013 Ville Korhonen
    Copyright (c) 2014 Felix Bytow
    Copyright (c) 2015 Matias Koskela
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -23,148 +24,1506 @@
    THE SOFTWARE.
 */
 
+/* NOTE: this file is NOT a generic implementation; it works with vectors
+   in a lot of places and requires that either device supports unaligned
+   vector operations, or that memory backing the images is properly aligned.
+   The maximum required alignment is 16bytes (4channels * 32bit color)
+
+   Not all CPUs support unaligned vector operations, but the pthread / basic
+   drivers allocate properly aligned memory for backing buffers; therefore
+   this should work for everything supported by pthread / basic.
+*/
+
 #include "templates.h"
 #include "pocl_image_rw_utils.h"
 
-#if (__clang_major__ == 3) && (__clang_minor__ >= 5)
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
+#define CLK_ADDRESS_MASK                                                      \
+  (CLK_ADDRESS_CLAMP_TO_EDGE | CLK_ADDRESS_CLAMP | CLK_ADDRESS_REPEAT         \
+   | CLK_ADDRESS_MIRRORED_REPEAT)
+
+_CL_READNONE static uint4
+map_channels (uint4 color, int order)
+{
+  switch (order)
+    {
+    case CLK_ARGB:
+      return color.yzwx;
+    case CLK_BGRA:
+      return color.zyxw;
+    case CLK_RGBA:
+    default:
+      return color;
+    }
+}
+
+/*************************************************************************/
+
+/* only for CLK_FLOAT, CLK_SNORM_INT8, CLK_UNORM_INT8,
+ * CLK_SNORM_INT16, CLK_UNORM_INT16 channel types */
+_CL_READONLY static float4
+get_float4_pixel (void *data, size_t base_index, int type)
+{
+  if (type == CLK_FLOAT)
+    return ((float4 *)data)[base_index];
+  if (type == CLK_HALF_FLOAT)
+    {
+#if !defined(LLVM_OLDER_THAN_3_8)
+      return vloada_half4(base_index, data);
 #else
-#define ADDRESS_SPACE
+      __builtin_trap ();
 #endif
+    }
+  const float4 one_127th = (float4) (1.0f / 127.0f);
+  const float4 one_32767th = (float4) (1.0f / 32767.0f);
+  const float4 one_255th = ((float4) (1.0f / (float)UCHAR_MAX));
+  const float4 one_65535th = ((float4) (1.0f / (float)USHRT_MAX));
+  if (type == CLK_SNORM_INT8)
+    {
+      /*  <I*_MIN, I*_MAX> to <-1.0, 1.0> */
+      int4 color = convert_int4 (((char4 *)data)[base_index]);
+      float4 colorf = convert_float4 (color);
+      return max ((float4) (-1.0f), (one_127th * colorf));
+    }
+  if (type == CLK_SNORM_INT16)
+    {
+      int4 color = convert_int4 (((short4 *)data)[base_index]);
+      float4 colorf = convert_float4 (color);
+      return max ((float4) (-1.0f), (one_32767th * colorf));
+    }
+  if (type == CLK_UNORM_INT8)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      return convert_float4 (((uchar4 *)data)[base_index]) * one_255th;
+    }
+  if (type == CLK_UNORM_INT16)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      return convert_float4 (((ushort4 *)data)[base_index]) * one_65535th;
+    }
+  return (float4) (123.0f);
+}
 
-/* checks if integer coord is out of bounds. If out of bounds: Sets coord in
-   bounds and returns false OR populates color with border colour and returns
-   true. If in bounds, returns false */
-int __pocl_is_out_of_bounds (ADDRESS_SPACE dev_image_t* dev_image, int4 *coord,
-                             dev_sampler_t* dev_sampler, void *color_)
+/* only for CLK_FLOAT, CLK_SNORM_INT8, CLK_UNORM_INT8,
+ * CLK_SNORM_INT16, CLK_UNORM_INT16 channel types */
+_CL_READONLY static float
+get_float_pixel (void *data, size_t base_index, int type)
 {
-  uint4 *color = (uint4*)color_;
-  if(*dev_sampler & CLK_ADDRESS_CLAMP_TO_EDGE)
+  if (type == CLK_FLOAT)
+    return ((float *)data)[base_index];
+  const float one_127th = (float)(1.0f / 127.0f);
+  const float one_32767th = (float)(1.0f / 32767.0f);
+  const float one_255th = ((float)(1.0f / (float)UCHAR_MAX));
+  const float one_65535th = ((float)(1.0f / (float)USHRT_MAX));
+  if (type == CLK_SNORM_INT8)
+    {
+      /*  <I*_MIN, I*_MAX> to <-1.0, 1.0> */
+      char color = ((char *)data)[base_index];
+      float colorf = convert_float (color);
+      return max ((-1.0f), (one_127th * colorf));
+    }
+  if (type == CLK_SNORM_INT16)
     {
-      if (coord->x >= dev_image->_width)
-        coord->x = dev_image->_width-1;
-      if (dev_image->_height != 0 && coord->y >= dev_image->_height)
-        coord->y = dev_image->_height-1;
-      if (dev_image->_depth != 0 && coord->z >= dev_image->_depth)
-        coord->z = dev_image->_depth-1;
+      short color = ((short *)data)[base_index];
+      float colorf = convert_float (color);
+      return max ((-1.0f), (one_32767th * colorf));
+    }
+  if (type == CLK_UNORM_INT8)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      return convert_float (((uchar *)data)[base_index]) * one_255th;
+    }
+  if (type == CLK_UNORM_INT16)
+    {
+      return convert_float (((ushort *)data)[base_index]) * one_65535th;
+    }
+
+  return 234.0f;
+}
 
-      if (coord->x < 0)
-        coord->x = 0;
-      if (coord->y < 0)
-        coord->y = 0;
-      if (coord->z < 0)
-        coord->z = 0;
+/*************************************************************************/
 
-      return 0;
+#define BORDER_COLOR (0)
+#define BORDER_COLOR_F (0.0f)
+
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+_CL_READONLY static uint4
+pocl_read_pixel_fast_ui (size_t base_index, int order, int elem_size,
+                         void *data)
+{
+  uint4 color;
+
+  if (order == CLK_A)
+    {
+      color = (uint4)0;
+      if (elem_size == 1)
+        color.w = ((uchar *)data)[base_index];
+      else if (elem_size == 2)
+        color.w = ((ushort *)data)[base_index];
+      else if (elem_size == 4)
+        color.w = ((uint *)data)[base_index];
+      return color;
+    }
+
+  if (elem_size == 1)
+    {
+      return convert_uint4 (((uchar4 *)data)[base_index]);
+    }
+  else if (elem_size == 2)
+    {
+      return convert_uint4 (((ushort4 *)data)[base_index]);
     }
-  if (*dev_sampler & CLK_ADDRESS_CLAMP)
+  else if (elem_size == 4)
     {
-      if(coord->x >= dev_image->_width || coord->x < 0 ||
-         coord->y >= dev_image->_height || coord->y < 0 ||
-         (dev_image->_depth != 0 && (coord->z >= dev_image->_depth || coord->z <0)))
-        {
-          (*color)[0] = 0;
-          (*color)[1] = 0;
-          (*color)[2] = 0;
+      return ((uint4 *)data)[base_index];
+    }
 
-          if (dev_image->_order == CL_A || dev_image->_order == CL_INTENSITY ||
-              dev_image->_order == CL_RA || dev_image->_order == CL_ARGB ||
-              dev_image->_order == CL_BGRA || dev_image->_order == CL_RGBA)
-            (*color)[3] = 0;
-          else
-            (*color)[3] = 1;
+  return (uint4)0;
+}
 
-          return 1;
-        }
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+_CL_READONLY static float4
+pocl_read_pixel_fast_f (size_t base_index, int channel_type, int order,
+                        void *data)
+{
+
+  if (order == CLK_A)
+    {
+      float p = get_float_pixel (data, base_index, channel_type);
+      return (float4) (0.0f, 0.0f, 0.0f, p);
+    }
+  else
+    {
+      return get_float4_pixel (data, base_index, channel_type);
     }
-  return 0;
 }
 
-/* Reads a four element pixel from image pointed by integer coords. */
-void __pocl_read_pixel (void* color, ADDRESS_SPACE dev_image_t* dev_image, int4 coord)
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+_CL_READONLY static int4
+pocl_read_pixel_fast_i (size_t base_index, int order, int elem_size,
+                        void *data)
 {
+  int4 color;
 
-  uint4* color_ptr = (uint4*)color;
-  int width = dev_image->_width;
-  int height = dev_image->_height;
-  int num_channels = dev_image->_num_channels;
-  int i = num_channels;
-  int elem_size = dev_image->_elem_size;
-  int const base_index =
-    (coord.x + coord.y*width + coord.z*height*width) * num_channels;
-
-  if (dev_image->_order == CL_A)
-    {
-      /* these can be garbage
-      (*color_ptr)[0] = 0;
-      (*color_ptr)[1] = 0;
-      (*color_ptr)[2] = 0;
-      */
+  if (order == CLK_A)
+    {
+      color = (int4)0;
       if (elem_size == 1)
-        (*color_ptr)[3] = ((uchar*)(dev_image->_data))[base_index];
+        color.w = ((char *)data)[base_index];
       else if (elem_size == 2)
-        (*color_ptr)[3] = ((ushort*)(dev_image->_data))[base_index];
+        color.w = ((short *)data)[base_index];
       else if (elem_size == 4)
-        (*color_ptr)[3] = ((uint*)(dev_image->_data))[base_index];
-      return;
+        color.w = ((int *)data)[base_index];
+      return color;
     }
 
   if (elem_size == 1)
     {
-      if(dev_image->_order == CL_BGRA)
+      return convert_int4 (((char4 *)data)[base_index]);
+    }
+  else if (elem_size == 2)
+    {
+      return convert_int4 (((short4 *)data)[base_index]);
+    }
+  else if (elem_size == 4)
+    {
+      return ((int4 *)data)[base_index];
+    }
+  return (int4)0;
+}
+
+/*************************************************************************/
+
+_CL_READONLY static int4
+get_image_array_offset (global dev_image_t *img, int4 uvw_after_rint,
+                        int4 array_coord)
+{
+  int4 res = uvw_after_rint;
+  if (img->_image_array_size > 0)
+    {
+      if (img->_height > 0)
         {
-          (*color_ptr)[0] = ((uchar*)(dev_image->_data))[base_index + 2];
-          (*color_ptr)[1] = ((uchar*)(dev_image->_data))[base_index + 1];
-          (*color_ptr)[2] = ((uchar*)(dev_image->_data))[base_index + 0];
-          (*color_ptr)[3] = ((uchar*)(dev_image->_data))[base_index + 3]; 
+          res.z = clamp (array_coord.z, 0, (img->_image_array_size - 1));
+          res.w = 0;
         }
       else
         {
-          while (i--)
+          res.y = clamp (array_coord.y, 0, (img->_image_array_size - 1));
+          res.z = 0;
+          res.w = 0;
+        }
+    }
+  return res;
+}
+
+/* array_coord must be unnormalized & repeats removed */
+_CL_READONLY static int4
+get_image_array_offset2 (global dev_image_t *img, int4 uvw_after_rint,
+                         float4 array_coord)
+{
+  int4 res = uvw_after_rint;
+  if (img->_image_array_size > 0)
+    {
+      if (img->_height > 0)
+        {
+          res.z = clamp (convert_int (floor (array_coord.z + 0.5f)), 0,
+                         (img->_image_array_size - 1));
+          res.w = 0;
+        }
+      else
+        {
+          res.y = clamp (convert_int (floor (array_coord.y + 0.5f)), 0,
+                         (img->_image_array_size - 1));
+          res.z = 0;
+          res.w = 0;
+        }
+    }
+  return res;
+}
+
+/* RET: (int4) (img.x{,y,z}, array_size, 0 {,0 ...} ) */
+_CL_READONLY static int4
+pocl_get_image_array_size (global dev_image_t *img)
+{
+  int4 imgsize = (int4) (img->_width, img->_height, img->_depth, 0);
+  if (img->_image_array_size > 0)
+    {
+      if (img->_height > 0)
+        imgsize.z = img->_image_array_size;
+      else
+        imgsize.y = img->_image_array_size;
+    }
+  return imgsize;
+}
+/*************************************************************************/
+
+/* full read with channel map conversion etc  */
+/* Reads a four element pixel from image pointed by integer coords.
+ * Returns Border color (0) for out-of-range reads. This is OK since
+ * reads behind border should either return border color, or are undefined */
+_CL_READONLY static uint4
+pocl_read_pixel (global dev_image_t *img, int4 coord)
+{
+  uint4 color;
+  int width = img->_width;
+  int height = img->_height;
+  int depth = img->_depth;
+  int num_channels = img->_num_channels;
+  int order = img->_order;
+  int elem_size = img->_elem_size;
+  int channel_type = img->_data_type;
+  void *data = img->_data;
+  size_t elem_bytes = num_channels * elem_size;
+  size_t row_pitch = img->_row_pitch / elem_bytes;
+  size_t slice_pitch = img->_slice_pitch / elem_bytes;
+
+  if ((coord.x >= width || coord.x < 0)
+      || ((height != 0) && (coord.y >= height || coord.y < 0))
+      || ((depth != 0) && (coord.z >= depth || coord.z < 0)))
+    {
+      /* if out of bounds, return BORDER COLOR:
+       * since pocl's basic/pthread device only
+       * supports CLK_A + CLK_{RGBA combos},
+       * the border color is always zeroes. */
+      if ((channel_type == CLK_SIGNED_INT8) || (channel_type == CLK_SIGNED_INT16)
+          || (channel_type == CLK_SIGNED_INT32)
+          || (channel_type == CLK_UNSIGNED_INT8)
+          || (channel_type == CLK_UNSIGNED_INT16)
+          || (channel_type == CLK_UNSIGNED_INT32))
+        return (uint4)BORDER_COLOR;
+      else
+        return as_uint4 ((float4)BORDER_COLOR_F);
+    }
+
+  size_t base_index
+      = coord.x + (coord.y * row_pitch) + (coord.z * slice_pitch);
+
+  if ((channel_type == CLK_SIGNED_INT8) || (channel_type == CLK_SIGNED_INT16)
+      || (channel_type == CLK_SIGNED_INT32))
+    color = as_uint4 (
+        pocl_read_pixel_fast_i (base_index, order, elem_size, data));
+  else if ((channel_type == CLK_UNSIGNED_INT8)
+           || (channel_type == CLK_UNSIGNED_INT16)
+           || (channel_type == CLK_UNSIGNED_INT32))
+    color = pocl_read_pixel_fast_ui (base_index, order, elem_size, data);
+  else // TODO unsupported channel types
+    color = as_uint4 (
+        pocl_read_pixel_fast_f (base_index, channel_type, order, data));
+
+  return map_channels (color, order);
+}
+
+/* Transforms coords based on image addressing mode */
+_CL_READONLY static int4
+pocl_address_mode (global dev_image_t *img, int4 input_coord,
+                   dev_sampler_t samp)
+{
+  if ((samp & CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP_TO_EDGE)
+    {
+      int4 max_clamp = max (
+          (int4) (img->_width - 1, img->_height - 1, img->_depth - 1, 0),
+          (int4)0);
+      return clamp (input_coord, (int4) (0), max_clamp);
+    }
+
+  if ((samp & CLK_ADDRESS_MASK) == CLK_ADDRESS_CLAMP)
+    {
+      int4 max_clamp
+          = max ((int4) (img->_width, img->_height, img->_depth, 0), (int4)0);
+      return clamp (input_coord, (int4) (-1), max_clamp);
+    }
+
+  return input_coord;
+}
+
+/*************************************************************************/
+
+_CL_READONLY static float4
+read_pixel_linear_3d_float (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                            int width, int height, int depth, int channel_type,
+                            size_t row_pitch, size_t slice_pitch, int order,
+                            void *data)
+{
+  size_t base_index = 0;
+  int ijk0_y_OK = (ijk0.y >= 0 && ijk0.y < height);
+  int ijk1_y_OK = (ijk1.y >= 0 && ijk1.y < height);
+  int ijk0_x_OK = (ijk0.x >= 0 && ijk0.x < width);
+  int ijk1_x_OK = (ijk1.x >= 0 && ijk1.x < width);
+  // 3D image
+  // T = (1 – a) * (1 – b) * (1 – c) * Ti0j0k0
+  float4 sum = (float4) (0.0f);
+
+  if (ijk0.z >= 0 && ijk0.z < depth)
+    {
+      base_index += (ijk0.z * slice_pitch);
+
+      if (ijk0_y_OK)
+        {
+          base_index += (ijk0.y * row_pitch);
+
+          if (ijk0_x_OK)
             {
-              (*color_ptr)[i] = ((uchar*)(dev_image->_data))[base_index + i];
+              base_index += ijk0.x;
+              sum += (one_m.x * one_m.y * one_m.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk0.x;
             }
+
+          // + a * (1 – b) * (1 – c) * Ti1j0k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * one_m.y * one_m.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk0.y * row_pitch);
         }
+
+      if (ijk1_y_OK)
+        {
+          base_index += (ijk1.y * row_pitch);
+
+          // + (1 – a) * b * (1 – c) * Ti0j1k0
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * abc.y * one_m.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk0.x;
+            }
+
+          // + a * b * (1 – c) * Ti1j1k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * abc.y * one_m.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk1.y * row_pitch);
+        }
+
+      base_index -= (ijk0.z * slice_pitch);
     }
-  else if (elem_size == 2)
+
+  if (ijk1.z >= 0 && ijk1.z < depth)
     {
-      if(dev_image->_order == CL_BGRA)
+      base_index += (ijk1.z * slice_pitch);
+
+      if (ijk0_y_OK)
         {
-          (*color_ptr)[0] = ((ushort*)(dev_image->_data))[base_index + 2];
-          (*color_ptr)[1] = ((ushort*)(dev_image->_data))[base_index + 1];
-          (*color_ptr)[2] = ((ushort*)(dev_image->_data))[base_index + 0];
-          (*color_ptr)[3] = ((ushort*)(dev_image->_data))[base_index + 3]; 
+          base_index += (ijk0.y * row_pitch);
+
+          // + (1 – a) * (1 – b) * c * Ti0j0k1
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * one_m.y * abc.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk0.x;
+            }
+
+          // + a * (1 – b) * (1 – c) * Ti1j0k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * one_m.y * abc.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk0.y * row_pitch);
         }
-      else
+
+      if (ijk1_y_OK)
         {
-          while (i--)
+          base_index += (ijk1.y * row_pitch);
+
+          // + (1 – a) * b * (1 – c) * Ti0j1k0
+          if (ijk0_x_OK)
             {
-              (*color_ptr)[i] = ((ushort*)(dev_image->_data))[base_index + i];
+              base_index += ijk0.x;
+              sum += (one_m.x * abc.y * abc.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk0.x;
             }
-      }
+
+          // + a * b * (1 – c) * Ti1j1k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * abc.y * abc.z
+                      * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk1.y * row_pitch);
+        }
+
+      base_index -= (ijk1.z * slice_pitch);
     }
-  else if (elem_size == 4)
+
+  return sum;
+}
+
+/* TODO: float * convert_flaot(UINT32) is imprecise, so reading from images
+ * with 32bit channel types may return quite bad results.
+ */
+
+_CL_READONLY static uint4
+read_pixel_linear_3d_uint (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                           int width, int height, int depth, size_t row_pitch,
+                           size_t slice_pitch, int order, int elem_size,
+                           void *data)
+{
+  size_t base_index = 0;
+  int ijk0_y_OK = (ijk0.y >= 0 && ijk0.y < height);
+  int ijk1_y_OK = (ijk1.y >= 0 && ijk1.y < height);
+  int ijk0_x_OK = (ijk0.x >= 0 && ijk0.x < width);
+  int ijk1_x_OK = (ijk1.x >= 0 && ijk1.x < width);
+  // 3D image
+  // T = (1 – a) * (1 – b) * (1 – c) * Ti0j0k0
+  float4 sum = (float4) (0.0f);
+
+  if (ijk0.z >= 0 && ijk0.z < depth)
     {
-      if(dev_image->_order == CL_BGRA)
+      base_index += (ijk0.z * slice_pitch);
+
+      if (ijk0_y_OK)
         {
-          (*color_ptr)[0] = ((uint*)(dev_image->_data))[base_index + 2];
-          (*color_ptr)[1] = ((uint*)(dev_image->_data))[base_index + 1];
-          (*color_ptr)[2] = ((uint*)(dev_image->_data))[base_index + 0];
-          (*color_ptr)[3] = ((uint*)(dev_image->_data))[base_index + 3]; 
+          base_index += (ijk0.y * row_pitch);
+
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * one_m.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * (1 – b) * (1 – c) * Ti1j0k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * one_m.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk0.y * row_pitch);
         }
-      else
+
+      if (ijk1_y_OK)
+        {
+          base_index += (ijk1.y * row_pitch);
+
+          // + (1 – a) * b * (1 – c) * Ti0j1k0
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * abc.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * b * (1 – c) * Ti1j1k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * abc.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk1.y * row_pitch);
+        }
+
+      base_index -= (ijk0.z * slice_pitch);
+    }
+
+  if (ijk1.z >= 0 && ijk1.z < depth)
+    {
+      base_index += (ijk1.z * slice_pitch);
+
+      if (ijk0_y_OK)
+        {
+          base_index += (ijk0.y * row_pitch);
+
+          // + (1 – a) * (1 – b) * c * Ti0j0k1
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * one_m.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * (1 – b) * (1 – c) * Ti1j0k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * one_m.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk0.y * row_pitch);
+        }
+
+      if (ijk1_y_OK)
         {
-          while (i--)
+          base_index += (ijk1.y * row_pitch);
+
+          // + (1 – a) * b * (1 – c) * Ti0j1k0
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * abc.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * b * (1 – c) * Ti1j1k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * abc.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk1.y * row_pitch);
+        }
+
+      base_index -= (ijk1.z * slice_pitch);
+    }
+
+  return convert_uint4 (sum);
+}
+
+_CL_READONLY static int4
+read_pixel_linear_3d_int (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                          int width, int height, int depth, size_t row_pitch,
+                          size_t slice_pitch, int order, int elem_size,
+                          void *data)
+{
+  size_t base_index = 0;
+  int ijk0_y_OK = (ijk0.y >= 0 && ijk0.y < height);
+  int ijk1_y_OK = (ijk1.y >= 0 && ijk1.y < height);
+  int ijk0_x_OK = (ijk0.x >= 0 && ijk0.x < width);
+  int ijk1_x_OK = (ijk1.x >= 0 && ijk1.x < width);
+  // 3D image
+  // T = (1 – a) * (1 – b) * (1 – c) * Ti0j0k0
+  float4 sum = (float4) (0.0f);
+
+  if (ijk0.z >= 0 && ijk0.z < depth)
+    {
+      base_index += (ijk0.z * slice_pitch);
+
+      if (ijk0_y_OK)
+        {
+          base_index += (ijk0.y * row_pitch);
+
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * one_m.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * (1 – b) * (1 – c) * Ti1j0k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * one_m.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk0.y * row_pitch);
+        }
+
+      if (ijk1_y_OK)
+        {
+          base_index += (ijk1.y * row_pitch);
+
+          // + (1 – a) * b * (1 – c) * Ti0j1k0
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * abc.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * b * (1 – c) * Ti1j1k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * abc.y * one_m.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk1.y * row_pitch);
+        }
+
+      base_index -= (ijk0.z * slice_pitch);
+    }
+
+  if (ijk1.z >= 0 && ijk1.z < depth)
+    {
+      base_index += (ijk1.z * slice_pitch);
+
+      if (ijk0_y_OK)
+        {
+          base_index += (ijk0.y * row_pitch);
+
+          // + (1 – a) * (1 – b) * c * Ti0j0k1
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * one_m.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * (1 – b) * (1 – c) * Ti1j0k0
+          if (ijk1_x_OK)
+            {
+              base_index += ijk1.x;
+              sum += (abc.x * one_m.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
+            }
+
+          base_index -= (ijk0.y * row_pitch);
+        }
+
+      if (ijk1_y_OK)
+        {
+          base_index += (ijk1.y * row_pitch);
+
+          // + (1 – a) * b * (1 – c) * Ti0j1k0
+          if (ijk0_x_OK)
+            {
+              base_index += ijk0.x;
+              sum += (one_m.x * abc.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk0.x;
+            }
+
+          // + a * b * (1 – c) * Ti1j1k0
+          if (ijk1_x_OK)
             {
-              (*color_ptr)[i] = ((uint*)(dev_image->_data))[base_index + i];
+              base_index += ijk1.x;
+              sum += (abc.x * abc.y * abc.z
+                      * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+              base_index -= ijk1.x;
             }
+
+          base_index -= (ijk1.y * row_pitch);
+        }
+
+      base_index -= (ijk1.z * slice_pitch);
+    }
+
+  return convert_int4 (sum);
+}
+
+_CL_READONLY static uint4
+read_pixel_linear_3d (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                      int width, int height, int depth, int channel_type,
+                      size_t row_pitch, size_t slice_pitch, int order,
+                      int elem_size, void *data)
+{
+  // TODO unsupported channel types
+  if ((channel_type == CLK_SIGNED_INT8) || (channel_type == CLK_SIGNED_INT16)
+      || (channel_type == CLK_SIGNED_INT32))
+    return as_uint4 (read_pixel_linear_3d_int (
+        abc, one_m, ijk0, ijk1, width, height, depth, row_pitch, slice_pitch,
+        order, elem_size, data));
+  if ((channel_type == CLK_UNSIGNED_INT8) || (channel_type == CLK_UNSIGNED_INT16)
+      || (channel_type == CLK_UNSIGNED_INT32))
+    return read_pixel_linear_3d_uint (abc, one_m, ijk0, ijk1, width, height,
+                                      depth, row_pitch, slice_pitch, order,
+                                      elem_size, data);
+  return as_uint4 (read_pixel_linear_3d_float (
+      abc, one_m, ijk0, ijk1, width, height, depth, channel_type, row_pitch,
+      slice_pitch, order, data));
+}
+
+/*************************************************************************/
+
+_CL_READONLY static float4
+read_pixel_linear_2d_float (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                            int array_coord, int width, int height,
+                            int channel_type, size_t row_pitch,
+                            size_t slice_pitch, int order, void *data)
+{
+  // 2D image
+  size_t base_index = 0;
+  int ijk0_x_OK = (ijk0.x >= 0 && ijk0.x < width);
+  int ijk1_x_OK = (ijk1.x >= 0 && ijk1.x < width);
+  float4 sum = (float4) (0.0f);
+
+  if (array_coord > 0)
+    base_index += (array_coord * slice_pitch);
+
+  if (ijk0.y >= 0 && ijk0.y < height)
+    {
+      base_index += (ijk0.y * row_pitch);
+
+      // T = (1 – a) * (1 – b) * Ti0j0
+      if (ijk0_x_OK)
+        {
+          base_index += ijk0.x;
+          sum += (one_m.x * one_m.y * pocl_read_pixel_fast_f (base_index,
+                                                              channel_type,
+                                                              order, data));
+          base_index -= ijk0.x;
+        }
+
+      // + a * (1 – b) * Ti1j0
+      if (ijk1_x_OK)
+        {
+          base_index += ijk1.x;
+          sum += (abc.x * one_m.y * pocl_read_pixel_fast_f (base_index,
+                                                            channel_type,
+                                                            order, data));
+          base_index -= ijk1.x;
+        }
+
+      base_index -= (ijk0.y * row_pitch);
+    }
+
+  if (ijk1.y >= 0 && ijk1.y < height)
+    {
+      base_index += (ijk1.y * row_pitch);
+
+      // + (1 – a) * b * Ti0j1
+      if (ijk0_x_OK)
+        {
+          base_index += ijk0.x;
+          sum += (one_m.x * abc.y * pocl_read_pixel_fast_f (base_index,
+                                                            channel_type,
+                                                            order, data));
+          base_index -= ijk0.x;
+        }
+
+      // + a * b * Ti1j1
+      if (ijk1_x_OK)
+        {
+          base_index += ijk1.x;
+          sum += (abc.x * abc.y * pocl_read_pixel_fast_f (
+                                      base_index, channel_type, order, data));
+          base_index -= ijk1.x;
         }
+
+      base_index -= (ijk1.y * row_pitch);
     }
+
+  return sum;
 }
 
+/* TODO: float * convert_flaot(UINT32) is imprecise, so reading from images
+ * with 32bit channel types may return quite bad results.
+ */
+
+_CL_READONLY static uint4
+read_pixel_linear_2d_uint (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                           int array_coord, int width, int height,
+                           size_t row_pitch, size_t slice_pitch, int order,
+                           int elem_size, void *data)
+{
+  // 2D image
+  size_t base_index = 0;
+  int ijk0_x_OK = (ijk0.x >= 0 && ijk0.x < width);
+  int ijk1_x_OK = (ijk1.x >= 0 && ijk1.x < width);
+  float4 sum = (float4) (0.0f);
+
+  if (array_coord > 0)
+    base_index += (array_coord * slice_pitch);
+
+  if (ijk0.y >= 0 && ijk0.y < height)
+    {
+      base_index += (ijk0.y * row_pitch);
+
+      // T = (1 – a) * (1 – b) * Ti0j0
+      if (ijk0_x_OK)
+        {
+          base_index += ijk0.x;
+          sum += (one_m.x * one_m.y
+                  * convert_float4 (pocl_read_pixel_fast_ui (
+                        base_index, order, elem_size, data)));
+          base_index -= ijk0.x;
+        }
+
+      // + a * (1 – b) * Ti1j0
+      if (ijk1_x_OK)
+        {
+          base_index += ijk1.x;
+          sum += (abc.x * one_m.y * convert_float4 (pocl_read_pixel_fast_ui (
+                                        base_index, order, elem_size, data)));
+          base_index -= ijk1.x;
+        }
+
+      base_index -= (ijk0.y * row_pitch);
+    }
+
+  if (ijk1.y >= 0 && ijk1.y < height)
+    {
+      base_index += (ijk1.y * row_pitch);
+
+      // + (1 – a) * b * Ti0j1
+      if (ijk0_x_OK)
+        {
+          base_index += ijk0.x;
+          sum += (one_m.x * abc.y * convert_float4 (pocl_read_pixel_fast_ui (
+                                        base_index, order, elem_size, data)));
+          base_index -= ijk0.x;
+        }
+
+      // + a * b * Ti1j1
+      if (ijk1_x_OK)
+        {
+          base_index += ijk1.x;
+          sum += (abc.x * abc.y * convert_float4 (pocl_read_pixel_fast_ui (
+                                      base_index, order, elem_size, data)));
+          base_index -= ijk1.x;
+        }
+
+      base_index -= (ijk1.y * row_pitch);
+    }
+
+  return convert_uint4 (sum);
+}
+
+_CL_READONLY static int4
+read_pixel_linear_2d_int (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                          int array_coord, int width, int height,
+                          size_t row_pitch, size_t slice_pitch, int order,
+                          int elem_size, void *data)
+{
+  // 2D image
+  size_t base_index = 0;
+  int ijk0_x_OK = (ijk0.x >= 0 && ijk0.x < width);
+  int ijk1_x_OK = (ijk1.x >= 0 && ijk1.x < width);
+  float4 sum = (float4) (0.0f);
+
+  if (array_coord > 0)
+    base_index += (array_coord * slice_pitch);
+
+  if (ijk0.y >= 0 && ijk0.y < height)
+    {
+      base_index += (ijk0.y * row_pitch);
+
+      // T = (1 – a) * (1 – b) * Ti0j0
+      if (ijk0_x_OK)
+        {
+          base_index += ijk0.x;
+          sum += (one_m.x * one_m.y
+                  * convert_float4 (pocl_read_pixel_fast_i (base_index, order,
+                                                            elem_size, data)));
+          base_index -= ijk0.x;
+        }
+
+      // + a * (1 – b) * Ti1j0
+      if (ijk1_x_OK)
+        {
+          base_index += ijk1.x;
+          sum += (abc.x * one_m.y * convert_float4 (pocl_read_pixel_fast_i (
+                                        base_index, order, elem_size, data)));
+          base_index -= ijk1.x;
+        }
+
+      base_index -= (ijk0.y * row_pitch);
+    }
+
+  if (ijk1.y >= 0 && ijk1.y < height)
+    {
+      base_index += (ijk1.y * row_pitch);
+
+      // + (1 – a) * b * Ti0j1
+      if (ijk0_x_OK)
+        {
+          base_index += ijk0.x;
+          sum += (one_m.x * abc.y * convert_float4 (pocl_read_pixel_fast_i (
+                                        base_index, order, elem_size, data)));
+          base_index -= ijk0.x;
+        }
+
+      // + a * b * Ti1j1
+      if (ijk1_x_OK)
+        {
+          base_index += ijk1.x;
+          sum += (abc.x * abc.y * convert_float4 (pocl_read_pixel_fast_i (
+                                      base_index, order, elem_size, data)));
+          base_index -= ijk1.x;
+        }
+
+      base_index -= (ijk1.y * row_pitch);
+    }
+
+  return convert_int4 (sum);
+}
+
+_CL_READONLY static uint4
+read_pixel_linear_2d (float4 abc, float4 one_m, int4 ijk0, int4 ijk1,
+                      int array_coord, int width, int height, int channel_type,
+                      size_t row_pitch, size_t slice_pitch, int order,
+                      int elem_size, void *data)
+{
+  // TODO unsupported channel types
+  if ((channel_type == CLK_SIGNED_INT8) || (channel_type == CLK_SIGNED_INT16)
+      || (channel_type == CLK_SIGNED_INT32))
+    return as_uint4 (read_pixel_linear_2d_int (
+        abc, one_m, ijk0, ijk1, array_coord, width, height, row_pitch,
+        slice_pitch, order, elem_size, data));
+  if ((channel_type == CLK_UNSIGNED_INT8) || (channel_type == CLK_UNSIGNED_INT16)
+      || (channel_type == CLK_UNSIGNED_INT32))
+    return read_pixel_linear_2d_uint (abc, one_m, ijk0, ijk1, array_coord,
+                                      width, height, row_pitch, slice_pitch,
+                                      order, elem_size, data);
+  return as_uint4 (read_pixel_linear_2d_float (
+      abc, one_m, ijk0, ijk1, array_coord, width, height, channel_type,
+      row_pitch, slice_pitch, order, data));
+}
+
+/*************************************************************************/
+
+_CL_READONLY static float4
+read_pixel_linear_1d_float (float4 abc, float4 one_m, int ijk0, int ijk1,
+                            int array_coord, int width, size_t slice_pitch,
+                            int channel_type, int order, void *data)
+{
+  // 1D image
+  size_t base_index = 0;
+  float4 sum = (float4) (0.0f);
+
+  if (array_coord > 0)
+    base_index += (array_coord * slice_pitch);
+
+  // T = (1 – a) * Ti0
+  if (ijk0 >= 0 && ijk0 < width)
+    {
+      base_index += ijk0;
+      sum += (one_m.x * pocl_read_pixel_fast_f (base_index, channel_type,
+                                                order, data));
+      base_index -= ijk0;
+    }
+
+  // + a * Ti1
+  if (ijk1 >= 0 && ijk1 < width)
+    {
+      base_index += ijk1;
+      sum += (abc.x * pocl_read_pixel_fast_f (base_index, channel_type, order,
+                                              data));
+      base_index -= ijk1;
+    }
+
+  return sum;
+}
+
+/* TODO: float * convert_flaot(UINT32) is imprecise, so reading from images
+ * with 32bit channel types may return quite bad results.
+ */
+
+_CL_READONLY static uint4
+read_pixel_linear_1d_uint (float4 abc, float4 one_m, int ijk0, int ijk1,
+                           int array_coord, int width, size_t slice_pitch,
+                           int order, int elem_size, void *data)
+{
+  // 1D image
+  size_t base_index = 0;
+  float4 sum = (float4) (0.0f);
+
+  if (array_coord > 0)
+    base_index += (array_coord * slice_pitch);
+
+  // T = (1 – a) * Ti0
+  if (ijk0 >= 0 && ijk0 < width)
+    {
+      base_index += ijk0;
+      sum += (one_m.x * convert_float4 (pocl_read_pixel_fast_ui (
+                            base_index, order, elem_size, data)));
+      base_index -= ijk0;
+    }
+
+  // + a * Ti1
+  if (ijk1 >= 0 && ijk1 < width)
+    {
+      base_index += ijk1;
+      sum += (abc.x * convert_float4 (pocl_read_pixel_fast_ui (
+                          base_index, order, elem_size, data)));
+      base_index -= ijk1;
+    }
+
+  return convert_uint4 (sum);
+}
+
+_CL_READONLY static int4
+read_pixel_linear_1d_int (float4 abc, float4 one_m, int ijk0, int ijk1,
+                          int array_coord, int width, size_t slice_pitch,
+                          int order, int elem_size, void *data)
+{
+  // 1D image
+  size_t base_index = 0;
+  float4 sum = (float4) (0.0f);
+
+  if (array_coord > 0)
+    base_index += (array_coord * slice_pitch);
+
+  // T = (1 – a) * Ti0
+  if (ijk0 >= 0 && ijk0 < width)
+    {
+      base_index += ijk0;
+      sum += (one_m.x * convert_float4 (pocl_read_pixel_fast_i (
+                            base_index, order, elem_size, data)));
+      base_index -= ijk0;
+    }
+
+  // + a * Ti1
+  if (ijk1 >= 0 && ijk1 < width)
+    {
+      base_index += ijk1;
+      sum += (abc.x * convert_float4 (pocl_read_pixel_fast_i (
+                          base_index, order, elem_size, data)));
+      base_index -= ijk1;
+    }
+
+  return convert_int4 (sum);
+}
+
+_CL_READONLY static uint4
+read_pixel_linear_1d (float4 abc, float4 one_m, int ijk0, int ijk1,
+                      int array_coord, int width, size_t slice_pitch,
+                      int channel_type, int order, int elem_size, void *data)
+{
+  // TODO unsupported channel types
+  if ((channel_type == CLK_SIGNED_INT8) || (channel_type == CLK_SIGNED_INT16)
+      || (channel_type == CLK_SIGNED_INT32))
+    return as_uint4 (read_pixel_linear_1d_int (abc, one_m, ijk0, ijk1,
+                                               array_coord, width, slice_pitch,
+                                               order, elem_size, data));
+  if ((channel_type == CLK_UNSIGNED_INT8) || (channel_type == CLK_UNSIGNED_INT16)
+      || (channel_type == CLK_UNSIGNED_INT32))
+    return read_pixel_linear_1d_uint (abc, one_m, ijk0, ijk1, array_coord,
+                                      width, slice_pitch, order, elem_size,
+                                      data);
+  return as_uint4 (read_pixel_linear_1d_float (abc, one_m, ijk0, ijk1,
+                                               array_coord, width, slice_pitch,
+                                               channel_type, order, data));
+}
+
+/*************************************************************************/
+
+/* These magic constant should be converted to some sort of
+ * error signaling */
+#define INVALID_SAMPLER_ADDRMODE (uint4) (0x1111)
+#define INVALID_SAMPLER_FILTER (uint4) (0x2222)
+#define INVALID_SAMPLER_NORMAL (uint4) (0x3333)
+
+_CL_READONLY static uint4
+nonrepeat_filter (global dev_image_t *img, float4 orig_coord,
+                  dev_sampler_t samp)
+{
+  float4 coord = orig_coord;
+  if (samp & CLK_NORMALIZED_COORDS_TRUE)
+    {
+      float4 imgsize = convert_float4 (pocl_get_image_array_size (img));
+      coord *= imgsize;
+    }
+
+  int num_channels = img->_num_channels;
+  int elem_size = img->_elem_size;
+  int a_index = 0;
+  size_t elem_bytes = num_channels * elem_size;
+  size_t row_pitch = img->_row_pitch / elem_bytes;
+  size_t slice_pitch = img->_slice_pitch / elem_bytes;
+
+  if (samp & CLK_FILTER_NEAREST)
+    {
+      int4 final_coord
+          = pocl_address_mode (img, convert_int4 (floor (coord)), samp);
+      int4 array_coord = get_image_array_offset2 (img, final_coord, coord);
+      return pocl_read_pixel (img, array_coord);
+    }
+  else if (samp & CLK_FILTER_LINEAR)
+    {
+      float4 r0 = floor (coord - (float4) (0.5f)); // ijk0, address mod
+      float4 r1 = r0 + (float4) (1.0f);            // ijk1, address mod
+      int4 ijk0 = pocl_address_mode (img, convert_int4 (r0), samp);
+      int4 ijk1 = pocl_address_mode (img, convert_int4 (r1), samp);
+      float4 unused;
+      float4 abc = fract ((coord - (float4) (0.5f)), &unused);
+      float4 one_m = (float4) (1.0f) - abc;
+      uint4 res;
+      if (img->_depth != 0)
+        {
+          res = read_pixel_linear_3d (
+              abc, one_m, ijk0, ijk1, img->_width, img->_height, img->_depth,
+              img->_data_type, row_pitch, slice_pitch, img->_order,
+              img->_elem_size, img->_data);
+        }
+      else if (img->_height != 0)
+        {
+          if (img->_image_array_size > 0)
+            a_index = clamp (convert_int (floor (coord.z + 0.5f)), 0,
+                             (int)(img->_image_array_size - 1));
+          res = read_pixel_linear_2d (
+              abc, one_m, ijk0, ijk1, a_index, img->_width, img->_height,
+              img->_data_type, row_pitch, slice_pitch, img->_order,
+              img->_elem_size, img->_data);
+        }
+      else
+        {
+          if (img->_image_array_size > 0)
+            a_index = clamp (convert_int (floor (coord.y + 0.5f)), 0,
+                             (int)(img->_image_array_size - 1));
+          res = read_pixel_linear_1d (
+              abc, one_m, ijk0.x, ijk1.x, a_index, img->_width, slice_pitch,
+              img->_data_type, img->_order, img->_elem_size, img->_data);
+        }
+      return map_channels (res, img->_order);
+    }
+  else
+    {
+      // this should never happen - filter can only be LINEAR/NEAREST
+      return INVALID_SAMPLER_FILTER;
+    }
+}
+
+_CL_READONLY static uint4
+repeat_filter (global dev_image_t *img, float4 coord, dev_sampler_t samp)
+{
+  int array_size = img->_image_array_size;
+  int num_channels = img->_num_channels;
+  size_t elem_bytes = num_channels * img->_elem_size;
+  size_t row_pitch = img->_row_pitch / elem_bytes;
+  size_t slice_pitch = img->_slice_pitch / elem_bytes;
+
+  if (samp & CLK_FILTER_NEAREST)
+    {
+      /*
+         uvw = (str – floor(str)) * whd
+         ijk = (int)floor(uvw)
+         if (ijk > whd – 1)
+           ijk = ijk – whd
+         ... same for 3 coords
+      */
+      int4 maxcoord = pocl_get_image_array_size (img);
+      float4 whd = convert_float4 (maxcoord);
+      float4 uvw = (coord - floor (coord)) * whd;
+      int4 ijk = convert_int4 (floor (uvw));
+      int4 final_coord = select (ijk, (ijk - maxcoord), (ijk >= maxcoord));
+      int4 array_coord
+          = get_image_array_offset2 (img, final_coord, (coord * whd));
+
+      return pocl_read_pixel (img, array_coord);
+    }
+  else if (samp & CLK_FILTER_LINEAR)
+    {
+      /*
+          u = (s – floor(s)) * wt
+          i0 = (int)floor(u – 0.5)
+          i1 = i0 + 1
+          if (i0 < 0)
+           i0 = wt + i0
+          if (i1 > wt – 1)
+           i1 = i1 – wt
+      */
+      int a_index = 0;
+      int4 maxcoord = (int4) (img->_width, img->_height, img->_depth, 1);
+      float4 whd = convert_float4 (maxcoord);
+      float4 uvw = (coord - floor (coord)) * whd;
+      int4 ijk0 = convert_int4 (floor (uvw - (float4) (0.5f)));
+      int4 ijk1 = ijk0 + (int4) (1);
+      ijk0 = select (ijk0, (ijk0 + maxcoord), (ijk0 < (int4) (0)));
+      maxcoord = max (maxcoord, (int4)1);
+      ijk1 = ijk1 % maxcoord;
+      float4 unused;
+      float arraysize_f = convert_float (array_size);
+      float4 abc = fract ((uvw - (float4) (0.5f)), &unused);
+      float4 one_m = (float4) (1.0f) - abc;
+
+      uint4 res;
+      if (img->_depth != 0)
+        {
+          res = read_pixel_linear_3d (
+              abc, one_m, ijk0, ijk1, img->_width, img->_height, img->_depth,
+              img->_data_type, row_pitch, slice_pitch, img->_order,
+              img->_elem_size, img->_data);
+        }
+      else if (img->_height != 0)
+        {
+          if (array_size > 0)
+            a_index
+                = clamp (convert_int (floor ((coord.z * arraysize_f) + 0.5f)),
+                         0, (array_size - 1));
+          res = read_pixel_linear_2d (
+              abc, one_m, ijk0, ijk1, a_index, img->_width, img->_height,
+              img->_data_type, row_pitch, slice_pitch, img->_order,
+              img->_elem_size, img->_data);
+        }
+      else
+        {
+          if (array_size > 0)
+            a_index
+                = clamp (convert_int (floor ((coord.y * arraysize_f) + 0.5f)),
+                         0, (array_size - 1));
+          res = read_pixel_linear_1d (
+              abc, one_m, ijk0.x, ijk1.x, a_index, img->_width, slice_pitch,
+              img->_data_type, img->_order, img->_elem_size, img->_data);
+        }
+      return map_channels (res, img->_order);
+    }
+  else
+    {
+      // this should never happen - filter can only be LINEAR/NEAREST
+      return INVALID_SAMPLER_FILTER;
+    }
+}
+
+_CL_READONLY static uint4
+mirrored_repeat_filter (global dev_image_t *img, float4 coord,
+                        dev_sampler_t samp)
+{
+  int array_size = img->_image_array_size;
+  int num_channels = img->_num_channels;
+  size_t elem_bytes = num_channels * img->_elem_size;
+  size_t row_pitch = img->_row_pitch / elem_bytes;
+  size_t slice_pitch = img->_slice_pitch / elem_bytes;
+
+  if (samp & CLK_FILTER_NEAREST)
+    {
+      /*
+        s’ = 2.0f * rint(0.5f * s)
+        s’ = fabs(s – s’)
+        u = s’ * wt
+        i = (int)floor(u)
+        i = min(i, wt – 1)
+      */
+
+      float4 ss = (float4) (2.0f) * rint ((float4) (0.5f) * coord);
+      ss = fabs (coord - ss);
+      int4 maxcoord = pocl_get_image_array_size (img);
+      float4 whd = convert_float4 (maxcoord);
+      float4 uvw = ss * whd;
+      int4 ijk = convert_int4 (floor (uvw));
+      int4 wdt = max ((maxcoord - (int4) (1)), (int4) (0));
+      int4 final_coord = select (ijk, wdt, (ijk > wdt));
+      int4 array_coord
+          = get_image_array_offset2 (img, final_coord, (coord * whd));
+      return pocl_read_pixel (img, array_coord);
+    }
+  else if (samp & CLK_FILTER_LINEAR)
+    {
+      /*
+        s’ = 2.0f * rint(0.5f * s)
+        s’ = fabs(s – s’)
+        u = s’ * wt
+        i0 = (int)floor(u – 0.5f)
+        i1 = i0 + 1
+        i0 = max(i0, 0)
+        i1 = min(i1, wt – 1)
+      */
+      float4 ss = (float4) (2.0f) * rint ((float4) (0.5f) * coord);
+      ss = fabs (coord - ss);
+      int4 maxcoord = (int4) (img->_width, img->_height, img->_depth, 1);
+      float4 uvw = ss * convert_float4 (maxcoord);
+      int4 ijk0 = convert_int4 (floor (uvw - (float4) (0.5f)));
+      int4 ijk1 = ijk0 + (int4) (1);
+      ijk0 = max (ijk0, (int4)0);
+      ijk1 = min (ijk1, (maxcoord - (int4) (1)));
+      float4 unused;
+      float arraysize_f = convert_float (array_size);
+      float4 abc = fract ((uvw - (float4) (0.5f)), &unused);
+      float4 one_m = (float4) (1.0f) - abc;
+      uint4 res;
+      int a_index = 0;
+      if (img->_depth != 0)
+        {
+          res = read_pixel_linear_3d (
+              abc, one_m, ijk0, ijk1, img->_width, img->_height, img->_depth,
+              img->_data_type, row_pitch, slice_pitch, img->_order,
+              img->_elem_size, img->_data);
+        }
+      else if (img->_height != 0)
+        {
+          if (array_size > 0)
+            a_index
+                = clamp (convert_int (floor ((coord.z * arraysize_f) + 0.5f)),
+                         0, (array_size - 1));
+          res = read_pixel_linear_2d (
+              abc, one_m, ijk0, ijk1, a_index, img->_width, img->_height,
+              img->_data_type, row_pitch, slice_pitch, img->_order,
+              img->_elem_size, img->_data);
+        }
+      else
+        {
+          if (array_size > 0)
+            a_index
+                = clamp (convert_int (floor ((coord.y * arraysize_f) + 0.5f)),
+                         0, (array_size - 1));
+          res = read_pixel_linear_1d (
+              abc, one_m, ijk0.x, ijk1.x, a_index, img->_width, slice_pitch,
+              img->_data_type, img->_order, img->_elem_size, img->_data);
+        }
+      return map_channels (res, img->_order);
+    }
+  else
+    {
+      // this should never happen - filter can only be LINEAR/NEAREST
+      return INVALID_SAMPLER_FILTER;
+    }
+}
+
+/*************************************************************************/
+/* read pixel with float coordinates */
+_CL_READONLY static uint4
+pocl_read_pixel_floatc (global dev_image_t *img, float4 coord,
+                        dev_sampler_t samp)
+{
+  if ((samp & CLK_ADDRESS_MASK) == CLK_ADDRESS_REPEAT)
+    return repeat_filter (img, coord, samp);
+  else if ((samp & CLK_ADDRESS_MASK) == CLK_ADDRESS_MIRRORED_REPEAT)
+    return mirrored_repeat_filter (img, coord, samp);
+  else
+    return nonrepeat_filter (img, coord, samp);
+}
+
+/*************************************************************************/
+/* read pixel with int coordinates
+ * from Spec:
+ *
+ * Furthermore, the read_imagei and read_imageui calls that take integer
+ * coordinates must use a sampler with normalized coordinates set to
+ * CLK_NORMALIZED_COORDS_FALSE and addressing mode set to
+ * CLK_ADDRESS_CLAMP_TO_EDGE, CLK_ADDRESS_CLAMP or CLK_ADDRESS_NONE;
+ * otherwise the values returned are undefined.
+*/
+
+_CL_READONLY static uint4
+pocl_read_pixel_intc (global dev_image_t *img, int4 coord, dev_sampler_t samp)
+{
+  if (samp & CLK_NORMALIZED_COORDS_TRUE)
+    return INVALID_SAMPLER_NORMAL;
+  if (((samp & CLK_ADDRESS_MASK) == CLK_ADDRESS_REPEAT)
+      || ((samp & CLK_ADDRESS_MASK) == CLK_ADDRESS_MIRRORED_REPEAT))
+    return INVALID_SAMPLER_ADDRMODE;
+
+  int4 final_coord = pocl_address_mode (img, coord, samp);
+  int4 array_coord = get_image_array_offset (img, final_coord, coord);
+  return pocl_read_pixel (img, array_coord);
+}
+
+/******************* DONE *************************************************/
+/* read pixel with float coordinates, WITHOUT sampler
+ * from Spec:
+ *
+ * The samplerless read image functions behave exactly as the corresponding
+ * read image functions that take integer coordinates and a sampler with
+ * filter mode set to CLK_FILTER_NEAREST, normalized coordinates set to
+ * CLK_NORMALIZED_COORDS_FALSE and addressing mode to CLK_ADDRESS_NONE.
+ */
+
+_CL_READONLY static uint4
+pocl_read_pixel_intc_samplerless (global dev_image_t *img, int4 coord)
+{
+  dev_sampler_t samp
+      = CLK_FILTER_NEAREST | CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE;
+
+  int4 final_coord = pocl_address_mode (img, coord, samp);
+  int4 array_coord = get_image_array_offset (img, final_coord, coord);
+  return pocl_read_pixel (img, array_coord);
+}
+
+/*************************************************************************/
+
+#if __clang_major__ > 3
+/* After Clang 4.0, the sampler_t is passed as an opaque struct (ptr)
+ which we convert to int32 with the LLVM pass HandleSamplerInitialization. */
+#define READ_SAMPLER                                                    \
+    dev_sampler_t s = *__builtin_astype(sampler, dev_sampler_t*);
+#else
+/* Before Clang 4.0, the sampler_t was passed as an int32. */
+#define READ_SAMPLER                                                    \
+    dev_sampler_t s = __builtin_astype(sampler, dev_sampler_t);
+#endif
 
 /* Implementation for read_image with any image data type and int coordinates
    __IMGTYPE__ = image type (image2d_t, ...)
@@ -173,57 +1532,59 @@ void __pocl_read_pixel (void* color, ADDRESS_SPACE dev_image_t* dev_image, int4
    __COORD__   = coordinate type (int, int2, int4)
 */
 
-#if __clang_major__ > 3
+#define IMPLEMENT_READ_INT4_IMAGE_INT_COORD(__IMGTYPE__, __RETVAL__,          \
+                                            __POSTFIX__, __COORD__)           \
+  __RETVAL__ _CL_OVERLOADABLE _CL_READONLY read_image##__POSTFIX__ (                       \
+      __IMGTYPE__ image, sampler_t sampler, __COORD__ coord)                  \
+  {                                                                           \
+    int4 coord4;                                                              \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    READ_SAMPLER                                                              \
+    uint4 color = pocl_read_pixel_intc (i_ptr, coord4, s);                    \
+    return as_##__RETVAL__ (color);                                           \
+  }
 
-// After Clang 4.0, the sampler_t is passed as an opaque struct (ptr)
-// which we convert to int32 with the LLVM pass HandleSamplerInitialization.
-
-#define IMPLEMENT_READ_IMAGE_INT_COORD(__IMGTYPE__,__RETVAL__,__POSTFIX__,\
-                                            __COORD__)                  \
-  __RETVAL__ _CL_OVERLOADABLE read_image##__POSTFIX__ (__IMGTYPE__ image, \
-                                                       sampler_t sampler, \
-                                                       __COORD__ coord) \
-  {                                                                     \
-    __RETVAL__ color;                                                   \
-    int4 coord4;                                                        \
-    INITCOORD##__COORD__(coord4, coord);                                \
-    ADDRESS_SPACE dev_image_t* i_ptr =                                  \
-      __builtin_astype (image, ADDRESS_SPACE dev_image_t*);             \
-    dev_sampler_t s = *__builtin_astype(sampler, dev_sampler_t*);	\
-    if (__pocl_is_out_of_bounds (i_ptr, &coord4, &s, &color))           \
-      {                                                                 \
-        return color;                                                   \
-      }                                                                 \
-    __pocl_read_pixel (&color, i_ptr, coord4); \
-                                                                        \
-    return color;                                                       \
+#define IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD(__IMGTYPE__, __COORD__)         \
+  float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (__IMGTYPE__ image, sampler_t sampler,  \
+                                       __COORD__ coord)                       \
+  {                                                                           \
+    int4 coord4;                                                              \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    READ_SAMPLER                                                              \
+    uint4 color = pocl_read_pixel_intc (i_ptr, coord4, s);                    \
+    return as_float4 (color);                                                 \
   }
 
-#else
+#define IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD(__IMGTYPE__, __COORD__)       \
+  float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (__IMGTYPE__ image, sampler_t sampler,  \
+                                       __COORD__ coord)                       \
+  {                                                                           \
+    float4 coord4;                                                            \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    READ_SAMPLER                                                              \
+    uint4 color = pocl_read_pixel_floatc (i_ptr, coord4, s);                  \
+    return as_float4 (color);                                                 \
+  }
 
-// Before Clang 4.0, the sampler_t was passed as an int32.
-
-#define IMPLEMENT_READ_IMAGE_INT_COORD(__IMGTYPE__,__RETVAL__,__POSTFIX__,\
-                                            __COORD__)                  \
-  __RETVAL__ _CL_OVERLOADABLE read_image##__POSTFIX__ (__IMGTYPE__ image, \
-                                                       sampler_t sampler, \
-                                                       __COORD__ coord) \
-  {                                                                     \
-    __RETVAL__ color;                                                   \
-    int4 coord4;                                                        \
-    INITCOORD##__COORD__(coord4, coord);                                \
-    ADDRESS_SPACE dev_image_t* i_ptr =                                  \
-      __builtin_astype (image, ADDRESS_SPACE dev_image_t*);             \
-    dev_sampler_t s = __builtin_astype(sampler, dev_sampler_t);		\
-    if (__pocl_is_out_of_bounds (i_ptr, &coord4, &s, &color))           \
-      {                                                                 \
-        return color;                                                   \
-      }                                                                 \
-    __pocl_read_pixel (&color, i_ptr, coord4); \
-                                                                        \
-    return color;                                                       \
+#define IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD(__IMGTYPE__, __RETVAL__,        \
+                                              __POSTFIX__, __COORD__)         \
+  __RETVAL__ _CL_OVERLOADABLE _CL_READONLY read_image##__POSTFIX__ (                       \
+      __IMGTYPE__ image, sampler_t sampler, __COORD__ coord)                  \
+  {                                                                           \
+    float4 coord4;                                                            \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    READ_SAMPLER                                                              \
+    uint4 color = pocl_read_pixel_floatc (i_ptr, coord4, s);                  \
+    return as_##__RETVAL__ (color);                                           \
   }
-#endif
 
 /* NO Sampler Implementation for read_image with any image data type
    and int coordinates
@@ -247,34 +1608,232 @@ void __pocl_read_pixel (void* color, ADDRESS_SPACE dev_image_t* dev_image, int4
                                   CLK_ADDRESS_NONE |
                                   CLK_FILTER_NEAREST;
 */
-#define IMPLEMENT_READ_IMAGE_INT_COORD_NOSAMPLER(__IMGTYPE__, __RETVAL__, \
-                                                 __POSTFIX__, __COORD__) \
-  __RETVAL__ _CL_OVERLOADABLE read_image##__POSTFIX__ (__IMGTYPE__ image, \
-                                                       __COORD__ coord) \
-  {                                                                     \
-    __RETVAL__ color;                                                   \
-    int4 coord4;                                                        \
-    INITCOORD##__COORD__ (coord4, coord);                               \
-    ADDRESS_SPACE dev_image_t* i_ptr =                                  \
-      __builtin_astype (image, ADDRESS_SPACE dev_image_t*);             \
-    __pocl_read_pixel (&color, i_ptr, coord4);  \
-                                                                        \
-    return color;                                                       \
-  }                                                                     \
 
+#define IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER(                        \
+    __IMGTYPE__, __RETVAL__, __POSTFIX__, __COORD__)                          \
+  __RETVAL__ _CL_OVERLOADABLE _CL_READONLY read_image##__POSTFIX__ (__IMGTYPE__ image,     \
+                                                       __COORD__ coord)       \
+  {                                                                           \
+    int4 coord4;                                                              \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    uint4 color = pocl_read_pixel_intc_samplerless (i_ptr, coord4);           \
+    return as_##__RETVAL__ (color);                                           \
+  }
+
+#define IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER(__IMGTYPE__,          \
+                                                        __COORD__)            \
+  float4 _CL_OVERLOADABLE _CL_READONLY read_imagef (__IMGTYPE__ image, __COORD__ coord)    \
+  {                                                                           \
+    int4 coord4;                                                              \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    uint4 color = pocl_read_pixel_intc_samplerless (i_ptr, coord4);           \
+    return as_float4 (color);                                                 \
+  }
+
+/* NO sampler */
 
-/* read_image 2d function instantions */
-IMPLEMENT_READ_IMAGE_INT_COORD (image2d_t, float4, f, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_t, int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_buffer_t,
+                                                 int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_array_t, int2)
 
-IMPLEMENT_READ_IMAGE_INT_COORD_NOSAMPLER (image2d_t, float4, f, int2)
-IMPLEMENT_READ_IMAGE_INT_COORD_NOSAMPLER (image2d_array_t, float4, f, int4)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image2d_t, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image2d_array_t,
+                                                 int4)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image3d_t, int4)
 
-IMPLEMENT_READ_IMAGE_INT_COORD (image2d_array_t, float4, f, int4)
 
-IMPLEMENT_READ_IMAGE_INT_COORD (image2d_t, uint4, ui, int2)
-IMPLEMENT_READ_IMAGE_INT_COORD (image2d_t, int4, i, int2)
 
-/* read_image 3d function instantions */
-IMPLEMENT_READ_IMAGE_INT_COORD (image3d_t, uint4, ui, int4)
-IMPLEMENT_READ_IMAGE_INT_COORD (image3d_t, float4, f, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_t, uint4, ui,
+                                               int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_t, int4, i,
+                                               int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_array_t, uint4, ui,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_array_t, int4, i,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_buffer_t,
+                                               uint4, ui, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image1d_buffer_t,
+                                               int4, i, int)
 
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image2d_t, uint4, ui,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image2d_t, int4, i,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image2d_array_t,
+                                               uint4, ui, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image2d_array_t, int4,
+                                               i, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image3d_t, uint4, ui,
+                                               int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RO_AQ image3d_t, int4, i,
+                                               int4)
+
+/* float4 img + float coords + sampler */
+
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_t, float)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_buffer_t, float)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_array_t, float2)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image2d_t, float2)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image2d_array_t, float4)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image3d_t, float4)
+
+/* float4 img + int coords + sampler */
+
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_t, int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_buffer_t, int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_array_t, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RO_AQ image2d_t, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RO_AQ image2d_array_t, int4)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RO_AQ image3d_t, int4)
+
+/* int4 img + float coords + sampler */
+
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_t, uint4, ui, float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_t, int4, i, float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_buffer_t, uint4, ui,
+                                       float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_buffer_t, int4, i,
+                                       float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_array_t, uint4, ui,
+                                       float2)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image1d_array_t, int4, i,
+                                       float2)
+
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image2d_t, uint4, ui, float2)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image2d_t, int4, i, float2)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image2d_array_t, uint4, ui,
+                                       float4)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image2d_array_t, int4, i,
+                                       float4)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image3d_t, uint4, ui, float4)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RO_AQ image3d_t, int4, i, float4)
+
+/* int4 img + int coords + sampler */
+
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_t, uint4, ui, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_t, int4, i, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_buffer_t, uint4, ui,
+                                     int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_buffer_t, int4, i, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_array_t, uint4, ui,
+                                     int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image1d_array_t, int4, i, int2)
+
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image2d_t, uint4, ui, int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image2d_t, int4, i, int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image2d_array_t, uint4, ui,
+                                     int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image2d_array_t, int4, i, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image3d_t, uint4, ui, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RO_AQ image3d_t, int4, i, int4)
+
+/******************************************************************************/
+/******************************************************************************/
+
+#ifdef CLANG_HAS_RW_IMAGES
+
+/* NO sampler */
+
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_t, int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_buffer_t,
+                                                 int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_array_t, int2)
+
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image2d_t, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image2d_array_t,
+                                                 int4)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image3d_t, int4)
+
+
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_t, uint4, ui,
+                                               int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_t, int4, i,
+                                               int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_array_t, uint4, ui,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_array_t, int4, i,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_buffer_t,
+                                               uint4, ui, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image1d_buffer_t,
+                                               int4, i, int)
+
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image2d_t, uint4, ui,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image2d_t, int4, i,
+                                               int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image2d_array_t,
+                                               uint4, ui, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image2d_array_t, int4,
+                                               i, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image3d_t, uint4, ui,
+                                               int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD_NOSAMPLER (IMG_RW_AQ image3d_t, int4, i,
+                                               int4)
+
+/* float4 img + float coords + sampler */
+
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_t, float)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_buffer_t, float)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_array_t, float2)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image2d_t, float2)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image2d_array_t, float4)
+IMPLEMENT_READ_FLOAT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image3d_t, float4)
+
+/* float4 img + int coords + sampler */
+
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_t, int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_buffer_t, int)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_array_t, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RW_AQ image2d_t, int2)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RW_AQ image2d_array_t, int4)
+IMPLEMENT_READ_FLOAT4_IMAGE_INT_COORD (IMG_RW_AQ image3d_t, int4)
+
+/* int4 img + float coords + sampler */
+
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_t, uint4, ui, float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_t, int4, i, float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_buffer_t, uint4, ui,
+                                       float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_buffer_t, int4, i,
+                                       float)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_array_t, uint4, ui,
+                                       float2)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image1d_array_t, int4, i,
+                                       float2)
+
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image2d_t, uint4, ui, float2)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image2d_t, int4, i, float2)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image2d_array_t, uint4, ui,
+                                       float4)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image2d_array_t, int4, i,
+                                       float4)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image3d_t, uint4, ui, float4)
+IMPLEMENT_READ_INT4_IMAGE_FLOAT_COORD (IMG_RW_AQ image3d_t, int4, i, float4)
+
+/* int4 img + int coords + sampler */
+
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_t, uint4, ui, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_t, int4, i, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_buffer_t, uint4, ui,
+                                     int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_buffer_t, int4, i, int)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_array_t, uint4, ui,
+                                     int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image1d_array_t, int4, i, int2)
+
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image2d_t, uint4, ui, int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image2d_t, int4, i, int2)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image2d_array_t, uint4, ui,
+                                     int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image2d_array_t, int4, i, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image3d_t, uint4, ui, int4)
+IMPLEMENT_READ_INT4_IMAGE_INT_COORD (IMG_RW_AQ image3d_t, int4, i, int4)
+
+#endif
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/rsqrt.cl
index 3c75ca1..8ac12ea 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/rsqrt.cl
@@ -25,5 +25,3 @@
 #include "templates.h"
 
 DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
-
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
diff --git a/lib/kernel/select.cl b/lib/kernel/select.cl
index c467442..f122180 100644
--- a/lib/kernel/select.cl
+++ b/lib/kernel/select.cl
@@ -40,7 +40,7 @@
    bit). */
 
 #define IMPLEMENT_SELECT_SCALAR(GTYPE, UIGTYPE) \
-  GTYPE __attribute__ ((overloadable))          \
+  GTYPE _CL_OVERLOADABLE _CL_READNONE           \
   select(GTYPE a, GTYPE b, UIGTYPE c)           \
   {                                             \
     return c ? b : a;                           \
@@ -69,14 +69,19 @@ __IF_FP64(
 IMPLEMENT_SELECT_SCALAR(double, long  )
 IMPLEMENT_SELECT_SCALAR(double, ulong ))
 
-
-
+/* clang's ternary operator on extended vectors
+ * behaves suitably for OpenCL on x86-64 */
+#if defined(__x86_64__) && defined(__clang__)
+#define IMPLEMENT_SELECT_VECTOR(GTYPE, UIGTYPE, IGTYPE) \
+  IMPLEMENT_SELECT_SCALAR(GTYPE, UIGTYPE)
+#else
 #define IMPLEMENT_SELECT_VECTOR(GTYPE, UIGTYPE, IGTYPE) \
-  GTYPE __attribute__ ((overloadable))                  \
+  GTYPE _CL_OVERLOADABLE _CL_READNONE                   \
   select(GTYPE a, GTYPE b, UIGTYPE c)                   \
   {                                                     \
     return *(IGTYPE*)&c < (IGTYPE)0 ? b : a;            \
   }
+#endif
 
 IMPLEMENT_SELECT_VECTOR(char2  , char2  , char2 )
 IMPLEMENT_SELECT_VECTOR(char2  , uchar2 , char2 )
diff --git a/lib/kernel/sleef-pocl/README b/lib/kernel/sleef-pocl/README
new file mode 100644
index 0000000..5677abf
--- /dev/null
+++ b/lib/kernel/sleef-pocl/README
@@ -0,0 +1,10 @@
+Most of these file have been generated by
+running `generate.rb` script, then
+formatted by `clang-format -style=GNU`.
+
+These have been edited by hand:
+
+expfrexp.cl
+fma.cl
+frexp.cl
+scalars.cl
diff --git a/lib/kernel/sleef-pocl/acos.cl b/lib/kernel/sleef-pocl/acos.cl
new file mode 100644
index 0000000..d17e1b3
--- /dev/null
+++ b/lib/kernel/sleef-pocl/acos.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_acos (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_acosf_u10 (x);
+#else
+  return Sleef_acosf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_acos (float2 x)
+{
+
+  float lo = _cl_acos (x.lo);
+  float hi = _cl_acos (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_acos (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_acos (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_acos (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_acos (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_acosf4_u10 (x);
+#else
+  return Sleef_acosf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_acos (x.lo);
+  float2 hi = _cl_acos (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_acos (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_acosf8_u10 (x);
+#else
+  return Sleef_acosf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_acos (x.lo);
+  float4 hi = _cl_acos (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_acos (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_acosf16_u10 (x);
+#else
+  return Sleef_acosf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_acos (x.lo);
+  float8 hi = _cl_acos (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_acos (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_acos_u10 (x);
+#else
+  return Sleef_acos_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_acos (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_acosd2_u10 (x);
+#else
+  return Sleef_acosd2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_acos (x.lo);
+  double hi = _cl_acos (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_acos (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_acos (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_acos (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_acos (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_acosd4_u10 (x);
+#else
+  return Sleef_acosd4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_acos (x.lo);
+  double2 hi = _cl_acos (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_acos (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_acosd8_u10 (x);
+#else
+  return Sleef_acosd8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_acos (x.lo);
+  double4 hi = _cl_acos (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_acos (double16 x)
+{
+
+  double8 lo = _cl_acos (x.lo);
+  double8 hi = _cl_acos (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/acosh.cl b/lib/kernel/sleef-pocl/acosh.cl
new file mode 100644
index 0000000..91d3bc5
--- /dev/null
+++ b/lib/kernel/sleef-pocl/acosh.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_acosh (float x)
+{
+  return Sleef_acoshf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_acosh (float2 x)
+{
+
+  float lo = _cl_acosh (x.lo);
+  float hi = _cl_acosh (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_acosh (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_acosh (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_acosh (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_acosh (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_acoshf4_u10 (x);
+#else
+
+  float2 lo = _cl_acosh (x.lo);
+  float2 hi = _cl_acosh (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_acosh (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_acoshf8_u10 (x);
+#else
+
+  float4 lo = _cl_acosh (x.lo);
+  float4 hi = _cl_acosh (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_acosh (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_acoshf16_u10 (x);
+#else
+
+  float8 lo = _cl_acosh (x.lo);
+  float8 hi = _cl_acosh (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_acosh (double x)
+{
+  return Sleef_acosh_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_acosh (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_acoshd2_u10 (x);
+#else
+
+  double lo = _cl_acosh (x.lo);
+  double hi = _cl_acosh (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_acosh (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_acosh (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_acosh (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_acosh (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_acoshd4_u10 (x);
+#else
+
+  double2 lo = _cl_acosh (x.lo);
+  double2 hi = _cl_acosh (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_acosh (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_acoshd8_u10 (x);
+#else
+
+  double4 lo = _cl_acosh (x.lo);
+  double4 hi = _cl_acosh (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_acosh (double16 x)
+{
+
+  double8 lo = _cl_acosh (x.lo);
+  double8 hi = _cl_acosh (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/asin.cl b/lib/kernel/sleef-pocl/asin.cl
new file mode 100644
index 0000000..906fd6f
--- /dev/null
+++ b/lib/kernel/sleef-pocl/asin.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_asin (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_asinf_u10 (x);
+#else
+  return Sleef_asinf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_asin (float2 x)
+{
+
+  float lo = _cl_asin (x.lo);
+  float hi = _cl_asin (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_asin (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_asin (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_asin (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_asin (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_asinf4_u10 (x);
+#else
+  return Sleef_asinf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_asin (x.lo);
+  float2 hi = _cl_asin (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_asin (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_asinf8_u10 (x);
+#else
+  return Sleef_asinf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_asin (x.lo);
+  float4 hi = _cl_asin (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_asin (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_asinf16_u10 (x);
+#else
+  return Sleef_asinf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_asin (x.lo);
+  float8 hi = _cl_asin (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_asin (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_asin_u10 (x);
+#else
+  return Sleef_asin_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_asin (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_asind2_u10 (x);
+#else
+  return Sleef_asind2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_asin (x.lo);
+  double hi = _cl_asin (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_asin (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_asin (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_asin (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_asin (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_asind4_u10 (x);
+#else
+  return Sleef_asind4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_asin (x.lo);
+  double2 hi = _cl_asin (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_asin (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_asind8_u10 (x);
+#else
+  return Sleef_asind8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_asin (x.lo);
+  double4 hi = _cl_asin (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_asin (double16 x)
+{
+
+  double8 lo = _cl_asin (x.lo);
+  double8 hi = _cl_asin (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/asinh.cl b/lib/kernel/sleef-pocl/asinh.cl
new file mode 100644
index 0000000..a8bab96
--- /dev/null
+++ b/lib/kernel/sleef-pocl/asinh.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_asinh (float x)
+{
+  return Sleef_asinhf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_asinh (float2 x)
+{
+
+  float lo = _cl_asinh (x.lo);
+  float hi = _cl_asinh (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_asinh (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_asinh (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_asinh (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_asinh (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_asinhf4_u10 (x);
+#else
+
+  float2 lo = _cl_asinh (x.lo);
+  float2 hi = _cl_asinh (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_asinh (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_asinhf8_u10 (x);
+#else
+
+  float4 lo = _cl_asinh (x.lo);
+  float4 hi = _cl_asinh (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_asinh (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_asinhf16_u10 (x);
+#else
+
+  float8 lo = _cl_asinh (x.lo);
+  float8 hi = _cl_asinh (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_asinh (double x)
+{
+  return Sleef_asinh_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_asinh (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_asinhd2_u10 (x);
+#else
+
+  double lo = _cl_asinh (x.lo);
+  double hi = _cl_asinh (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_asinh (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_asinh (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_asinh (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_asinh (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_asinhd4_u10 (x);
+#else
+
+  double2 lo = _cl_asinh (x.lo);
+  double2 hi = _cl_asinh (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_asinh (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_asinhd8_u10 (x);
+#else
+
+  double4 lo = _cl_asinh (x.lo);
+  double4 hi = _cl_asinh (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_asinh (double16 x)
+{
+
+  double8 lo = _cl_asinh (x.lo);
+  double8 hi = _cl_asinh (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/atan.cl b/lib/kernel/sleef-pocl/atan.cl
new file mode 100644
index 0000000..95f14f4
--- /dev/null
+++ b/lib/kernel/sleef-pocl/atan.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_atan (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_atanf_u10 (x);
+#else
+  return Sleef_atanf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_atan (float2 x)
+{
+
+  float lo = _cl_atan (x.lo);
+  float hi = _cl_atan (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_atan (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_atan (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_atan (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_atan (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atanf4_u10 (x);
+#else
+  return Sleef_atanf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_atan (x.lo);
+  float2 hi = _cl_atan (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_atan (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atanf8_u10 (x);
+#else
+  return Sleef_atanf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_atan (x.lo);
+  float4 hi = _cl_atan (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_atan (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atanf16_u10 (x);
+#else
+  return Sleef_atanf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_atan (x.lo);
+  float8 hi = _cl_atan (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_atan (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_atan_u10 (x);
+#else
+  return Sleef_atan_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_atan (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atand2_u10 (x);
+#else
+  return Sleef_atand2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_atan (x.lo);
+  double hi = _cl_atan (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_atan (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_atan (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_atan (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_atan (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atand4_u10 (x);
+#else
+  return Sleef_atand4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_atan (x.lo);
+  double2 hi = _cl_atan (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_atan (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atand8_u10 (x);
+#else
+  return Sleef_atand8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_atan (x.lo);
+  double4 hi = _cl_atan (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_atan (double16 x)
+{
+
+  double8 lo = _cl_atan (x.lo);
+  double8 hi = _cl_atan (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/atan2.cl b/lib/kernel/sleef-pocl/atan2.cl
new file mode 100644
index 0000000..70c4894
--- /dev/null
+++ b/lib/kernel/sleef-pocl/atan2.cl
@@ -0,0 +1,231 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_atan2 (float x, float y)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2f_u10 (x, y);
+#else
+  return Sleef_atan2f_u35 (x, y);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_atan2 (float2 x, float2 y)
+{
+
+  float lo = _cl_atan2 (x.lo, y.lo);
+  float hi = _cl_atan2 (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_atan2 (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_atan2 (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_atan2 (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_atan2 (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2f4_u10 (x, y);
+#else
+  return Sleef_atan2f4_u35 (x, y);
+#endif
+
+#else
+
+  float2 lo = _cl_atan2 (x.lo, y.lo);
+  float2 hi = _cl_atan2 (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_atan2 (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2f8_u10 (x, y);
+#else
+  return Sleef_atan2f8_u35 (x, y);
+#endif
+
+#else
+
+  float4 lo = _cl_atan2 (x.lo, y.lo);
+  float4 hi = _cl_atan2 (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_atan2 (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2f16_u10 (x, y);
+#else
+  return Sleef_atan2f16_u35 (x, y);
+#endif
+
+#else
+
+  float8 lo = _cl_atan2 (x.lo, y.lo);
+  float8 hi = _cl_atan2 (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_atan2 (double x, double y)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2_u10 (x, y);
+#else
+  return Sleef_atan2_u35 (x, y);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_atan2 (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2d2_u10 (x, y);
+#else
+  return Sleef_atan2d2_u35 (x, y);
+#endif
+
+#else
+
+  double lo = _cl_atan2 (x.lo, y.lo);
+  double hi = _cl_atan2 (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_atan2 (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_atan2 (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_atan2 (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_atan2 (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2d4_u10 (x, y);
+#else
+  return Sleef_atan2d4_u35 (x, y);
+#endif
+
+#else
+
+  double2 lo = _cl_atan2 (x.lo, y.lo);
+  double2 hi = _cl_atan2 (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_atan2 (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_atan2d8_u10 (x, y);
+#else
+  return Sleef_atan2d8_u35 (x, y);
+#endif
+
+#else
+
+  double4 lo = _cl_atan2 (x.lo, y.lo);
+  double4 hi = _cl_atan2 (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_atan2 (double16 x, double16 y)
+{
+
+  double8 lo = _cl_atan2 (x.lo, y.lo);
+  double8 hi = _cl_atan2 (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/atanh.cl b/lib/kernel/sleef-pocl/atanh.cl
new file mode 100644
index 0000000..407b846
--- /dev/null
+++ b/lib/kernel/sleef-pocl/atanh.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_atanh (float x)
+{
+  return Sleef_atanhf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_atanh (float2 x)
+{
+
+  float lo = _cl_atanh (x.lo);
+  float hi = _cl_atanh (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_atanh (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_atanh (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_atanh (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_atanh (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_atanhf4_u10 (x);
+#else
+
+  float2 lo = _cl_atanh (x.lo);
+  float2 hi = _cl_atanh (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_atanh (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_atanhf8_u10 (x);
+#else
+
+  float4 lo = _cl_atanh (x.lo);
+  float4 hi = _cl_atanh (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_atanh (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_atanhf16_u10 (x);
+#else
+
+  float8 lo = _cl_atanh (x.lo);
+  float8 hi = _cl_atanh (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_atanh (double x)
+{
+  return Sleef_atanh_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_atanh (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_atanhd2_u10 (x);
+#else
+
+  double lo = _cl_atanh (x.lo);
+  double hi = _cl_atanh (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_atanh (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_atanh (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_atanh (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_atanh (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_atanhd4_u10 (x);
+#else
+
+  double2 lo = _cl_atanh (x.lo);
+  double2 hi = _cl_atanh (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_atanh (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_atanhd8_u10 (x);
+#else
+
+  double4 lo = _cl_atanh (x.lo);
+  double4 hi = _cl_atanh (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_atanh (double16 x)
+{
+
+  double8 lo = _cl_atanh (x.lo);
+  double8 hi = _cl_atanh (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/cbrt.cl b/lib/kernel/sleef-pocl/cbrt.cl
new file mode 100644
index 0000000..95e602e
--- /dev/null
+++ b/lib/kernel/sleef-pocl/cbrt.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_cbrt (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtf_u10 (x);
+#else
+  return Sleef_cbrtf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_cbrt (float2 x)
+{
+
+  float lo = _cl_cbrt (x.lo);
+  float hi = _cl_cbrt (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_cbrt (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_cbrt (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_cbrt (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_cbrt (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtf4_u10 (x);
+#else
+  return Sleef_cbrtf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_cbrt (x.lo);
+  float2 hi = _cl_cbrt (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_cbrt (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtf8_u10 (x);
+#else
+  return Sleef_cbrtf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_cbrt (x.lo);
+  float4 hi = _cl_cbrt (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_cbrt (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtf16_u10 (x);
+#else
+  return Sleef_cbrtf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_cbrt (x.lo);
+  float8 hi = _cl_cbrt (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_cbrt (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrt_u10 (x);
+#else
+  return Sleef_cbrt_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_cbrt (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtd2_u10 (x);
+#else
+  return Sleef_cbrtd2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_cbrt (x.lo);
+  double hi = _cl_cbrt (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_cbrt (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_cbrt (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_cbrt (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_cbrt (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtd4_u10 (x);
+#else
+  return Sleef_cbrtd4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_cbrt (x.lo);
+  double2 hi = _cl_cbrt (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_cbrt (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cbrtd8_u10 (x);
+#else
+  return Sleef_cbrtd8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_cbrt (x.lo);
+  double4 hi = _cl_cbrt (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_cbrt (double16 x)
+{
+
+  double8 lo = _cl_cbrt (x.lo);
+  double8 hi = _cl_cbrt (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/ceil.cl b/lib/kernel/sleef-pocl/ceil.cl
new file mode 100644
index 0000000..7c48637
--- /dev/null
+++ b/lib/kernel/sleef-pocl/ceil.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_ceil (float x)
+{
+  return Sleef_ceilf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_ceil (float2 x)
+{
+
+  float lo = _cl_ceil (x.lo);
+  float hi = _cl_ceil (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_ceil (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_ceil (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_ceil (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_ceil (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_ceilf4 (x);
+#else
+
+  float2 lo = _cl_ceil (x.lo);
+  float2 hi = _cl_ceil (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_ceil (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_ceilf8 (x);
+#else
+
+  float4 lo = _cl_ceil (x.lo);
+  float4 hi = _cl_ceil (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_ceil (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_ceilf16 (x);
+#else
+
+  float8 lo = _cl_ceil (x.lo);
+  float8 hi = _cl_ceil (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_ceil (double x)
+{
+  return Sleef_ceil (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_ceil (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ceild2 (x);
+#else
+
+  double lo = _cl_ceil (x.lo);
+  double hi = _cl_ceil (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_ceil (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_ceil (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_ceil (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_ceil (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ceild4 (x);
+#else
+
+  double2 lo = _cl_ceil (x.lo);
+  double2 hi = _cl_ceil (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_ceil (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ceild8 (x);
+#else
+
+  double4 lo = _cl_ceil (x.lo);
+  double4 hi = _cl_ceil (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_ceil (double16 x)
+{
+
+  double8 lo = _cl_ceil (x.lo);
+  double8 hi = _cl_ceil (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/copysign.cl b/lib/kernel/sleef-pocl/copysign.cl
new file mode 100644
index 0000000..23ffba3
--- /dev/null
+++ b/lib/kernel/sleef-pocl/copysign.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_copysign (float x, float y)
+{
+  return Sleef_copysignf (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_copysign (float2 x, float2 y)
+{
+
+  float lo = _cl_copysign (x.lo, y.lo);
+  float hi = _cl_copysign (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_copysign (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_copysign (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_copysign (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_copysign (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_copysignf4 (x, y);
+#else
+
+  float2 lo = _cl_copysign (x.lo, y.lo);
+  float2 hi = _cl_copysign (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_copysign (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_copysignf8 (x, y);
+#else
+
+  float4 lo = _cl_copysign (x.lo, y.lo);
+  float4 hi = _cl_copysign (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_copysign (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_copysignf16 (x, y);
+#else
+
+  float8 lo = _cl_copysign (x.lo, y.lo);
+  float8 hi = _cl_copysign (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_copysign (double x, double y)
+{
+  return Sleef_copysign (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_copysign (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_copysignd2 (x, y);
+#else
+
+  double lo = _cl_copysign (x.lo, y.lo);
+  double hi = _cl_copysign (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_copysign (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_copysign (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_copysign (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_copysign (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_copysignd4 (x, y);
+#else
+
+  double2 lo = _cl_copysign (x.lo, y.lo);
+  double2 hi = _cl_copysign (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_copysign (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_copysignd8 (x, y);
+#else
+
+  double4 lo = _cl_copysign (x.lo, y.lo);
+  double4 hi = _cl_copysign (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_copysign (double16 x, double16 y)
+{
+
+  double8 lo = _cl_copysign (x.lo, y.lo);
+  double8 hi = _cl_copysign (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/cos.cl b/lib/kernel/sleef-pocl/cos.cl
new file mode 100644
index 0000000..97194a2
--- /dev/null
+++ b/lib/kernel/sleef-pocl/cos.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_cos (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_cosf_u10 (x);
+#else
+  return Sleef_cosf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_cos (float2 x)
+{
+
+  float lo = _cl_cos (x.lo);
+  float hi = _cl_cos (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_cos (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_cos (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_cos (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_cos (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cosf4_u10 (x);
+#else
+  return Sleef_cosf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_cos (x.lo);
+  float2 hi = _cl_cos (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_cos (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cosf8_u10 (x);
+#else
+  return Sleef_cosf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_cos (x.lo);
+  float4 hi = _cl_cos (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_cos (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cosf16_u10 (x);
+#else
+  return Sleef_cosf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_cos (x.lo);
+  float8 hi = _cl_cos (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_cos (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_cos_u10 (x);
+#else
+  return Sleef_cos_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_cos (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cosd2_u10 (x);
+#else
+  return Sleef_cosd2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_cos (x.lo);
+  double hi = _cl_cos (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_cos (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_cos (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_cos (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_cos (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cosd4_u10 (x);
+#else
+  return Sleef_cosd4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_cos (x.lo);
+  double2 hi = _cl_cos (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_cos (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_cosd8_u10 (x);
+#else
+  return Sleef_cosd8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_cos (x.lo);
+  double4 hi = _cl_cos (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_cos (double16 x)
+{
+
+  double8 lo = _cl_cos (x.lo);
+  double8 hi = _cl_cos (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/cosh.cl b/lib/kernel/sleef-pocl/cosh.cl
new file mode 100644
index 0000000..bd5573b
--- /dev/null
+++ b/lib/kernel/sleef-pocl/cosh.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_cosh (float x)
+{
+  return Sleef_coshf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_cosh (float2 x)
+{
+
+  float lo = _cl_cosh (x.lo);
+  float hi = _cl_cosh (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_cosh (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_cosh (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_cosh (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_cosh (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_coshf4_u10 (x);
+#else
+
+  float2 lo = _cl_cosh (x.lo);
+  float2 hi = _cl_cosh (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_cosh (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_coshf8_u10 (x);
+#else
+
+  float4 lo = _cl_cosh (x.lo);
+  float4 hi = _cl_cosh (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_cosh (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_coshf16_u10 (x);
+#else
+
+  float8 lo = _cl_cosh (x.lo);
+  float8 hi = _cl_cosh (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_cosh (double x)
+{
+  return Sleef_cosh_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_cosh (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_coshd2_u10 (x);
+#else
+
+  double lo = _cl_cosh (x.lo);
+  double hi = _cl_cosh (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_cosh (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_cosh (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_cosh (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_cosh (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_coshd4_u10 (x);
+#else
+
+  double2 lo = _cl_cosh (x.lo);
+  double2 hi = _cl_cosh (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_cosh (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_coshd8_u10 (x);
+#else
+
+  double4 lo = _cl_cosh (x.lo);
+  double4 hi = _cl_cosh (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_cosh (double16 x)
+{
+
+  double8 lo = _cl_cosh (x.lo);
+  double8 hi = _cl_cosh (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/cospi.cl b/lib/kernel/sleef-pocl/cospi.cl
new file mode 100644
index 0000000..8da0c16
--- /dev/null
+++ b/lib/kernel/sleef-pocl/cospi.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_cospi (float x)
+{
+  return Sleef_cospif_u05 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_cospi (float2 x)
+{
+
+  float lo = _cl_cospi (x.lo);
+  float hi = _cl_cospi (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_cospi (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_cospi (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_cospi (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_cospi (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_cospif4_u05 (x);
+#else
+
+  float2 lo = _cl_cospi (x.lo);
+  float2 hi = _cl_cospi (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_cospi (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_cospif8_u05 (x);
+#else
+
+  float4 lo = _cl_cospi (x.lo);
+  float4 hi = _cl_cospi (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_cospi (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_cospif16_u05 (x);
+#else
+
+  float8 lo = _cl_cospi (x.lo);
+  float8 hi = _cl_cospi (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_cospi (double x)
+{
+  return Sleef_cospi_u05 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_cospi (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_cospid2_u05 (x);
+#else
+
+  double lo = _cl_cospi (x.lo);
+  double hi = _cl_cospi (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_cospi (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_cospi (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_cospi (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_cospi (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_cospid4_u05 (x);
+#else
+
+  double2 lo = _cl_cospi (x.lo);
+  double2 hi = _cl_cospi (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_cospi (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_cospid8_u05 (x);
+#else
+
+  double4 lo = _cl_cospi (x.lo);
+  double4 hi = _cl_cospi (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_cospi (double16 x)
+{
+
+  double8 lo = _cl_cospi (x.lo);
+  double8 hi = _cl_cospi (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/erf.cl b/lib/kernel/sleef-pocl/erf.cl
new file mode 100644
index 0000000..dcdd6e4
--- /dev/null
+++ b/lib/kernel/sleef-pocl/erf.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_erf (float x)
+{
+  return Sleef_erff_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_erf (float2 x)
+{
+
+  float lo = _cl_erf (x.lo);
+  float hi = _cl_erf (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_erf (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_erf (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_erf (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_erf (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_erff4_u10 (x);
+#else
+
+  float2 lo = _cl_erf (x.lo);
+  float2 hi = _cl_erf (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_erf (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_erff8_u10 (x);
+#else
+
+  float4 lo = _cl_erf (x.lo);
+  float4 hi = _cl_erf (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_erf (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_erff16_u10 (x);
+#else
+
+  float8 lo = _cl_erf (x.lo);
+  float8 hi = _cl_erf (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_erf (double x)
+{
+  return Sleef_erf_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_erf (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_erfd2_u10 (x);
+#else
+
+  double lo = _cl_erf (x.lo);
+  double hi = _cl_erf (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_erf (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_erf (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_erf (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_erf (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_erfd4_u10 (x);
+#else
+
+  double2 lo = _cl_erf (x.lo);
+  double2 hi = _cl_erf (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_erf (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_erfd8_u10 (x);
+#else
+
+  double4 lo = _cl_erf (x.lo);
+  double4 hi = _cl_erf (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_erf (double16 x)
+{
+
+  double8 lo = _cl_erf (x.lo);
+  double8 hi = _cl_erf (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/erfc.cl b/lib/kernel/sleef-pocl/erfc.cl
new file mode 100644
index 0000000..4066bd9
--- /dev/null
+++ b/lib/kernel/sleef-pocl/erfc.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_erfc (float x)
+{
+  return Sleef_erfcf_u15 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_erfc (float2 x)
+{
+
+  float lo = _cl_erfc (x.lo);
+  float hi = _cl_erfc (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_erfc (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_erfc (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_erfc (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_erfc (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_erfcf4_u15 (x);
+#else
+
+  float2 lo = _cl_erfc (x.lo);
+  float2 hi = _cl_erfc (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_erfc (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_erfcf8_u15 (x);
+#else
+
+  float4 lo = _cl_erfc (x.lo);
+  float4 hi = _cl_erfc (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_erfc (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_erfcf16_u15 (x);
+#else
+
+  float8 lo = _cl_erfc (x.lo);
+  float8 hi = _cl_erfc (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_erfc (double x)
+{
+  return Sleef_erfc_u15 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_erfc (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_erfcd2_u15 (x);
+#else
+
+  double lo = _cl_erfc (x.lo);
+  double hi = _cl_erfc (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_erfc (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_erfc (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_erfc (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_erfc (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_erfcd4_u15 (x);
+#else
+
+  double2 lo = _cl_erfc (x.lo);
+  double2 hi = _cl_erfc (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_erfc (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_erfcd8_u15 (x);
+#else
+
+  double4 lo = _cl_erfc (x.lo);
+  double4 hi = _cl_erfc (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_erfc (double16 x)
+{
+
+  double8 lo = _cl_erfc (x.lo);
+  double8 hi = _cl_erfc (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/exp.cl b/lib/kernel/sleef-pocl/exp.cl
new file mode 100644
index 0000000..44ad6b1
--- /dev/null
+++ b/lib/kernel/sleef-pocl/exp.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_exp (float x)
+{
+  return Sleef_expf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_exp (float2 x)
+{
+
+  float lo = _cl_exp (x.lo);
+  float hi = _cl_exp (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_exp (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_exp (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_exp (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_exp (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_expf4_u10 (x);
+#else
+
+  float2 lo = _cl_exp (x.lo);
+  float2 hi = _cl_exp (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_exp (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_expf8_u10 (x);
+#else
+
+  float4 lo = _cl_exp (x.lo);
+  float4 hi = _cl_exp (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_exp (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_expf16_u10 (x);
+#else
+
+  float8 lo = _cl_exp (x.lo);
+  float8 hi = _cl_exp (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_exp (double x)
+{
+  return Sleef_exp_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_exp (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expd2_u10 (x);
+#else
+
+  double lo = _cl_exp (x.lo);
+  double hi = _cl_exp (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_exp (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_exp (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_exp (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_exp (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expd4_u10 (x);
+#else
+
+  double2 lo = _cl_exp (x.lo);
+  double2 hi = _cl_exp (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_exp (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expd8_u10 (x);
+#else
+
+  double4 lo = _cl_exp (x.lo);
+  double4 hi = _cl_exp (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_exp (double16 x)
+{
+
+  double8 lo = _cl_exp (x.lo);
+  double8 hi = _cl_exp (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/exp10.cl b/lib/kernel/sleef-pocl/exp10.cl
new file mode 100644
index 0000000..1ba1cbf
--- /dev/null
+++ b/lib/kernel/sleef-pocl/exp10.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_exp10 (float x)
+{
+  return Sleef_exp10f_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_exp10 (float2 x)
+{
+
+  float lo = _cl_exp10 (x.lo);
+  float hi = _cl_exp10 (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_exp10 (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_exp10 (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_exp10 (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_exp10 (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_exp10f4_u10 (x);
+#else
+
+  float2 lo = _cl_exp10 (x.lo);
+  float2 hi = _cl_exp10 (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_exp10 (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_exp10f8_u10 (x);
+#else
+
+  float4 lo = _cl_exp10 (x.lo);
+  float4 hi = _cl_exp10 (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_exp10 (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_exp10f16_u10 (x);
+#else
+
+  float8 lo = _cl_exp10 (x.lo);
+  float8 hi = _cl_exp10 (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_exp10 (double x)
+{
+  return Sleef_exp10_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_exp10 (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_exp10d2_u10 (x);
+#else
+
+  double lo = _cl_exp10 (x.lo);
+  double hi = _cl_exp10 (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_exp10 (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_exp10 (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_exp10 (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_exp10 (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_exp10d4_u10 (x);
+#else
+
+  double2 lo = _cl_exp10 (x.lo);
+  double2 hi = _cl_exp10 (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_exp10 (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_exp10d8_u10 (x);
+#else
+
+  double4 lo = _cl_exp10 (x.lo);
+  double4 hi = _cl_exp10 (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_exp10 (double16 x)
+{
+
+  double8 lo = _cl_exp10 (x.lo);
+  double8 hi = _cl_exp10 (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/exp2.cl b/lib/kernel/sleef-pocl/exp2.cl
new file mode 100644
index 0000000..350c460
--- /dev/null
+++ b/lib/kernel/sleef-pocl/exp2.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_exp2 (float x)
+{
+  return Sleef_exp2f_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_exp2 (float2 x)
+{
+
+  float lo = _cl_exp2 (x.lo);
+  float hi = _cl_exp2 (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_exp2 (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_exp2 (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_exp2 (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_exp2 (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_exp2f4_u10 (x);
+#else
+
+  float2 lo = _cl_exp2 (x.lo);
+  float2 hi = _cl_exp2 (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_exp2 (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_exp2f8_u10 (x);
+#else
+
+  float4 lo = _cl_exp2 (x.lo);
+  float4 hi = _cl_exp2 (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_exp2 (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_exp2f16_u10 (x);
+#else
+
+  float8 lo = _cl_exp2 (x.lo);
+  float8 hi = _cl_exp2 (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_exp2 (double x)
+{
+  return Sleef_exp2_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_exp2 (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_exp2d2_u10 (x);
+#else
+
+  double lo = _cl_exp2 (x.lo);
+  double hi = _cl_exp2 (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_exp2 (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_exp2 (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_exp2 (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_exp2 (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_exp2d4_u10 (x);
+#else
+
+  double2 lo = _cl_exp2 (x.lo);
+  double2 hi = _cl_exp2 (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_exp2 (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_exp2d8_u10 (x);
+#else
+
+  double4 lo = _cl_exp2 (x.lo);
+  double4 hi = _cl_exp2 (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_exp2 (double16 x)
+{
+
+  double8 lo = _cl_exp2 (x.lo);
+  double8 hi = _cl_exp2 (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/expfrexp.cl b/lib/kernel/sleef-pocl/expfrexp.cl
new file mode 100644
index 0000000..7c3c595
--- /dev/null
+++ b/lib/kernel/sleef-pocl/expfrexp.cl
@@ -0,0 +1,193 @@
+#include "sleef_cl.h"
+
+_CL_ALWAYSINLINE long2 Sleef_expfrexpd2_long (double2 x);
+_CL_ALWAYSINLINE long4 Sleef_expfrexpd4_long (double4 x);
+_CL_ALWAYSINLINE long8 Sleef_expfrexpd8_long (double8 x);
+
+
+_CL_OVERLOADABLE
+int
+_cl_expfrexp (float x)
+{
+  return Sleef_expfrexpf (x);
+}
+
+_CL_OVERLOADABLE
+int2
+_cl_expfrexp (float2 x)
+{
+
+  int lo = _cl_expfrexp (x.lo);
+  int hi = _cl_expfrexp (x.hi);
+  return (int2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+int4 _cl_expfrexp (float4);
+
+_CL_OVERLOADABLE
+int3
+_cl_expfrexp (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  int4 r = _cl_expfrexp (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+int4
+_cl_expfrexp (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_expfrexpf4 (x);
+#else
+
+  int2 lo = _cl_expfrexp (x.lo);
+  int2 hi = _cl_expfrexp (x.hi);
+  return (int4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+int8
+_cl_expfrexp (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_expfrexpf8 (x);
+#else
+
+  int4 lo = _cl_expfrexp (x.lo);
+  int4 hi = _cl_expfrexp (x.hi);
+  return (int8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+int16
+_cl_expfrexp (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_expfrexpf16 (x);
+#else
+
+  int8 lo = _cl_expfrexp (x.lo);
+  int8 hi = _cl_expfrexp (x.hi);
+  return (int16) (lo, hi);
+
+#endif
+}
+
+/******************************************************************/
+/******************************************************************/
+/******************************************************************/
+/******************************************************************/
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+long
+_cl_expfrexp (double x)
+{
+  return convert_long (Sleef_expfrexp (x));
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+long2
+_cl_expfrexp (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expfrexpd2_long (x);
+#else
+
+  long lo = _cl_expfrexp (x.lo);
+  long hi = _cl_expfrexp (x.hi);
+  return (long2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+long4 _cl_expfrexp (double4);
+
+_CL_OVERLOADABLE
+long3
+_cl_expfrexp (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  long4 r = _cl_expfrexp (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+long4
+_cl_expfrexp (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expfrexpd4_long (x);
+#else
+
+  long2 lo = _cl_expfrexp (x.lo);
+  long2 hi = _cl_expfrexp (x.hi);
+  return (long4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+long8
+_cl_expfrexp (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expfrexpd8_long (x);
+#else
+
+  long4 lo = _cl_expfrexp (x.lo);
+  long4 hi = _cl_expfrexp (x.hi);
+  return (long8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+long16
+_cl_expfrexp (double16 x)
+{
+
+  long8 lo = _cl_expfrexp (x.lo);
+  long8 hi = _cl_expfrexp (x.hi);
+  return (long16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/expm1.cl b/lib/kernel/sleef-pocl/expm1.cl
new file mode 100644
index 0000000..c502a93
--- /dev/null
+++ b/lib/kernel/sleef-pocl/expm1.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_expm1 (float x)
+{
+  return Sleef_expm1f_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_expm1 (float2 x)
+{
+
+  float lo = _cl_expm1 (x.lo);
+  float hi = _cl_expm1 (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_expm1 (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_expm1 (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_expm1 (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_expm1 (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_expm1f4_u10 (x);
+#else
+
+  float2 lo = _cl_expm1 (x.lo);
+  float2 hi = _cl_expm1 (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_expm1 (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_expm1f8_u10 (x);
+#else
+
+  float4 lo = _cl_expm1 (x.lo);
+  float4 hi = _cl_expm1 (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_expm1 (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_expm1f16_u10 (x);
+#else
+
+  float8 lo = _cl_expm1 (x.lo);
+  float8 hi = _cl_expm1 (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_expm1 (double x)
+{
+  return Sleef_expm1_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_expm1 (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expm1d2_u10 (x);
+#else
+
+  double lo = _cl_expm1 (x.lo);
+  double hi = _cl_expm1 (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_expm1 (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_expm1 (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_expm1 (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_expm1 (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expm1d4_u10 (x);
+#else
+
+  double2 lo = _cl_expm1 (x.lo);
+  double2 hi = _cl_expm1 (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_expm1 (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_expm1d8_u10 (x);
+#else
+
+  double4 lo = _cl_expm1 (x.lo);
+  double4 hi = _cl_expm1 (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_expm1 (double16 x)
+{
+
+  double8 lo = _cl_expm1 (x.lo);
+  double8 hi = _cl_expm1 (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/fabs.cl b/lib/kernel/sleef-pocl/fabs.cl
new file mode 100644
index 0000000..7460a6f
--- /dev/null
+++ b/lib/kernel/sleef-pocl/fabs.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_fabs (float x)
+{
+  return Sleef_fabsf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_fabs (float2 x)
+{
+
+  float lo = _cl_fabs (x.lo);
+  float hi = _cl_fabs (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_fabs (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_fabs (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_fabs (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_fabs (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_fabsf4 (x);
+#else
+
+  float2 lo = _cl_fabs (x.lo);
+  float2 hi = _cl_fabs (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_fabs (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_fabsf8 (x);
+#else
+
+  float4 lo = _cl_fabs (x.lo);
+  float4 hi = _cl_fabs (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_fabs (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_fabsf16 (x);
+#else
+
+  float8 lo = _cl_fabs (x.lo);
+  float8 hi = _cl_fabs (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_fabs (double x)
+{
+  return Sleef_fabs (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_fabs (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fabsd2 (x);
+#else
+
+  double lo = _cl_fabs (x.lo);
+  double hi = _cl_fabs (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_fabs (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_fabs (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_fabs (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_fabs (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fabsd4 (x);
+#else
+
+  double2 lo = _cl_fabs (x.lo);
+  double2 hi = _cl_fabs (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_fabs (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fabsd8 (x);
+#else
+
+  double4 lo = _cl_fabs (x.lo);
+  double4 hi = _cl_fabs (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_fabs (double16 x)
+{
+
+  double8 lo = _cl_fabs (x.lo);
+  double8 hi = _cl_fabs (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/fdim.cl b/lib/kernel/sleef-pocl/fdim.cl
new file mode 100644
index 0000000..592ec86
--- /dev/null
+++ b/lib/kernel/sleef-pocl/fdim.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_fdim (float x, float y)
+{
+  return Sleef_fdimf (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_fdim (float2 x, float2 y)
+{
+
+  float lo = _cl_fdim (x.lo, y.lo);
+  float hi = _cl_fdim (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_fdim (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_fdim (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_fdim (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_fdim (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_fdimf4 (x, y);
+#else
+
+  float2 lo = _cl_fdim (x.lo, y.lo);
+  float2 hi = _cl_fdim (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_fdim (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_fdimf8 (x, y);
+#else
+
+  float4 lo = _cl_fdim (x.lo, y.lo);
+  float4 hi = _cl_fdim (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_fdim (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_fdimf16 (x, y);
+#else
+
+  float8 lo = _cl_fdim (x.lo, y.lo);
+  float8 hi = _cl_fdim (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_fdim (double x, double y)
+{
+  return Sleef_fdim (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_fdim (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fdimd2 (x, y);
+#else
+
+  double lo = _cl_fdim (x.lo, y.lo);
+  double hi = _cl_fdim (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_fdim (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_fdim (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_fdim (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_fdim (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fdimd4 (x, y);
+#else
+
+  double2 lo = _cl_fdim (x.lo, y.lo);
+  double2 hi = _cl_fdim (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_fdim (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fdimd8 (x, y);
+#else
+
+  double4 lo = _cl_fdim (x.lo, y.lo);
+  double4 hi = _cl_fdim (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_fdim (double16 x, double16 y)
+{
+
+  double8 lo = _cl_fdim (x.lo, y.lo);
+  double8 hi = _cl_fdim (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/floor.cl b/lib/kernel/sleef-pocl/floor.cl
new file mode 100644
index 0000000..8cf164c
--- /dev/null
+++ b/lib/kernel/sleef-pocl/floor.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_floor (float x)
+{
+  return Sleef_floorf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_floor (float2 x)
+{
+
+  float lo = _cl_floor (x.lo);
+  float hi = _cl_floor (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_floor (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_floor (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_floor (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_floor (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_floorf4 (x);
+#else
+
+  float2 lo = _cl_floor (x.lo);
+  float2 hi = _cl_floor (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_floor (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_floorf8 (x);
+#else
+
+  float4 lo = _cl_floor (x.lo);
+  float4 hi = _cl_floor (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_floor (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_floorf16 (x);
+#else
+
+  float8 lo = _cl_floor (x.lo);
+  float8 hi = _cl_floor (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_floor (double x)
+{
+  return Sleef_floor (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_floor (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_floord2 (x);
+#else
+
+  double lo = _cl_floor (x.lo);
+  double hi = _cl_floor (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_floor (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_floor (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_floor (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_floor (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_floord4 (x);
+#else
+
+  double2 lo = _cl_floor (x.lo);
+  double2 hi = _cl_floor (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_floor (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_floord8 (x);
+#else
+
+  double4 lo = _cl_floor (x.lo);
+  double4 hi = _cl_floor (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_floor (double16 x)
+{
+
+  double8 lo = _cl_floor (x.lo);
+  double8 hi = _cl_floor (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/fma.cl b/lib/kernel/sleef-pocl/fma.cl
new file mode 100644
index 0000000..bcd2be9
--- /dev/null
+++ b/lib/kernel/sleef-pocl/fma.cl
@@ -0,0 +1,187 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_fma (float x, float y, float z)
+{
+  return Sleef_fmaf (x, y, z);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_fma (float2 x, float2 y, float2 z)
+{
+
+  float lo = _cl_fma (x.lo, y.lo, z.lo);
+  float hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_fma (float4, float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_fma (float3 x, float3 y, float3 z)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+  float4 z_3to4 = (float4) (z, (float)0);
+
+  float4 r = _cl_fma (x_3to4, y_3to4, z_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_fma (float4 x, float4 y, float4 z)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(HAVE_FMA32_128)
+  return Sleef_fmaf4 (x, y, z);
+#else
+
+  float2 lo = _cl_fma (x.lo, y.lo, z.lo);
+  float2 hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_fma (float8 x, float8 y, float8 z)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(HAVE_FMA32_256)
+  return Sleef_fmaf8 (x, y, z);
+#else
+
+  float4 lo = _cl_fma (x.lo, y.lo, z.lo);
+  float4 hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_fma (float16 x, float16 y, float16 z)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(HAVE_FMA32_512)
+  return Sleef_fmaf16 (x, y, z);
+#else
+
+  float8 lo = _cl_fma (x.lo, y.lo, z.lo);
+  float8 hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_fma (double x, double y, double z)
+{
+  return Sleef_fma (x, y, z);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_fma (double2 x, double2 y, double2 z)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE) && defined(HAVE_FMA64_128)
+  return Sleef_fmad2 (x, y, z);
+#else
+
+  double lo = _cl_fma (x.lo, y.lo, z.lo);
+  double hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_fma (double4, double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_fma (double3 x, double3 y, double3 z)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+  double4 z_3to4 = (double4) (z, (double)0);
+
+  double4 r = _cl_fma (x_3to4, y_3to4, z_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_fma (double4 x, double4 y, double4 z)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE) && defined(HAVE_FMA64_256)
+  return Sleef_fmad4 (x, y, z);
+#else
+
+  double2 lo = _cl_fma (x.lo, y.lo, z.lo);
+  double2 hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_fma (double8 x, double8 y, double8 z)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE) && defined(HAVE_FMA64_512)
+  return Sleef_fmad8 (x, y, z);
+#else
+
+  double4 lo = _cl_fma (x.lo, y.lo, z.lo);
+  double4 hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_fma (double16 x, double16 y, double16 z)
+{
+
+  double8 lo = _cl_fma (x.lo, y.lo, z.lo);
+  double8 hi = _cl_fma (x.hi, y.hi, z.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/fmax.cl b/lib/kernel/sleef-pocl/fmax.cl
new file mode 100644
index 0000000..9986a01
--- /dev/null
+++ b/lib/kernel/sleef-pocl/fmax.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_fmax (float x, float y)
+{
+  return Sleef_fmaxf (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_fmax (float2 x, float2 y)
+{
+
+  float lo = _cl_fmax (x.lo, y.lo);
+  float hi = _cl_fmax (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_fmax (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_fmax (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_fmax (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_fmax (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_fmaxf4 (x, y);
+#else
+
+  float2 lo = _cl_fmax (x.lo, y.lo);
+  float2 hi = _cl_fmax (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_fmax (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_fmaxf8 (x, y);
+#else
+
+  float4 lo = _cl_fmax (x.lo, y.lo);
+  float4 hi = _cl_fmax (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_fmax (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_fmaxf16 (x, y);
+#else
+
+  float8 lo = _cl_fmax (x.lo, y.lo);
+  float8 hi = _cl_fmax (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_fmax (double x, double y)
+{
+  return Sleef_fmax (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_fmax (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmaxd2 (x, y);
+#else
+
+  double lo = _cl_fmax (x.lo, y.lo);
+  double hi = _cl_fmax (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_fmax (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_fmax (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_fmax (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_fmax (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmaxd4 (x, y);
+#else
+
+  double2 lo = _cl_fmax (x.lo, y.lo);
+  double2 hi = _cl_fmax (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_fmax (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmaxd8 (x, y);
+#else
+
+  double4 lo = _cl_fmax (x.lo, y.lo);
+  double4 hi = _cl_fmax (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_fmax (double16 x, double16 y)
+{
+
+  double8 lo = _cl_fmax (x.lo, y.lo);
+  double8 hi = _cl_fmax (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/fmin.cl b/lib/kernel/sleef-pocl/fmin.cl
new file mode 100644
index 0000000..1bc051d
--- /dev/null
+++ b/lib/kernel/sleef-pocl/fmin.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_fmin (float x, float y)
+{
+  return Sleef_fminf (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_fmin (float2 x, float2 y)
+{
+
+  float lo = _cl_fmin (x.lo, y.lo);
+  float hi = _cl_fmin (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_fmin (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_fmin (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_fmin (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_fmin (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_fminf4 (x, y);
+#else
+
+  float2 lo = _cl_fmin (x.lo, y.lo);
+  float2 hi = _cl_fmin (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_fmin (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_fminf8 (x, y);
+#else
+
+  float4 lo = _cl_fmin (x.lo, y.lo);
+  float4 hi = _cl_fmin (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_fmin (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_fminf16 (x, y);
+#else
+
+  float8 lo = _cl_fmin (x.lo, y.lo);
+  float8 hi = _cl_fmin (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_fmin (double x, double y)
+{
+  return Sleef_fmin (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_fmin (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmind2 (x, y);
+#else
+
+  double lo = _cl_fmin (x.lo, y.lo);
+  double hi = _cl_fmin (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_fmin (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_fmin (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_fmin (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_fmin (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmind4 (x, y);
+#else
+
+  double2 lo = _cl_fmin (x.lo, y.lo);
+  double2 hi = _cl_fmin (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_fmin (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmind8 (x, y);
+#else
+
+  double4 lo = _cl_fmin (x.lo, y.lo);
+  double4 hi = _cl_fmin (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_fmin (double16 x, double16 y)
+{
+
+  double8 lo = _cl_fmin (x.lo, y.lo);
+  double8 hi = _cl_fmin (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/fmod.cl b/lib/kernel/sleef-pocl/fmod.cl
new file mode 100644
index 0000000..a29dea2
--- /dev/null
+++ b/lib/kernel/sleef-pocl/fmod.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_fmod (float x, float y)
+{
+  return Sleef_fmodf (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_fmod (float2 x, float2 y)
+{
+
+  float lo = _cl_fmod (x.lo, y.lo);
+  float hi = _cl_fmod (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_fmod (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_fmod (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_fmod (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_fmod (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_fmodf4 (x, y);
+#else
+
+  float2 lo = _cl_fmod (x.lo, y.lo);
+  float2 hi = _cl_fmod (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_fmod (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_fmodf8 (x, y);
+#else
+
+  float4 lo = _cl_fmod (x.lo, y.lo);
+  float4 hi = _cl_fmod (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_fmod (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_fmodf16 (x, y);
+#else
+
+  float8 lo = _cl_fmod (x.lo, y.lo);
+  float8 hi = _cl_fmod (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_fmod (double x, double y)
+{
+  return Sleef_fmod (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_fmod (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmodd2 (x, y);
+#else
+
+  double lo = _cl_fmod (x.lo, y.lo);
+  double hi = _cl_fmod (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_fmod (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_fmod (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_fmod (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_fmod (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmodd4 (x, y);
+#else
+
+  double2 lo = _cl_fmod (x.lo, y.lo);
+  double2 hi = _cl_fmod (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_fmod (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_fmodd8 (x, y);
+#else
+
+  double4 lo = _cl_fmod (x.lo, y.lo);
+  double4 hi = _cl_fmod (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_fmod (double16 x, double16 y)
+{
+
+  double8 lo = _cl_fmod (x.lo, y.lo);
+  double8 hi = _cl_fmod (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/frexp.cl b/lib/kernel/sleef-pocl/frexp.cl
new file mode 100644
index 0000000..2678378
--- /dev/null
+++ b/lib/kernel/sleef-pocl/frexp.cl
@@ -0,0 +1,77 @@
+/* OpenCL built-in library: frexp() using SLEEF
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "../templates.h"
+
+_CL_DECLARE_FUNC_V_V (_cl_frfrexp)
+_CL_DECLARE_FUNC_K_V (_cl_expfrexp)
+
+int _CL_OVERLOADABLE
+_cl_expfrexp_int (float x)
+{
+  return _cl_expfrexp (x);
+}
+int2 _CL_OVERLOADABLE
+_cl_expfrexp_int (float2 x)
+{
+  return _cl_expfrexp (x);
+}
+int3 _CL_OVERLOADABLE
+_cl_expfrexp_int (float3 x)
+{
+  return _cl_expfrexp (x);
+}
+int4 _CL_OVERLOADABLE
+_cl_expfrexp_int (float4 x)
+{
+  return _cl_expfrexp (x);
+}
+int8 _CL_OVERLOADABLE
+_cl_expfrexp_int (float8 x)
+{
+  return _cl_expfrexp (x);
+}
+int16 _CL_OVERLOADABLE
+_cl_expfrexp_int (float16 x)
+{
+  return _cl_expfrexp (x);
+}
+
+__IF_FP64 (int _CL_OVERLOADABLE _cl_expfrexp_int (double x) {
+  return convert_int (_cl_expfrexp (x));
+} int2 _CL_OVERLOADABLE _cl_expfrexp_int (double2 x) {
+  return convert_int2 (_cl_expfrexp (x));
+} int3 _CL_OVERLOADABLE _cl_expfrexp_int (double3 x) {
+  return convert_int3 (_cl_expfrexp (x));
+} int4 _CL_OVERLOADABLE _cl_expfrexp_int (double4 x) {
+  return convert_int4 (_cl_expfrexp (x));
+} int8 _CL_OVERLOADABLE _cl_expfrexp_int (double8 x) {
+  return convert_int8 (_cl_expfrexp (x));
+} int16 _CL_OVERLOADABLE _cl_expfrexp_int (double16 x) {
+  return convert_int16 (_cl_expfrexp (x));
+})
+
+DEFINE_EXPR_V_VIPV (frexp, ({
+                      *b = _cl_expfrexp_int (a);
+                      _cl_frfrexp (a);
+                    }))
diff --git a/lib/kernel/sleef-pocl/frfrexp.cl b/lib/kernel/sleef-pocl/frfrexp.cl
new file mode 100644
index 0000000..a0e118d
--- /dev/null
+++ b/lib/kernel/sleef-pocl/frfrexp.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_frfrexp (float x)
+{
+  return Sleef_frfrexpf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_frfrexp (float2 x)
+{
+
+  float lo = _cl_frfrexp (x.lo);
+  float hi = _cl_frfrexp (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_frfrexp (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_frfrexp (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_frfrexp (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_frfrexp (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_frfrexpf4 (x);
+#else
+
+  float2 lo = _cl_frfrexp (x.lo);
+  float2 hi = _cl_frfrexp (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_frfrexp (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_frfrexpf8 (x);
+#else
+
+  float4 lo = _cl_frfrexp (x.lo);
+  float4 hi = _cl_frfrexp (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_frfrexp (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_frfrexpf16 (x);
+#else
+
+  float8 lo = _cl_frfrexp (x.lo);
+  float8 hi = _cl_frfrexp (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_frfrexp (double x)
+{
+  return Sleef_frfrexp (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_frfrexp (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_frfrexpd2 (x);
+#else
+
+  double lo = _cl_frfrexp (x.lo);
+  double hi = _cl_frfrexp (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_frfrexp (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_frfrexp (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_frfrexp (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_frfrexp (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_frfrexpd4 (x);
+#else
+
+  double2 lo = _cl_frfrexp (x.lo);
+  double2 hi = _cl_frfrexp (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_frfrexp (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_frfrexpd8 (x);
+#else
+
+  double4 lo = _cl_frfrexp (x.lo);
+  double4 hi = _cl_frfrexp (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_frfrexp (double16 x)
+{
+
+  double8 lo = _cl_frfrexp (x.lo);
+  double8 hi = _cl_frfrexp (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/hypot.cl b/lib/kernel/sleef-pocl/hypot.cl
new file mode 100644
index 0000000..2587f55
--- /dev/null
+++ b/lib/kernel/sleef-pocl/hypot.cl
@@ -0,0 +1,231 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_hypot (float x, float y)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotf_u05 (x, y);
+#else
+  return Sleef_hypotf_u35 (x, y);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_hypot (float2 x, float2 y)
+{
+
+  float lo = _cl_hypot (x.lo, y.lo);
+  float hi = _cl_hypot (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_hypot (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_hypot (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_hypot (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_hypot (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotf4_u05 (x, y);
+#else
+  return Sleef_hypotf4_u35 (x, y);
+#endif
+
+#else
+
+  float2 lo = _cl_hypot (x.lo, y.lo);
+  float2 hi = _cl_hypot (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_hypot (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotf8_u05 (x, y);
+#else
+  return Sleef_hypotf8_u35 (x, y);
+#endif
+
+#else
+
+  float4 lo = _cl_hypot (x.lo, y.lo);
+  float4 hi = _cl_hypot (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_hypot (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotf16_u05 (x, y);
+#else
+  return Sleef_hypotf16_u35 (x, y);
+#endif
+
+#else
+
+  float8 lo = _cl_hypot (x.lo, y.lo);
+  float8 hi = _cl_hypot (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_hypot (double x, double y)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_hypot_u05 (x, y);
+#else
+  return Sleef_hypot_u35 (x, y);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_hypot (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotd2_u05 (x, y);
+#else
+  return Sleef_hypotd2_u35 (x, y);
+#endif
+
+#else
+
+  double lo = _cl_hypot (x.lo, y.lo);
+  double hi = _cl_hypot (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_hypot (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_hypot (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_hypot (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_hypot (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotd4_u05 (x, y);
+#else
+  return Sleef_hypotd4_u35 (x, y);
+#endif
+
+#else
+
+  double2 lo = _cl_hypot (x.lo, y.lo);
+  double2 hi = _cl_hypot (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_hypot (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_hypotd8_u05 (x, y);
+#else
+  return Sleef_hypotd8_u35 (x, y);
+#endif
+
+#else
+
+  double4 lo = _cl_hypot (x.lo, y.lo);
+  double4 hi = _cl_hypot (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_hypot (double16 x, double16 y)
+{
+
+  double8 lo = _cl_hypot (x.lo, y.lo);
+  double8 hi = _cl_hypot (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/ilogb.cl b/lib/kernel/sleef-pocl/ilogb.cl
new file mode 100644
index 0000000..d8fc6c3
--- /dev/null
+++ b/lib/kernel/sleef-pocl/ilogb.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+int
+_cl_ilogb (float x)
+{
+  return Sleef_ilogbf (x);
+}
+
+_CL_OVERLOADABLE
+int2
+_cl_ilogb (float2 x)
+{
+
+  int lo = _cl_ilogb (x.lo);
+  int hi = _cl_ilogb (x.hi);
+  return (int2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+int4 _cl_ilogb (float4);
+
+_CL_OVERLOADABLE
+int3
+_cl_ilogb (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  int4 r = _cl_ilogb (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+int4
+_cl_ilogb (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_ilogbf4 (x);
+#else
+
+  int2 lo = _cl_ilogb (x.lo);
+  int2 hi = _cl_ilogb (x.hi);
+  return (int4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+int8
+_cl_ilogb (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_ilogbf8 (x);
+#else
+
+  int4 lo = _cl_ilogb (x.lo);
+  int4 hi = _cl_ilogb (x.hi);
+  return (int8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+int16
+_cl_ilogb (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_ilogbf16 (x);
+#else
+
+  int8 lo = _cl_ilogb (x.lo);
+  int8 hi = _cl_ilogb (x.hi);
+  return (int16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+int
+_cl_ilogb (double x)
+{
+  return Sleef_ilogb (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+int2
+_cl_ilogb (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ilogbd2 (x);
+#else
+
+  int lo = _cl_ilogb (x.lo);
+  int hi = _cl_ilogb (x.hi);
+  return (int2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+int4 _cl_ilogb (double4);
+
+_CL_OVERLOADABLE
+int3
+_cl_ilogb (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  int4 r = _cl_ilogb (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+int4
+_cl_ilogb (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ilogbd4 (x);
+#else
+
+  int2 lo = _cl_ilogb (x.lo);
+  int2 hi = _cl_ilogb (x.hi);
+  return (int4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+int8
+_cl_ilogb (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ilogbd8 (x);
+#else
+
+  int4 lo = _cl_ilogb (x.lo);
+  int4 hi = _cl_ilogb (x.hi);
+  return (int8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+int16
+_cl_ilogb (double16 x)
+{
+
+  int8 lo = _cl_ilogb (x.lo);
+  int8 hi = _cl_ilogb (x.hi);
+  return (int16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/ldexp.cl b/lib/kernel/sleef-pocl/ldexp.cl
new file mode 100644
index 0000000..304920d
--- /dev/null
+++ b/lib/kernel/sleef-pocl/ldexp.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_ldexp (float x, int k)
+{
+  return Sleef_ldexpf (x, k);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_ldexp (float2 x, int2 k)
+{
+
+  float lo = _cl_ldexp (x.lo, k.lo);
+  float hi = _cl_ldexp (x.hi, k.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_ldexp (float4, int4);
+
+_CL_OVERLOADABLE
+float3
+_cl_ldexp (float3 x, int3 k)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  int4 k_3to4 = (int4) (k, (float)0);
+
+  float4 r = _cl_ldexp (x_3to4, k_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_ldexp (float4 x, int4 k)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_ldexpf4 (x, k);
+#else
+
+  float2 lo = _cl_ldexp (x.lo, k.lo);
+  float2 hi = _cl_ldexp (x.hi, k.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_ldexp (float8 x, int8 k)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_ldexpf8 (x, k);
+#else
+
+  float4 lo = _cl_ldexp (x.lo, k.lo);
+  float4 hi = _cl_ldexp (x.hi, k.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_ldexp (float16 x, int16 k)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_ldexpf16 (x, k);
+#else
+
+  float8 lo = _cl_ldexp (x.lo, k.lo);
+  float8 hi = _cl_ldexp (x.hi, k.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_ldexp (double x, int k)
+{
+  return Sleef_ldexp (x, k);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_ldexp (double2 x, int2 k)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ldexpd2 (x, k);
+#else
+
+  double lo = _cl_ldexp (x.lo, k.lo);
+  double hi = _cl_ldexp (x.hi, k.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_ldexp (double4, int4);
+
+_CL_OVERLOADABLE
+double3
+_cl_ldexp (double3 x, int3 k)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  int4 k_3to4 = (int4) (k, (double)0);
+
+  double4 r = _cl_ldexp (x_3to4, k_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_ldexp (double4 x, int4 k)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ldexpd4 (x, k);
+#else
+
+  double2 lo = _cl_ldexp (x.lo, k.lo);
+  double2 hi = _cl_ldexp (x.hi, k.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_ldexp (double8 x, int8 k)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_ldexpd8 (x, k);
+#else
+
+  double4 lo = _cl_ldexp (x.lo, k.lo);
+  double4 hi = _cl_ldexp (x.hi, k.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_ldexp (double16 x, int16 k)
+{
+
+  double8 lo = _cl_ldexp (x.lo, k.lo);
+  double8 hi = _cl_ldexp (x.hi, k.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/lgamma.cl b/lib/kernel/sleef-pocl/lgamma.cl
new file mode 100644
index 0000000..afda973
--- /dev/null
+++ b/lib/kernel/sleef-pocl/lgamma.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_lgamma (float x)
+{
+  return Sleef_lgammaf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_lgamma (float2 x)
+{
+
+  float lo = _cl_lgamma (x.lo);
+  float hi = _cl_lgamma (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_lgamma (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_lgamma (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_lgamma (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_lgamma (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_lgammaf4_u10 (x);
+#else
+
+  float2 lo = _cl_lgamma (x.lo);
+  float2 hi = _cl_lgamma (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_lgamma (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_lgammaf8_u10 (x);
+#else
+
+  float4 lo = _cl_lgamma (x.lo);
+  float4 hi = _cl_lgamma (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_lgamma (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_lgammaf16_u10 (x);
+#else
+
+  float8 lo = _cl_lgamma (x.lo);
+  float8 hi = _cl_lgamma (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_lgamma (double x)
+{
+  return Sleef_lgamma_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_lgamma (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_lgammad2_u10 (x);
+#else
+
+  double lo = _cl_lgamma (x.lo);
+  double hi = _cl_lgamma (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_lgamma (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_lgamma (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_lgamma (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_lgamma (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_lgammad4_u10 (x);
+#else
+
+  double2 lo = _cl_lgamma (x.lo);
+  double2 hi = _cl_lgamma (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_lgamma (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_lgammad8_u10 (x);
+#else
+
+  double4 lo = _cl_lgamma (x.lo);
+  double4 hi = _cl_lgamma (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_lgamma (double16 x)
+{
+
+  double8 lo = _cl_lgamma (x.lo);
+  double8 hi = _cl_lgamma (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/lgamma_r.cl b/lib/kernel/sleef-pocl/lgamma_r.cl
new file mode 100644
index 0000000..ec1cca6
--- /dev/null
+++ b/lib/kernel/sleef-pocl/lgamma_r.cl
@@ -0,0 +1,608 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_lgamma_r (float x, global int *iptr)
+{
+  Sleef_float2 temp;
+  temp = Sleef_lgamma_rf_u10(x);
+  *iptr = convert_int(temp.y);
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_lgamma_r (float3 x, global int3 *iptr)
+{
+  int4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_lgamma_r (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_lgamma_r (float2 x, global int2 *iptr)
+{
+  int plo, phi;
+  float lo = _cl_lgamma_r (x.lo, &plo);
+  float hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int2)(plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_lgamma_r (float4 x, global int4 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+  temp = Sleef_lgamma_rf4_u10(x);
+  *iptr = convert_int4(temp.y);
+  return temp.x;
+#else
+
+  int2 plo, phi;
+  float2 lo = _cl_lgamma_r (x.lo, &plo);
+  float2 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_lgamma_r (float8 x, global int8 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+  temp = Sleef_lgamma_rf8_u10(x);
+  *iptr = convert_int8(temp.y);
+  return temp.x;
+#else
+
+  int4 plo, phi;
+  float4 lo = _cl_lgamma_r (x.lo, &plo);
+  float4 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_lgamma_r (float16 x, global int16 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+  temp = Sleef_lgamma_rf16_u10(x);
+  *iptr = convert_int16(temp.y);
+  return temp.x;
+#else
+
+  int8 plo, phi;
+  float8 lo = _cl_lgamma_r (x.lo, &plo);
+  float8 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_lgamma_r (double x, global int *iptr)
+{
+  Sleef_double2 temp;
+  temp = Sleef_lgamma_r_u10(x);
+  *iptr = convert_int(temp.y);
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_lgamma_r (double3 x, global int3 *iptr)
+{
+  int4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_lgamma_r (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_lgamma_r (double16 x, global int16 *iptr)
+{
+  int8 plo, phi;
+  double8 lo = _cl_lgamma_r (x.lo, &plo);
+  double8 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_lgamma_r (double2 x, global int2 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+  temp = Sleef_lgamma_rd2_u10(x);
+  *iptr = convert_int2(temp.y);
+  return temp.x;
+#else
+
+  int plo, phi;
+  double lo = _cl_lgamma_r (x.lo, &plo);
+  double hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_lgamma_r (double4 x, global int4 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+  temp = Sleef_lgamma_rd4_u10(x);
+  *iptr = convert_int4(temp.y);
+  return temp.x;
+#else
+
+  int2 plo, phi;
+  double2 lo = _cl_lgamma_r (x.lo, &plo);
+  double2 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_lgamma_r (double8 x, global int8 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+  temp = Sleef_lgamma_rd8_u10(x);
+  *iptr = convert_int8(temp.y);
+  return temp.x;
+#else
+
+  int4 plo, phi;
+  double4 lo = _cl_lgamma_r (x.lo, &plo);
+  double4 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+/****************************************************************/
+/****************************************************************/
+/****************************************************************/
+/****************************************************************/
+
+
+_CL_OVERLOADABLE
+float
+_cl_lgamma_r (float x, local int *iptr)
+{
+  Sleef_float2 temp;
+  temp = Sleef_lgamma_rf_u10(x);
+  *iptr = convert_int(temp.y);
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_lgamma_r (float3 x, local int3 *iptr)
+{
+  int4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_lgamma_r (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_lgamma_r (float2 x, local int2 *iptr)
+{
+  int plo, phi;
+  float lo = _cl_lgamma_r (x.lo, &plo);
+  float hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int2)(plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_lgamma_r (float4 x, local int4 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+  temp = Sleef_lgamma_rf4_u10(x);
+  *iptr = convert_int4(temp.y);
+  return temp.x;
+#else
+
+  int2 plo, phi;
+  float2 lo = _cl_lgamma_r (x.lo, &plo);
+  float2 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_lgamma_r (float8 x, local int8 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+  temp = Sleef_lgamma_rf8_u10(x);
+  *iptr = convert_int8(temp.y);
+  return temp.x;
+#else
+
+  int4 plo, phi;
+  float4 lo = _cl_lgamma_r (x.lo, &plo);
+  float4 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_lgamma_r (float16 x, local int16 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+  temp = Sleef_lgamma_rf16_u10(x);
+  *iptr = convert_int16(temp.y);
+  return temp.x;
+#else
+
+  int8 plo, phi;
+  float8 lo = _cl_lgamma_r (x.lo, &plo);
+  float8 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_lgamma_r (double x, local int *iptr)
+{
+  Sleef_double2 temp;
+  temp = Sleef_lgamma_r_u10(x);
+  *iptr = convert_int(temp.y);
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_lgamma_r (double3 x, local int3 *iptr)
+{
+  int4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_lgamma_r (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_lgamma_r (double16 x, local int16 *iptr)
+{
+  int8 plo, phi;
+  double8 lo = _cl_lgamma_r (x.lo, &plo);
+  double8 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_lgamma_r (double2 x, local int2 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+  temp = Sleef_lgamma_rd2_u10(x);
+  *iptr = convert_int2(temp.y);
+  return temp.x;
+#else
+
+  int plo, phi;
+  double lo = _cl_lgamma_r (x.lo, &plo);
+  double hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_lgamma_r (double4 x, local int4 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+  temp = Sleef_lgamma_rd4_u10(x);
+  *iptr = convert_int4(temp.y);
+  return temp.x;
+#else
+
+  int2 plo, phi;
+  double2 lo = _cl_lgamma_r (x.lo, &plo);
+  double2 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_lgamma_r (double8 x, local int8 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+  temp = Sleef_lgamma_rd8_u10(x);
+  *iptr = convert_int8(temp.y);
+  return temp.x;
+#else
+
+  int4 plo, phi;
+  double4 lo = _cl_lgamma_r (x.lo, &plo);
+  double4 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+
+/****************************************************************/
+/****************************************************************/
+/****************************************************************/
+/****************************************************************/
+
+
+_CL_OVERLOADABLE
+float
+_cl_lgamma_r (float x, private int *iptr)
+{
+  Sleef_float2 temp;
+  temp = Sleef_lgamma_rf_u10(x);
+  *iptr = convert_int(temp.y);
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_lgamma_r (float3 x, private int3 *iptr)
+{
+  int4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_lgamma_r (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_lgamma_r (float2 x, private int2 *iptr)
+{
+  int plo, phi;
+  float lo = _cl_lgamma_r (x.lo, &plo);
+  float hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int2)(plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_lgamma_r (float4 x, private int4 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+  temp = Sleef_lgamma_rf4_u10(x);
+  *iptr = convert_int4(temp.y);
+  return temp.x;
+#else
+
+  int2 plo, phi;
+  float2 lo = _cl_lgamma_r (x.lo, &plo);
+  float2 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_lgamma_r (float8 x, private int8 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+  temp = Sleef_lgamma_rf8_u10(x);
+  *iptr = convert_int8(temp.y);
+  return temp.x;
+#else
+
+  int4 plo, phi;
+  float4 lo = _cl_lgamma_r (x.lo, &plo);
+  float4 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_lgamma_r (float16 x, private int16 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+  temp = Sleef_lgamma_rf16_u10(x);
+  *iptr = convert_int16(temp.y);
+  return temp.x;
+#else
+
+  int8 plo, phi;
+  float8 lo = _cl_lgamma_r (x.lo, &plo);
+  float8 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_lgamma_r (double x, private int *iptr)
+{
+  Sleef_double2 temp;
+  temp = Sleef_lgamma_r_u10(x);
+  *iptr = convert_int(temp.y);
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_lgamma_r (double3 x, private int3 *iptr)
+{
+  int4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_lgamma_r (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_lgamma_r (double16 x, private int16 *iptr)
+{
+  int8 plo, phi;
+  double8 lo = _cl_lgamma_r (x.lo, &plo);
+  double8 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_lgamma_r (double2 x, private int2 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+  temp = Sleef_lgamma_rd2_u10(x);
+  *iptr = convert_int2(temp.y);
+  return temp.x;
+#else
+
+  int plo, phi;
+  double lo = _cl_lgamma_r (x.lo, &plo);
+  double hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_lgamma_r (double4 x, private int4 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+  temp = Sleef_lgamma_rd4_u10(x);
+  *iptr = convert_int4(temp.y);
+  return temp.x;
+#else
+
+  int2 plo, phi;
+  double2 lo = _cl_lgamma_r (x.lo, &plo);
+  double2 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_lgamma_r (double8 x, private int8 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+  temp = Sleef_lgamma_rd8_u10(x);
+  *iptr = convert_int8(temp.y);
+  return temp.x;
+#else
+
+  int4 plo, phi;
+  double4 lo = _cl_lgamma_r (x.lo, &plo);
+  double4 hi = _cl_lgamma_r (x.hi, &phi);
+
+  *iptr = (int8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/log.cl b/lib/kernel/sleef-pocl/log.cl
new file mode 100644
index 0000000..adf1aa4
--- /dev/null
+++ b/lib/kernel/sleef-pocl/log.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_log (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_logf_u10 (x);
+#else
+  return Sleef_logf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_log (float2 x)
+{
+
+  float lo = _cl_log (x.lo);
+  float hi = _cl_log (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_log (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_log (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_log (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_log (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_logf4_u10 (x);
+#else
+  return Sleef_logf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_log (x.lo);
+  float2 hi = _cl_log (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_log (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_logf8_u10 (x);
+#else
+  return Sleef_logf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_log (x.lo);
+  float4 hi = _cl_log (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_log (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_logf16_u10 (x);
+#else
+  return Sleef_logf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_log (x.lo);
+  float8 hi = _cl_log (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_log (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_log_u10 (x);
+#else
+  return Sleef_log_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_log (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_logd2_u10 (x);
+#else
+  return Sleef_logd2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_log (x.lo);
+  double hi = _cl_log (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_log (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_log (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_log (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_log (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_logd4_u10 (x);
+#else
+  return Sleef_logd4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_log (x.lo);
+  double2 hi = _cl_log (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_log (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_logd8_u10 (x);
+#else
+  return Sleef_logd8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_log (x.lo);
+  double4 hi = _cl_log (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_log (double16 x)
+{
+
+  double8 lo = _cl_log (x.lo);
+  double8 hi = _cl_log (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/log10.cl b/lib/kernel/sleef-pocl/log10.cl
new file mode 100644
index 0000000..55e362a
--- /dev/null
+++ b/lib/kernel/sleef-pocl/log10.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_log10 (float x)
+{
+  return Sleef_log10f_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_log10 (float2 x)
+{
+
+  float lo = _cl_log10 (x.lo);
+  float hi = _cl_log10 (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_log10 (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_log10 (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_log10 (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_log10 (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_log10f4_u10 (x);
+#else
+
+  float2 lo = _cl_log10 (x.lo);
+  float2 hi = _cl_log10 (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_log10 (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_log10f8_u10 (x);
+#else
+
+  float4 lo = _cl_log10 (x.lo);
+  float4 hi = _cl_log10 (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_log10 (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_log10f16_u10 (x);
+#else
+
+  float8 lo = _cl_log10 (x.lo);
+  float8 hi = _cl_log10 (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_log10 (double x)
+{
+  return Sleef_log10_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_log10 (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_log10d2_u10 (x);
+#else
+
+  double lo = _cl_log10 (x.lo);
+  double hi = _cl_log10 (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_log10 (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_log10 (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_log10 (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_log10 (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_log10d4_u10 (x);
+#else
+
+  double2 lo = _cl_log10 (x.lo);
+  double2 hi = _cl_log10 (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_log10 (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_log10d8_u10 (x);
+#else
+
+  double4 lo = _cl_log10 (x.lo);
+  double4 hi = _cl_log10 (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_log10 (double16 x)
+{
+
+  double8 lo = _cl_log10 (x.lo);
+  double8 hi = _cl_log10 (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/log1p.cl b/lib/kernel/sleef-pocl/log1p.cl
new file mode 100644
index 0000000..e9c5dec
--- /dev/null
+++ b/lib/kernel/sleef-pocl/log1p.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_log1p (float x)
+{
+  return Sleef_log1pf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_log1p (float2 x)
+{
+
+  float lo = _cl_log1p (x.lo);
+  float hi = _cl_log1p (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_log1p (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_log1p (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_log1p (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_log1p (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_log1pf4_u10 (x);
+#else
+
+  float2 lo = _cl_log1p (x.lo);
+  float2 hi = _cl_log1p (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_log1p (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_log1pf8_u10 (x);
+#else
+
+  float4 lo = _cl_log1p (x.lo);
+  float4 hi = _cl_log1p (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_log1p (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_log1pf16_u10 (x);
+#else
+
+  float8 lo = _cl_log1p (x.lo);
+  float8 hi = _cl_log1p (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_log1p (double x)
+{
+  return Sleef_log1p_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_log1p (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_log1pd2_u10 (x);
+#else
+
+  double lo = _cl_log1p (x.lo);
+  double hi = _cl_log1p (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_log1p (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_log1p (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_log1p (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_log1p (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_log1pd4_u10 (x);
+#else
+
+  double2 lo = _cl_log1p (x.lo);
+  double2 hi = _cl_log1p (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_log1p (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_log1pd8_u10 (x);
+#else
+
+  double4 lo = _cl_log1p (x.lo);
+  double4 hi = _cl_log1p (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_log1p (double16 x)
+{
+
+  double8 lo = _cl_log1p (x.lo);
+  double8 hi = _cl_log1p (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/modf.cl b/lib/kernel/sleef-pocl/modf.cl
new file mode 100644
index 0000000..cb34f1d
--- /dev/null
+++ b/lib/kernel/sleef-pocl/modf.cl
@@ -0,0 +1,595 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_modf (float x, global float *iptr)
+{
+  Sleef_float2 temp;
+  temp = Sleef_modff (x);
+  *iptr = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_modf (float3 x, global float3 *iptr)
+{
+  float4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_modf (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_modf (float2 x, global float2 *iptr)
+{
+  float plo, phi;
+  float lo = _cl_modf (x.lo, &plo);
+  float hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float2) (plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_modf (float4 x, global float4 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+  temp = Sleef_modff4 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float2 plo, phi;
+  float2 lo = _cl_modf (x.lo, &plo);
+  float2 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_modf (float8 x, global float8 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+  temp = Sleef_modff8 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float4 plo, phi;
+  float4 lo = _cl_modf (x.lo, &plo);
+  float4 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_modf (float16 x, global float16 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+  temp = Sleef_modff16 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float8 plo, phi;
+  float8 lo = _cl_modf (x.lo, &plo);
+  float8 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_modf (double x, global double *iptr)
+{
+  Sleef_double2 temp;
+  temp = Sleef_modf (x);
+  *iptr = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_modf (double3 x, global double3 *iptr)
+{
+  double4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_modf (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_modf (double16 x, global double16 *iptr)
+{
+  double8 plo, phi;
+  double8 lo = _cl_modf (x.lo, &plo);
+  double8 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_modf (double2 x, global double2 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+  temp = Sleef_modfd2 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double plo, phi;
+  double lo = _cl_modf (x.lo, &plo);
+  double hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_modf (double4 x, global double4 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+  temp = Sleef_modfd4 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double2 plo, phi;
+  double2 lo = _cl_modf (x.lo, &plo);
+  double2 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_modf (double8 x, global double8 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+  temp = Sleef_modfd8 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double4 plo, phi;
+  double4 lo = _cl_modf (x.lo, &plo);
+  double4 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+_CL_OVERLOADABLE
+float
+_cl_modf (float x, local float *iptr)
+{
+  Sleef_float2 temp;
+  temp = Sleef_modff (x);
+  *iptr = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_modf (float3 x, local float3 *iptr)
+{
+  float4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_modf (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_modf (float2 x, local float2 *iptr)
+{
+  float plo, phi;
+  float lo = _cl_modf (x.lo, &plo);
+  float hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float2) (plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_modf (float4 x, local float4 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+  temp = Sleef_modff4 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float2 plo, phi;
+  float2 lo = _cl_modf (x.lo, &plo);
+  float2 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_modf (float8 x, local float8 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+  temp = Sleef_modff8 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float4 plo, phi;
+  float4 lo = _cl_modf (x.lo, &plo);
+  float4 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_modf (float16 x, local float16 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+  temp = Sleef_modff16 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float8 plo, phi;
+  float8 lo = _cl_modf (x.lo, &plo);
+  float8 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_modf (double x, local double *iptr)
+{
+  Sleef_double2 temp;
+  temp = Sleef_modf (x);
+  *iptr = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_modf (double3 x, local double3 *iptr)
+{
+  double4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_modf (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_modf (double16 x, local double16 *iptr)
+{
+  double8 plo, phi;
+  double8 lo = _cl_modf (x.lo, &plo);
+  double8 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_modf (double2 x, local double2 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+  temp = Sleef_modfd2 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double plo, phi;
+  double lo = _cl_modf (x.lo, &plo);
+  double hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_modf (double4 x, local double4 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+  temp = Sleef_modfd4 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double2 plo, phi;
+  double2 lo = _cl_modf (x.lo, &plo);
+  double2 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_modf (double8 x, local double8 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+  temp = Sleef_modfd8 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double4 plo, phi;
+  double4 lo = _cl_modf (x.lo, &plo);
+  double4 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+_CL_OVERLOADABLE
+float
+_cl_modf (float x, private float *iptr)
+{
+  Sleef_float2 temp;
+  temp = Sleef_modff (x);
+  *iptr = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_modf (float3 x, private float3 *iptr)
+{
+  float4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_modf (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_modf (float2 x, private float2 *iptr)
+{
+  float plo, phi;
+  float lo = _cl_modf (x.lo, &plo);
+  float hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float2) (plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_modf (float4 x, private float4 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+  temp = Sleef_modff4 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float2 plo, phi;
+  float2 lo = _cl_modf (x.lo, &plo);
+  float2 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_modf (float8 x, private float8 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+  temp = Sleef_modff8 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float4 plo, phi;
+  float4 lo = _cl_modf (x.lo, &plo);
+  float4 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_modf (float16 x, private float16 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+  temp = Sleef_modff16 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  float8 plo, phi;
+  float8 lo = _cl_modf (x.lo, &plo);
+  float8 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (float16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_modf (double x, private double *iptr)
+{
+  Sleef_double2 temp;
+  temp = Sleef_modf (x);
+  *iptr = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_modf (double3 x, private double3 *iptr)
+{
+  double4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_modf (x_3to4, &temp);
+  *iptr = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_modf (double16 x, private double16 *iptr)
+{
+  double8 plo, phi;
+  double8 lo = _cl_modf (x.lo, &plo);
+  double8 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_modf (double2 x, private double2 *iptr)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+  temp = Sleef_modfd2 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double plo, phi;
+  double lo = _cl_modf (x.lo, &plo);
+  double hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_modf (double4 x, private double4 *iptr)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+  temp = Sleef_modfd4 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double2 plo, phi;
+  double2 lo = _cl_modf (x.lo, &plo);
+  double2 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_modf (double8 x, private double8 *iptr)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+  temp = Sleef_modfd8 (x);
+  *iptr = temp.y;
+  return temp.x;
+#else
+
+  double4 plo, phi;
+  double4 lo = _cl_modf (x.lo, &plo);
+  double4 hi = _cl_modf (x.hi, &phi);
+
+  *iptr = (double8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/native_cos.cl b/lib/kernel/sleef-pocl/native_cos.cl
new file mode 100644
index 0000000..553003b
--- /dev/null
+++ b/lib/kernel/sleef-pocl/native_cos.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_native_cos (float x)
+{
+  return Sleef_cosf_u35 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_native_cos (float2 x)
+{
+
+  float lo = _cl_native_cos (x.lo);
+  float hi = _cl_native_cos (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_native_cos (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_native_cos (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_native_cos (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_native_cos (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_cosf4_u35 (x);
+#else
+
+  float2 lo = _cl_native_cos (x.lo);
+  float2 hi = _cl_native_cos (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_native_cos (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_cosf8_u35 (x);
+#else
+
+  float4 lo = _cl_native_cos (x.lo);
+  float4 hi = _cl_native_cos (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_native_cos (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_cosf16_u35 (x);
+#else
+
+  float8 lo = _cl_native_cos (x.lo);
+  float8 hi = _cl_native_cos (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_native_cos (double x)
+{
+  return Sleef_cos_u35 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_native_cos (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_cosd2_u35 (x);
+#else
+
+  double lo = _cl_native_cos (x.lo);
+  double hi = _cl_native_cos (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_native_cos (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_native_cos (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_native_cos (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_native_cos (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_cosd4_u35 (x);
+#else
+
+  double2 lo = _cl_native_cos (x.lo);
+  double2 hi = _cl_native_cos (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_native_cos (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_cosd8_u35 (x);
+#else
+
+  double4 lo = _cl_native_cos (x.lo);
+  double4 hi = _cl_native_cos (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_native_cos (double16 x)
+{
+
+  double8 lo = _cl_native_cos (x.lo);
+  double8 hi = _cl_native_cos (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/native_sin.cl b/lib/kernel/sleef-pocl/native_sin.cl
new file mode 100644
index 0000000..72527e0
--- /dev/null
+++ b/lib/kernel/sleef-pocl/native_sin.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_native_sin (float x)
+{
+  return Sleef_sinf_u35 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_native_sin (float2 x)
+{
+
+  float lo = _cl_native_sin (x.lo);
+  float hi = _cl_native_sin (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_native_sin (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_native_sin (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_native_sin (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_native_sin (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_sinf4_u35 (x);
+#else
+
+  float2 lo = _cl_native_sin (x.lo);
+  float2 hi = _cl_native_sin (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_native_sin (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_sinf8_u35 (x);
+#else
+
+  float4 lo = _cl_native_sin (x.lo);
+  float4 hi = _cl_native_sin (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_native_sin (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_sinf16_u35 (x);
+#else
+
+  float8 lo = _cl_native_sin (x.lo);
+  float8 hi = _cl_native_sin (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_native_sin (double x)
+{
+  return Sleef_sin_u35 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_native_sin (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sind2_u35 (x);
+#else
+
+  double lo = _cl_native_sin (x.lo);
+  double hi = _cl_native_sin (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_native_sin (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_native_sin (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_native_sin (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_native_sin (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sind4_u35 (x);
+#else
+
+  double2 lo = _cl_native_sin (x.lo);
+  double2 hi = _cl_native_sin (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_native_sin (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sind8_u35 (x);
+#else
+
+  double4 lo = _cl_native_sin (x.lo);
+  double4 hi = _cl_native_sin (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_native_sin (double16 x)
+{
+
+  double8 lo = _cl_native_sin (x.lo);
+  double8 hi = _cl_native_sin (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/native_tan.cl b/lib/kernel/sleef-pocl/native_tan.cl
new file mode 100644
index 0000000..101003e
--- /dev/null
+++ b/lib/kernel/sleef-pocl/native_tan.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_native_tan (float x)
+{
+  return Sleef_tanf_u35 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_native_tan (float2 x)
+{
+
+  float lo = _cl_native_tan (x.lo);
+  float hi = _cl_native_tan (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_native_tan (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_native_tan (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_native_tan (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_native_tan (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_tanf4_u35 (x);
+#else
+
+  float2 lo = _cl_native_tan (x.lo);
+  float2 hi = _cl_native_tan (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_native_tan (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_tanf8_u35 (x);
+#else
+
+  float4 lo = _cl_native_tan (x.lo);
+  float4 hi = _cl_native_tan (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_native_tan (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_tanf16_u35 (x);
+#else
+
+  float8 lo = _cl_native_tan (x.lo);
+  float8 hi = _cl_native_tan (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_native_tan (double x)
+{
+  return Sleef_tan_u35 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_native_tan (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tand2_u35 (x);
+#else
+
+  double lo = _cl_native_tan (x.lo);
+  double hi = _cl_native_tan (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_native_tan (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_native_tan (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_native_tan (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_native_tan (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tand4_u35 (x);
+#else
+
+  double2 lo = _cl_native_tan (x.lo);
+  double2 hi = _cl_native_tan (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_native_tan (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tand8_u35 (x);
+#else
+
+  double4 lo = _cl_native_tan (x.lo);
+  double4 hi = _cl_native_tan (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_native_tan (double16 x)
+{
+
+  double8 lo = _cl_native_tan (x.lo);
+  double8 hi = _cl_native_tan (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/nextafter.cl b/lib/kernel/sleef-pocl/nextafter.cl
new file mode 100644
index 0000000..3e74ec7
--- /dev/null
+++ b/lib/kernel/sleef-pocl/nextafter.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_nextafter (float x, float y)
+{
+  return Sleef_nextafterf (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_nextafter (float2 x, float2 y)
+{
+
+  float lo = _cl_nextafter (x.lo, y.lo);
+  float hi = _cl_nextafter (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_nextafter (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_nextafter (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_nextafter (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_nextafter (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_nextafterf4 (x, y);
+#else
+
+  float2 lo = _cl_nextafter (x.lo, y.lo);
+  float2 hi = _cl_nextafter (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_nextafter (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_nextafterf8 (x, y);
+#else
+
+  float4 lo = _cl_nextafter (x.lo, y.lo);
+  float4 hi = _cl_nextafter (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_nextafter (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_nextafterf16 (x, y);
+#else
+
+  float8 lo = _cl_nextafter (x.lo, y.lo);
+  float8 hi = _cl_nextafter (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_nextafter (double x, double y)
+{
+  return Sleef_nextafter (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_nextafter (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_nextafterd2 (x, y);
+#else
+
+  double lo = _cl_nextafter (x.lo, y.lo);
+  double hi = _cl_nextafter (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_nextafter (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_nextafter (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_nextafter (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_nextafter (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_nextafterd4 (x, y);
+#else
+
+  double2 lo = _cl_nextafter (x.lo, y.lo);
+  double2 hi = _cl_nextafter (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_nextafter (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_nextafterd8 (x, y);
+#else
+
+  double4 lo = _cl_nextafter (x.lo, y.lo);
+  double4 hi = _cl_nextafter (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_nextafter (double16 x, double16 y)
+{
+
+  double8 lo = _cl_nextafter (x.lo, y.lo);
+  double8 hi = _cl_nextafter (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/pow.cl b/lib/kernel/sleef-pocl/pow.cl
new file mode 100644
index 0000000..9b1a02d
--- /dev/null
+++ b/lib/kernel/sleef-pocl/pow.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_pow (float x, float y)
+{
+  return Sleef_powf_u10 (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_pow (float2 x, float2 y)
+{
+
+  float lo = _cl_pow (x.lo, y.lo);
+  float hi = _cl_pow (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_pow (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_pow (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_pow (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_pow (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_powf4_u10 (x, y);
+#else
+
+  float2 lo = _cl_pow (x.lo, y.lo);
+  float2 hi = _cl_pow (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_pow (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_powf8_u10 (x, y);
+#else
+
+  float4 lo = _cl_pow (x.lo, y.lo);
+  float4 hi = _cl_pow (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_pow (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_powf16_u10 (x, y);
+#else
+
+  float8 lo = _cl_pow (x.lo, y.lo);
+  float8 hi = _cl_pow (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_pow (double x, double y)
+{
+  return Sleef_pow_u10 (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_pow (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_powd2_u10 (x, y);
+#else
+
+  double lo = _cl_pow (x.lo, y.lo);
+  double hi = _cl_pow (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_pow (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_pow (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_pow (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_pow (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_powd4_u10 (x, y);
+#else
+
+  double2 lo = _cl_pow (x.lo, y.lo);
+  double2 hi = _cl_pow (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_pow (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_powd8_u10 (x, y);
+#else
+
+  double4 lo = _cl_pow (x.lo, y.lo);
+  double4 hi = _cl_pow (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_pow (double16 x, double16 y)
+{
+
+  double8 lo = _cl_pow (x.lo, y.lo);
+  double8 hi = _cl_pow (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/pown.cl b/lib/kernel/sleef-pocl/pown.cl
new file mode 100644
index 0000000..ccf6ebd
--- /dev/null
+++ b/lib/kernel/sleef-pocl/pown.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_pown (float x, int y)
+{
+  return Sleef_pownf_u10 (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_pown (float2 x, int2 y)
+{
+
+  float lo = _cl_pown (x.lo, y.lo);
+  float hi = _cl_pown (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_pown (float4, int4);
+
+_CL_OVERLOADABLE
+float3
+_cl_pown (float3 x, int3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  int4 y_3to4 = (int4) (y, (float)0);
+
+  float4 r = _cl_pown (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_pown (float4 x, int4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_pownf4_u10 (x, y);
+#else
+
+  float2 lo = _cl_pown (x.lo, y.lo);
+  float2 hi = _cl_pown (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_pown (float8 x, int8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_pownf8_u10 (x, y);
+#else
+
+  float4 lo = _cl_pown (x.lo, y.lo);
+  float4 hi = _cl_pown (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_pown (float16 x, int16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_pownf16_u10 (x, y);
+#else
+
+  float8 lo = _cl_pown (x.lo, y.lo);
+  float8 hi = _cl_pown (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_pown (double x, int y)
+{
+  return Sleef_pown_u10 (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_pown (double2 x, int2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_pownd2_u10 (x, y);
+#else
+
+  double lo = _cl_pown (x.lo, y.lo);
+  double hi = _cl_pown (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_pown (double4, int4);
+
+_CL_OVERLOADABLE
+double3
+_cl_pown (double3 x, int3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  int4 y_3to4 = (int4) (y, (double)0);
+
+  double4 r = _cl_pown (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_pown (double4 x, int4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_pownd4_u10 (x, y);
+#else
+
+  double2 lo = _cl_pown (x.lo, y.lo);
+  double2 hi = _cl_pown (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_pown (double8 x, int8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_pownd8_u10 (x, y);
+#else
+
+  double4 lo = _cl_pown (x.lo, y.lo);
+  double4 hi = _cl_pown (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_pown (double16 x, int16 y)
+{
+
+  double8 lo = _cl_pown (x.lo, y.lo);
+  double8 hi = _cl_pown (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/powr.cl b/lib/kernel/sleef-pocl/powr.cl
new file mode 100644
index 0000000..a32025e
--- /dev/null
+++ b/lib/kernel/sleef-pocl/powr.cl
@@ -0,0 +1,185 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_powr (float x, float y)
+{
+  return Sleef_powrf_u10 (x, y);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_powr (float2 x, float2 y)
+{
+
+  float lo = _cl_powr (x.lo, y.lo);
+  float hi = _cl_powr (x.hi, y.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_powr (float4, float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_powr (float3 x, float3 y)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+  float4 y_3to4 = (float4) (y, (float)0);
+
+  float4 r = _cl_powr (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_powr (float4 x, float4 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_powrf4_u10 (x, y);
+#else
+
+  float2 lo = _cl_powr (x.lo, y.lo);
+  float2 hi = _cl_powr (x.hi, y.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_powr (float8 x, float8 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_powrf8_u10 (x, y);
+#else
+
+  float4 lo = _cl_powr (x.lo, y.lo);
+  float4 hi = _cl_powr (x.hi, y.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_powr (float16 x, float16 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_powrf16_u10 (x, y);
+#else
+
+  float8 lo = _cl_powr (x.lo, y.lo);
+  float8 hi = _cl_powr (x.hi, y.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_powr (double x, double y)
+{
+  return Sleef_powr_u10 (x, y);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_powr (double2 x, double2 y)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_powrd2_u10 (x, y);
+#else
+
+  double lo = _cl_powr (x.lo, y.lo);
+  double hi = _cl_powr (x.hi, y.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_powr (double4, double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_powr (double3 x, double3 y)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+  double4 y_3to4 = (double4) (y, (double)0);
+
+  double4 r = _cl_powr (x_3to4, y_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_powr (double4 x, double4 y)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_powrd4_u10 (x, y);
+#else
+
+  double2 lo = _cl_powr (x.lo, y.lo);
+  double2 hi = _cl_powr (x.hi, y.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_powr (double8 x, double8 y)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_powrd8_u10 (x, y);
+#else
+
+  double4 lo = _cl_powr (x.lo, y.lo);
+  double4 hi = _cl_powr (x.hi, y.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_powr (double16 x, double16 y)
+{
+
+  double8 lo = _cl_powr (x.lo, y.lo);
+  double8 hi = _cl_powr (x.hi, y.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/rint.cl b/lib/kernel/sleef-pocl/rint.cl
new file mode 100644
index 0000000..7459788
--- /dev/null
+++ b/lib/kernel/sleef-pocl/rint.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_rint (float x)
+{
+  return Sleef_rintf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_rint (float2 x)
+{
+
+  float lo = _cl_rint (x.lo);
+  float hi = _cl_rint (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_rint (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_rint (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_rint (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_rint (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_rintf4 (x);
+#else
+
+  float2 lo = _cl_rint (x.lo);
+  float2 hi = _cl_rint (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_rint (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_rintf8 (x);
+#else
+
+  float4 lo = _cl_rint (x.lo);
+  float4 hi = _cl_rint (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_rint (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_rintf16 (x);
+#else
+
+  float8 lo = _cl_rint (x.lo);
+  float8 hi = _cl_rint (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_rint (double x)
+{
+  return Sleef_rint (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_rint (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_rintd2 (x);
+#else
+
+  double lo = _cl_rint (x.lo);
+  double hi = _cl_rint (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_rint (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_rint (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_rint (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_rint (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_rintd4 (x);
+#else
+
+  double2 lo = _cl_rint (x.lo);
+  double2 hi = _cl_rint (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_rint (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_rintd8 (x);
+#else
+
+  double4 lo = _cl_rint (x.lo);
+  double4 hi = _cl_rint (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_rint (double16 x)
+{
+
+  double8 lo = _cl_rint (x.lo);
+  double8 hi = _cl_rint (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/round.cl b/lib/kernel/sleef-pocl/round.cl
new file mode 100644
index 0000000..e88ee97
--- /dev/null
+++ b/lib/kernel/sleef-pocl/round.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_round (float x)
+{
+  return Sleef_roundf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_round (float2 x)
+{
+
+  float lo = _cl_round (x.lo);
+  float hi = _cl_round (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_round (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_round (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_round (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_round (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_roundf4 (x);
+#else
+
+  float2 lo = _cl_round (x.lo);
+  float2 hi = _cl_round (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_round (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_roundf8 (x);
+#else
+
+  float4 lo = _cl_round (x.lo);
+  float4 hi = _cl_round (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_round (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_roundf16 (x);
+#else
+
+  float8 lo = _cl_round (x.lo);
+  float8 hi = _cl_round (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_round (double x)
+{
+  return Sleef_round (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_round (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_roundd2 (x);
+#else
+
+  double lo = _cl_round (x.lo);
+  double hi = _cl_round (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_round (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_round (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_round (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_round (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_roundd4 (x);
+#else
+
+  double2 lo = _cl_round (x.lo);
+  double2 hi = _cl_round (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_round (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_roundd8 (x);
+#else
+
+  double4 lo = _cl_round (x.lo);
+  double4 hi = _cl_round (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_round (double16 x)
+{
+
+  double8 lo = _cl_round (x.lo);
+  double8 hi = _cl_round (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/rsqrt.cl b/lib/kernel/sleef-pocl/scalars.cl
similarity index 78%
copy from lib/kernel/rsqrt.cl
copy to lib/kernel/sleef-pocl/scalars.cl
index 3c75ca1..d154f52 100644
--- a/lib/kernel/rsqrt.cl
+++ b/lib/kernel/sleef-pocl/scalars.cl
@@ -1,18 +1,17 @@
-/* OpenCL built-in library: rsqrt()
+/* OpenCL built-in library: sleef-pocl/scalars.cl
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
 
-   Copyright (c) 2011 Erik Schnetter <eschnetter at perimeterinstitute.ca>
-                      Perimeter Institute for Theoretical Physics
-   
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -22,8 +21,11 @@
    THE SOFTWARE.
 */
 
-#include "templates.h"
 
-DEFINE_EXPR_V_V(rsqrt, (stype)1/sqrt(a))
+#include "../templates.h"
+
+DEFINE_EXPR_V_VS(fmax, fmax(a, (vtype)b))
+
+DEFINE_EXPR_V_VS(fmin, fmin(a, (vtype)b))
 
-DEFINE_EXPR_F_F(half_rsqrt, rsqrt(a))
+DEFINE_EXPR_V_VI(ldexp, ldexp(a, (jtype)b))
diff --git a/lib/kernel/sleef-pocl/sin.cl b/lib/kernel/sleef-pocl/sin.cl
new file mode 100644
index 0000000..3480cd9
--- /dev/null
+++ b/lib/kernel/sleef-pocl/sin.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_sin (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_sinf_u10 (x);
+#else
+  return Sleef_sinf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sin (float2 x)
+{
+
+  float lo = _cl_sin (x.lo);
+  float hi = _cl_sin (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_sin (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_sin (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_sin (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sin (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_sinf4_u10 (x);
+#else
+  return Sleef_sinf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_sin (x.lo);
+  float2 hi = _cl_sin (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sin (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_sinf8_u10 (x);
+#else
+  return Sleef_sinf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_sin (x.lo);
+  float4 hi = _cl_sin (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sin (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_sinf16_u10 (x);
+#else
+  return Sleef_sinf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_sin (x.lo);
+  float8 hi = _cl_sin (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sin (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_sin_u10 (x);
+#else
+  return Sleef_sin_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_sin (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_sind2_u10 (x);
+#else
+  return Sleef_sind2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_sin (x.lo);
+  double hi = _cl_sin (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_sin (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_sin (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_sin (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_sin (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_sind4_u10 (x);
+#else
+  return Sleef_sind4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_sin (x.lo);
+  double2 hi = _cl_sin (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_sin (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_sind8_u10 (x);
+#else
+  return Sleef_sind8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_sin (x.lo);
+  double4 hi = _cl_sin (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_sin (double16 x)
+{
+
+  double8 lo = _cl_sin (x.lo);
+  double8 hi = _cl_sin (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/sincos.cl b/lib/kernel/sleef-pocl/sincos.cl
new file mode 100644
index 0000000..a15afd7
--- /dev/null
+++ b/lib/kernel/sleef-pocl/sincos.cl
@@ -0,0 +1,739 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_sincos (float x, global float *cosval)
+{
+  Sleef_float2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf_u10 (x);
+#else
+  temp = Sleef_sincosf_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_sincos (float3 x, global float3 *cosval)
+{
+  float4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_sincos (x_3to4, &temp);
+  *cosval = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sincos (float2 x, global float2 *cosval)
+{
+  float plo, phi;
+  float lo = _cl_sincos (x.lo, &plo);
+  float hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float2) (plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sincos (float4 x, global float4 *cosval)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf4_u10 (x);
+#else
+  temp = Sleef_sincosf4_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float2 plo, phi;
+  float2 lo = _cl_sincos (x.lo, &plo);
+  float2 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sincos (float8 x, global float8 *cosval)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf8_u10 (x);
+#else
+  temp = Sleef_sincosf8_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float4 plo, phi;
+  float4 lo = _cl_sincos (x.lo, &plo);
+  float4 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sincos (float16 x, global float16 *cosval)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf16_u10 (x);
+#else
+  temp = Sleef_sincosf16_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float8 plo, phi;
+  float8 lo = _cl_sincos (x.lo, &plo);
+  float8 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sincos (double x, global double *cosval)
+{
+  Sleef_double2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincos_u10 (x);
+#else
+  temp = Sleef_sincos_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_sincos (double3 x, global double3 *cosval)
+{
+  double4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_sincos (x_3to4, &temp);
+  *cosval = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_sincos (double16 x, global double16 *cosval)
+{
+  double8 plo, phi;
+  double8 lo = _cl_sincos (x.lo, &plo);
+  double8 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_sincos (double2 x, global double2 *cosval)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd2_u10 (x);
+#else
+  temp = Sleef_sincosd2_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double plo, phi;
+  double lo = _cl_sincos (x.lo, &plo);
+  double hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_sincos (double4 x, global double4 *cosval)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd4_u10 (x);
+#else
+  temp = Sleef_sincosd4_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double2 plo, phi;
+  double2 lo = _cl_sincos (x.lo, &plo);
+  double2 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_sincos (double8 x, global double8 *cosval)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd8_u10 (x);
+#else
+  temp = Sleef_sincosd8_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double4 plo, phi;
+  double4 lo = _cl_sincos (x.lo, &plo);
+  double4 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+_CL_OVERLOADABLE
+float
+_cl_sincos (float x, local float *cosval)
+{
+  Sleef_float2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf_u10 (x);
+#else
+  temp = Sleef_sincosf_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_sincos (float3 x, local float3 *cosval)
+{
+  float4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_sincos (x_3to4, &temp);
+  *cosval = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sincos (float2 x, local float2 *cosval)
+{
+  float plo, phi;
+  float lo = _cl_sincos (x.lo, &plo);
+  float hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float2) (plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sincos (float4 x, local float4 *cosval)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf4_u10 (x);
+#else
+  temp = Sleef_sincosf4_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float2 plo, phi;
+  float2 lo = _cl_sincos (x.lo, &plo);
+  float2 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sincos (float8 x, local float8 *cosval)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf8_u10 (x);
+#else
+  temp = Sleef_sincosf8_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float4 plo, phi;
+  float4 lo = _cl_sincos (x.lo, &plo);
+  float4 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sincos (float16 x, local float16 *cosval)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf16_u10 (x);
+#else
+  temp = Sleef_sincosf16_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float8 plo, phi;
+  float8 lo = _cl_sincos (x.lo, &plo);
+  float8 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sincos (double x, local double *cosval)
+{
+  Sleef_double2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincos_u10 (x);
+#else
+  temp = Sleef_sincos_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_sincos (double3 x, local double3 *cosval)
+{
+  double4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_sincos (x_3to4, &temp);
+  *cosval = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_sincos (double16 x, local double16 *cosval)
+{
+  double8 plo, phi;
+  double8 lo = _cl_sincos (x.lo, &plo);
+  double8 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_sincos (double2 x, local double2 *cosval)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd2_u10 (x);
+#else
+  temp = Sleef_sincosd2_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double plo, phi;
+  double lo = _cl_sincos (x.lo, &plo);
+  double hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_sincos (double4 x, local double4 *cosval)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd4_u10 (x);
+#else
+  temp = Sleef_sincosd4_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double2 plo, phi;
+  double2 lo = _cl_sincos (x.lo, &plo);
+  double2 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_sincos (double8 x, local double8 *cosval)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd8_u10 (x);
+#else
+  temp = Sleef_sincosd8_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double4 plo, phi;
+  double4 lo = _cl_sincos (x.lo, &plo);
+  double4 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+_CL_OVERLOADABLE
+float
+_cl_sincos (float x, private float *cosval)
+{
+  Sleef_float2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf_u10 (x);
+#else
+  temp = Sleef_sincosf_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+float3
+_cl_sincos (float3 x, private float3 *cosval)
+{
+  float4 temp;
+  float4 x_3to4;
+  x_3to4.xyz = x;
+  float4 r = _cl_sincos (x_3to4, &temp);
+  *cosval = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sincos (float2 x, private float2 *cosval)
+{
+  float plo, phi;
+  float lo = _cl_sincos (x.lo, &plo);
+  float hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float2) (plo, phi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sincos (float4 x, private float4 *cosval)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  Sleef_float4_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf4_u10 (x);
+#else
+  temp = Sleef_sincosf4_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float2 plo, phi;
+  float2 lo = _cl_sincos (x.lo, &plo);
+  float2 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float4) (plo, phi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sincos (float8 x, private float8 *cosval)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  Sleef_float8_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf8_u10 (x);
+#else
+  temp = Sleef_sincosf8_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float4 plo, phi;
+  float4 lo = _cl_sincos (x.lo, &plo);
+  float4 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float8) (plo, phi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sincos (float16 x, private float16 *cosval)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  Sleef_float16_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosf16_u10 (x);
+#else
+  temp = Sleef_sincosf16_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  float8 plo, phi;
+  float8 lo = _cl_sincos (x.lo, &plo);
+  float8 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (float16) (plo, phi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sincos (double x, private double *cosval)
+{
+  Sleef_double2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincos_u10 (x);
+#else
+  temp = Sleef_sincos_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+}
+
+_CL_OVERLOADABLE
+double3
+_cl_sincos (double3 x, private double3 *cosval)
+{
+  double4 temp;
+  double4 x_3to4;
+  x_3to4.xyz = x;
+  double4 r = _cl_sincos (x_3to4, &temp);
+  *cosval = temp.xyz;
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+double16
+_cl_sincos (double16 x, private double16 *cosval)
+{
+  double8 plo, phi;
+  double8 lo = _cl_sincos (x.lo, &plo);
+  double8 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double16) (plo, phi);
+  return (double16) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+double2
+_cl_sincos (double2 x, private double2 *cosval)
+{
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double2_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd2_u10 (x);
+#else
+  temp = Sleef_sincosd2_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double plo, phi;
+  double lo = _cl_sincos (x.lo, &plo);
+  double hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double2) (plo, phi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double4
+_cl_sincos (double4 x, private double4 *cosval)
+{
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double4_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd4_u10 (x);
+#else
+  temp = Sleef_sincosd4_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double2 plo, phi;
+  double2 lo = _cl_sincos (x.lo, &plo);
+  double2 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double4) (plo, phi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+double8
+_cl_sincos (double8 x, private double8 *cosval)
+{
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  Sleef_double8_2 temp;
+
+#ifdef MAX_PRECISION
+  temp = Sleef_sincosd8_u10 (x);
+#else
+  temp = Sleef_sincosd8_u35 (x);
+#endif
+
+  *cosval = temp.y;
+  return temp.x;
+#else
+
+  double4 plo, phi;
+  double4 lo = _cl_sincos (x.lo, &plo);
+  double4 hi = _cl_sincos (x.hi, &phi);
+
+  *cosval = (double8) (plo, phi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/sinh.cl b/lib/kernel/sleef-pocl/sinh.cl
new file mode 100644
index 0000000..183cf50
--- /dev/null
+++ b/lib/kernel/sleef-pocl/sinh.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_sinh (float x)
+{
+  return Sleef_sinhf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sinh (float2 x)
+{
+
+  float lo = _cl_sinh (x.lo);
+  float hi = _cl_sinh (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_sinh (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_sinh (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_sinh (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sinh (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_sinhf4_u10 (x);
+#else
+
+  float2 lo = _cl_sinh (x.lo);
+  float2 hi = _cl_sinh (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sinh (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_sinhf8_u10 (x);
+#else
+
+  float4 lo = _cl_sinh (x.lo);
+  float4 hi = _cl_sinh (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sinh (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_sinhf16_u10 (x);
+#else
+
+  float8 lo = _cl_sinh (x.lo);
+  float8 hi = _cl_sinh (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sinh (double x)
+{
+  return Sleef_sinh_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_sinh (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sinhd2_u10 (x);
+#else
+
+  double lo = _cl_sinh (x.lo);
+  double hi = _cl_sinh (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_sinh (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_sinh (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_sinh (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_sinh (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sinhd4_u10 (x);
+#else
+
+  double2 lo = _cl_sinh (x.lo);
+  double2 hi = _cl_sinh (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_sinh (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sinhd8_u10 (x);
+#else
+
+  double4 lo = _cl_sinh (x.lo);
+  double4 hi = _cl_sinh (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_sinh (double16 x)
+{
+
+  double8 lo = _cl_sinh (x.lo);
+  double8 hi = _cl_sinh (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/sinpi.cl b/lib/kernel/sleef-pocl/sinpi.cl
new file mode 100644
index 0000000..ade5712
--- /dev/null
+++ b/lib/kernel/sleef-pocl/sinpi.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_sinpi (float x)
+{
+  return Sleef_sinpif_u05 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sinpi (float2 x)
+{
+
+  float lo = _cl_sinpi (x.lo);
+  float hi = _cl_sinpi (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_sinpi (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_sinpi (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_sinpi (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sinpi (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_sinpif4_u05 (x);
+#else
+
+  float2 lo = _cl_sinpi (x.lo);
+  float2 hi = _cl_sinpi (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sinpi (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_sinpif8_u05 (x);
+#else
+
+  float4 lo = _cl_sinpi (x.lo);
+  float4 hi = _cl_sinpi (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sinpi (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_sinpif16_u05 (x);
+#else
+
+  float8 lo = _cl_sinpi (x.lo);
+  float8 hi = _cl_sinpi (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sinpi (double x)
+{
+  return Sleef_sinpi_u05 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_sinpi (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sinpid2_u05 (x);
+#else
+
+  double lo = _cl_sinpi (x.lo);
+  double hi = _cl_sinpi (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_sinpi (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_sinpi (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_sinpi (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_sinpi (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sinpid4_u05 (x);
+#else
+
+  double2 lo = _cl_sinpi (x.lo);
+  double2 hi = _cl_sinpi (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_sinpi (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sinpid8_u05 (x);
+#else
+
+  double4 lo = _cl_sinpi (x.lo);
+  double4 hi = _cl_sinpi (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_sinpi (double16 x)
+{
+
+  double8 lo = _cl_sinpi (x.lo);
+  double8 hi = _cl_sinpi (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/sqrt.cl b/lib/kernel/sleef-pocl/sqrt.cl
new file mode 100644
index 0000000..bc8a8a7
--- /dev/null
+++ b/lib/kernel/sleef-pocl/sqrt.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_sqrt (float x)
+{
+  return Sleef_sqrtf_u05 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_sqrt (float2 x)
+{
+
+  float lo = _cl_sqrt (x.lo);
+  float hi = _cl_sqrt (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_sqrt (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_sqrt (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_sqrt (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_sqrt (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_sqrtf4_u05 (x);
+#else
+
+  float2 lo = _cl_sqrt (x.lo);
+  float2 hi = _cl_sqrt (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_sqrt (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_sqrtf8_u05 (x);
+#else
+
+  float4 lo = _cl_sqrt (x.lo);
+  float4 hi = _cl_sqrt (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_sqrt (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_sqrtf16_u05 (x);
+#else
+
+  float8 lo = _cl_sqrt (x.lo);
+  float8 hi = _cl_sqrt (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_sqrt (double x)
+{
+  return Sleef_sqrt_u05 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_sqrt (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sqrtd2_u05 (x);
+#else
+
+  double lo = _cl_sqrt (x.lo);
+  double hi = _cl_sqrt (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_sqrt (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_sqrt (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_sqrt (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_sqrt (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sqrtd4_u05 (x);
+#else
+
+  double2 lo = _cl_sqrt (x.lo);
+  double2 hi = _cl_sqrt (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_sqrt (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_sqrtd8_u05 (x);
+#else
+
+  double4 lo = _cl_sqrt (x.lo);
+  double4 hi = _cl_sqrt (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_sqrt (double16 x)
+{
+
+  double8 lo = _cl_sqrt (x.lo);
+  double8 hi = _cl_sqrt (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/tan.cl b/lib/kernel/sleef-pocl/tan.cl
new file mode 100644
index 0000000..d8a6d9d
--- /dev/null
+++ b/lib/kernel/sleef-pocl/tan.cl
@@ -0,0 +1,229 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_tan (float x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_tanf_u10 (x);
+#else
+  return Sleef_tanf_u35 (x);
+#endif
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_tan (float2 x)
+{
+
+  float lo = _cl_tan (x.lo);
+  float hi = _cl_tan (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_tan (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_tan (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_tan (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_tan (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_tanf4_u10 (x);
+#else
+  return Sleef_tanf4_u35 (x);
+#endif
+
+#else
+
+  float2 lo = _cl_tan (x.lo);
+  float2 hi = _cl_tan (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_tan (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_tanf8_u10 (x);
+#else
+  return Sleef_tanf8_u35 (x);
+#endif
+
+#else
+
+  float4 lo = _cl_tan (x.lo);
+  float4 hi = _cl_tan (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_tan (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_tanf16_u10 (x);
+#else
+  return Sleef_tanf16_u35 (x);
+#endif
+
+#else
+
+  float8 lo = _cl_tan (x.lo);
+  float8 hi = _cl_tan (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_tan (double x)
+{
+
+#ifdef MAX_PRECISION
+  return Sleef_tan_u10 (x);
+#else
+  return Sleef_tan_u35 (x);
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_tan (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_tand2_u10 (x);
+#else
+  return Sleef_tand2_u35 (x);
+#endif
+
+#else
+
+  double lo = _cl_tan (x.lo);
+  double hi = _cl_tan (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_tan (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_tan (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_tan (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_tan (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_tand4_u10 (x);
+#else
+  return Sleef_tand4_u35 (x);
+#endif
+
+#else
+
+  double2 lo = _cl_tan (x.lo);
+  double2 hi = _cl_tan (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_tan (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+
+#ifdef MAX_PRECISION
+  return Sleef_tand8_u10 (x);
+#else
+  return Sleef_tand8_u35 (x);
+#endif
+
+#else
+
+  double4 lo = _cl_tan (x.lo);
+  double4 hi = _cl_tan (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_tan (double16 x)
+{
+
+  double8 lo = _cl_tan (x.lo);
+  double8 hi = _cl_tan (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/tanh.cl b/lib/kernel/sleef-pocl/tanh.cl
new file mode 100644
index 0000000..7e3fd34
--- /dev/null
+++ b/lib/kernel/sleef-pocl/tanh.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_tanh (float x)
+{
+  return Sleef_tanhf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_tanh (float2 x)
+{
+
+  float lo = _cl_tanh (x.lo);
+  float hi = _cl_tanh (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_tanh (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_tanh (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_tanh (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_tanh (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_tanhf4_u10 (x);
+#else
+
+  float2 lo = _cl_tanh (x.lo);
+  float2 hi = _cl_tanh (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_tanh (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_tanhf8_u10 (x);
+#else
+
+  float4 lo = _cl_tanh (x.lo);
+  float4 hi = _cl_tanh (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_tanh (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_tanhf16_u10 (x);
+#else
+
+  float8 lo = _cl_tanh (x.lo);
+  float8 hi = _cl_tanh (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_tanh (double x)
+{
+  return Sleef_tanh_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_tanh (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tanhd2_u10 (x);
+#else
+
+  double lo = _cl_tanh (x.lo);
+  double hi = _cl_tanh (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_tanh (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_tanh (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_tanh (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_tanh (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tanhd4_u10 (x);
+#else
+
+  double2 lo = _cl_tanh (x.lo);
+  double2 hi = _cl_tanh (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_tanh (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tanhd8_u10 (x);
+#else
+
+  double4 lo = _cl_tanh (x.lo);
+  double4 hi = _cl_tanh (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_tanh (double16 x)
+{
+
+  double8 lo = _cl_tanh (x.lo);
+  double8 hi = _cl_tanh (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/tgamma.cl b/lib/kernel/sleef-pocl/tgamma.cl
new file mode 100644
index 0000000..cac30f6
--- /dev/null
+++ b/lib/kernel/sleef-pocl/tgamma.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_tgamma (float x)
+{
+  return Sleef_tgammaf_u10 (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_tgamma (float2 x)
+{
+
+  float lo = _cl_tgamma (x.lo);
+  float hi = _cl_tgamma (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_tgamma (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_tgamma (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_tgamma (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_tgamma (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_tgammaf4_u10 (x);
+#else
+
+  float2 lo = _cl_tgamma (x.lo);
+  float2 hi = _cl_tgamma (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_tgamma (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_tgammaf8_u10 (x);
+#else
+
+  float4 lo = _cl_tgamma (x.lo);
+  float4 hi = _cl_tgamma (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_tgamma (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_tgammaf16_u10 (x);
+#else
+
+  float8 lo = _cl_tgamma (x.lo);
+  float8 hi = _cl_tgamma (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_tgamma (double x)
+{
+  return Sleef_tgamma_u10 (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_tgamma (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tgammad2_u10 (x);
+#else
+
+  double lo = _cl_tgamma (x.lo);
+  double hi = _cl_tgamma (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_tgamma (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_tgamma (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_tgamma (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_tgamma (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tgammad4_u10 (x);
+#else
+
+  double2 lo = _cl_tgamma (x.lo);
+  double2 hi = _cl_tgamma (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_tgamma (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_tgammad8_u10 (x);
+#else
+
+  double4 lo = _cl_tgamma (x.lo);
+  double4 hi = _cl_tgamma (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_tgamma (double16 x)
+{
+
+  double8 lo = _cl_tgamma (x.lo);
+  double8 hi = _cl_tgamma (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef-pocl/trunc.cl b/lib/kernel/sleef-pocl/trunc.cl
new file mode 100644
index 0000000..7ba32b0
--- /dev/null
+++ b/lib/kernel/sleef-pocl/trunc.cl
@@ -0,0 +1,183 @@
+#include "sleef_cl.h"
+
+_CL_OVERLOADABLE
+float
+_cl_trunc (float x)
+{
+  return Sleef_truncf (x);
+}
+
+_CL_OVERLOADABLE
+float2
+_cl_trunc (float2 x)
+{
+
+  float lo = _cl_trunc (x.lo);
+  float hi = _cl_trunc (x.hi);
+  return (float2) (lo, hi);
+}
+
+_CL_OVERLOADABLE
+float4 _cl_trunc (float4);
+
+_CL_OVERLOADABLE
+float3
+_cl_trunc (float3 x)
+{
+
+  float4 x_3to4 = (float4) (x, (float)0);
+
+  float4 r = _cl_trunc (x_3to4);
+  return r.xyz;
+}
+
+_CL_OVERLOADABLE
+float4
+_cl_trunc (float4 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE)
+  return Sleef_truncf4 (x);
+#else
+
+  float2 lo = _cl_trunc (x.lo);
+  float2 hi = _cl_trunc (x.hi);
+  return (float4) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float8
+_cl_trunc (float8 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE)
+  return Sleef_truncf8 (x);
+#else
+
+  float4 lo = _cl_trunc (x.lo);
+  float4 hi = _cl_trunc (x.hi);
+  return (float8) (lo, hi);
+
+#endif
+}
+
+_CL_OVERLOADABLE
+float16
+_cl_trunc (float16 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE)
+  return Sleef_truncf16 (x);
+#else
+
+  float8 lo = _cl_trunc (x.lo);
+  float8 hi = _cl_trunc (x.hi);
+  return (float16) (lo, hi);
+
+#endif
+}
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double
+_cl_trunc (double x)
+{
+  return Sleef_trunc (x);
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double2
+_cl_trunc (double2 x)
+{
+
+#if defined(SLEEF_VEC_128_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_truncd2 (x);
+#else
+
+  double lo = _cl_trunc (x.lo);
+  double hi = _cl_trunc (x.hi);
+  return (double2) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4 _cl_trunc (double4);
+
+_CL_OVERLOADABLE
+double3
+_cl_trunc (double3 x)
+{
+
+  double4 x_3to4 = (double4) (x, (double)0);
+
+  double4 r = _cl_trunc (x_3to4);
+  return r.xyz;
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double4
+_cl_trunc (double4 x)
+{
+
+#if defined(SLEEF_VEC_256_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_truncd4 (x);
+#else
+
+  double2 lo = _cl_trunc (x.lo);
+  double2 hi = _cl_trunc (x.hi);
+  return (double4) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double8
+_cl_trunc (double8 x)
+{
+
+#if defined(SLEEF_VEC_512_AVAILABLE) && defined(SLEEF_DOUBLE_VEC_AVAILABLE)
+  return Sleef_truncd8 (x);
+#else
+
+  double4 lo = _cl_trunc (x.lo);
+  double4 hi = _cl_trunc (x.hi);
+  return (double8) (lo, hi);
+
+#endif
+}
+
+#endif /* cl_khr_fp64 */
+
+#ifdef cl_khr_fp64
+
+_CL_OVERLOADABLE
+double16
+_cl_trunc (double16 x)
+{
+
+  double8 lo = _cl_trunc (x.lo);
+  double8 hi = _cl_trunc (x.hi);
+  return (double16) (lo, hi);
+}
+
+#endif /* cl_khr_fp64 */
diff --git a/lib/kernel/sleef/arch/helperadvsimd.h b/lib/kernel/sleef/arch/helperadvsimd.h
new file mode 100644
index 0000000..ac76221
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperadvsimd.h
@@ -0,0 +1,701 @@
+/*********************************************************************/
+/*          Copyright ARM Ltd. 2010 - 2017.                          */
+/* Distributed under the Boost Software License, Version 1.0.        */
+/*    (See accompanying file LICENSE.txt or copy at                  */
+/*          http://www.boost.org/LICENSE_1_0.txt)                    */
+/*********************************************************************/
+
+#ifndef __ARM_NEON
+#error Please specify advsimd flags.
+#endif
+
+#include <arm_neon.h>
+#include <stdint.h>
+
+#include "misc.h"
+
+#define ENABLE_DP
+#define LOG2VECTLENDP 1
+#define VECTLENDP (1 << LOG2VECTLENDP)
+#define ENABLE_FMA_DP
+
+#define ENABLE_SP
+#define LOG2VECTLENSP 2
+#define VECTLENSP (1 << LOG2VECTLENSP)
+#define ENABLE_FMA_SP
+
+#define FULL_FP_ROUNDING
+
+#define ISANAME "AArch64 AdvSIMD"
+
+#ifdef SLEEF_SINGLE_MINMAXNUM_AVAILABLE
+#undef SLEEF_SINGLE_MINMAXNUM_AVAILABLE
+#endif
+#define SLEEF_SINGLE_MINMAXNUM_AVAILABLE 1
+
+#ifdef SLEEF_DOUBLE_MINMAXNUM_AVAILABLE
+#undef SLEEF_DOUBLE_MINMAXNUM_AVAILABLE
+#endif
+#define SLEEF_DOUBLE_MINMAXNUM_AVAILABLE 1
+
+// Mask definition
+typedef uint32x4_t vmask;
+typedef uint32x4_t vopmask;
+
+// Single precision definitions
+typedef float32x4_t vfloat;
+typedef int32x4_t vint2;
+
+// Double precision definitions
+typedef float64x2_t vdouble;
+typedef int32x2_t vint;
+
+#define DFTPRIORITY 10
+
+static INLINE int vavailability_i(int name) { return 3; }
+static INLINE void vprefetch_v_p(const void *ptr) { }
+
+static INLINE int vtestallones_i_vo32(vopmask g) {
+  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
+  uint32x2_t x1 = vpmin_u32(x0, x0);
+  return vget_lane_u32(x1, 0);
+}
+
+static INLINE int vtestallones_i_vo64(vopmask g) {
+  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
+  uint32x2_t x1 = vpmin_u32(x0, x0);
+  return vget_lane_u32(x1, 0);
+}
+
+// Vector load / store
+static INLINE vdouble vload_vd_p(const double *ptr) { return vld1q_f64(ptr); }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { return vld1q_f64(ptr); }
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { vst1q_f64(ptr, v); }
+static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(ptr); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
+static INLINE vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
+static INLINE void vstoreu_v_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
+static INLINE vint vloadu_vi_p(int32_t *p) { return vld1_s32(p); }
+static INLINE void vstoreu_v_p_vi(int32_t *p, vint v) { vst1_s32(p, v); }
+
+// Basic logical operations for mask
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) {
+  return vbicq_u32(y, x);
+}
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
+
+// Mask <--> single precision reinterpret
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) {
+  return vreinterpretq_u32_f32(vf);
+}
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) {
+  return vreinterpretq_f32_u32(vm);
+}
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return vreinterpretq_s32_u32(vm); }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return vreinterpretq_u32_s32(vi); }
+
+// Mask <--> double precision reinterpret
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) {
+  return vreinterpretq_u32_f64(vd);
+}
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) {
+  return vreinterpretq_f64_u32(vm);
+}
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) {
+  return vreinterpretq_f32_s32(vm);
+}
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) {
+  return vreinterpretq_s32_f32(vf);
+}
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) {
+  return vreinterpretq_s32_f64(vd);
+}
+
+/****************************************/
+/* Single precision FP operations */
+/****************************************/
+// Broadcast
+static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
+
+// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) {
+  return vaddq_f32(x, y);
+}
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) {
+  return vsubq_f32(x, y);
+}
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) {
+  return vmulq_f32(x, y);
+}
+static INLINE vfloat vrec_vf_vf(vfloat d) {
+  return vdivq_f32(vcast_vf_f(1.0f), d);
+}
+static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
+  return vdivq_f32(n, d);
+}
+static INLINE vfloat vsqrt_vf_vf(vfloat d) { return vsqrtq_f32(d); }
+
+// Multiply accumulate: z = z + x * y
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
+  return vfmaq_f32(z, x, y);
+}
+// Multiply subtract: z = z = x * y
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) {
+  return vfmsq_f32(z, x, y);
+}
+
+// |x|, -x
+static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
+static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
+
+// max, min
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) {
+  return vmaxq_f32(x, y);
+}
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) {
+  return vminq_f32(x, y);
+}
+
+// max number, min number
+static INLINE vfloat vmaxnum_vf_vf_vf(vfloat x, vfloat y) {
+  return vmaxnmq_f32(x, y);
+}
+static INLINE vfloat vminnum_vf_vf_vf(vfloat x, vfloat y) {
+  return vminnmq_f32(x, y);
+}
+
+// Comparisons
+static INLINE vmask veq_vm_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
+static INLINE vmask vneq_vm_vf_vf(vfloat x, vfloat y) {
+  return vmvnq_u32(vceqq_f32(x, y));
+}
+static INLINE vmask vlt_vm_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
+static INLINE vmask vle_vm_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
+static INLINE vmask vgt_vm_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
+static INLINE vmask vge_vm_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
+
+// Conditional select
+static INLINE vfloat vsel_vf_vm_vf_vf(vmask mask, vfloat x, vfloat y) {
+  return vbslq_f32(mask, x, y);
+}
+
+// int <--> float conversions
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
+static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
+static INLINE vint2 vrint_vi2_vf(vfloat d) {
+  return vcvtq_s32_f32(vrndaq_f32(d));
+}
+
+/***************************************/
+/* Single precision integer operations */
+/***************************************/
+
+// Add, Sub, Neg (-x)
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vaddq_s32(x, y);
+}
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vsubq_s32(x, y);
+}
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
+
+// Logical operations
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vandq_s32(x, y);
+}
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vbicq_s32(y, x);
+}
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vorrq_s32(x, y);
+}
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return veorq_s32(x, y);
+}
+
+// Shifts
+#if defined(__clang__)
+#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
+#define vsrl_vi2_vi2_i(x, c)                                                   \
+  vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
+
+#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
+#define vsra_vi_vi_i(x, c) vshr_n_s32(x, c)
+#define vsll_vi_vi_i(x, c) vshl_n_s32(x, c)
+#define vsrl_vi_vi_i(x, c)                                                     \
+  vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c))
+
+static INLINE vint2 vsrl64_vi2_vi_52(vint2 x) {
+  return vreinterpretq_s32_u64(vshrq_n_u64(vreinterpretq_u64_s32(x), 52));
+}
+
+static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 a, vint2 b) {
+  return vreinterpretq_s32_s64(vaddq_s64(vreinterpretq_s64_s32(a), vreinterpretq_s64_s32(b)));
+}
+
+static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 a, vint2 b) {
+  return vreinterpretq_s32_s64(vsubq_s64(vreinterpretq_s64_s32(a), vreinterpretq_s64_s32(b)));
+}
+
+static INLINE vint2 vcast_vi2_i64(int64_t l) {
+  return vreinterpretq_s32_s64(vdupq_n_s64(l));
+}
+
+static INLINE vmask vgt_cvt_vo_vi_vi(vint a, vint b) {
+  return vreinterpretq_u32_s64(vmovl_s32(vreinterpret_s32_u32(vcgt_s32(a, b))));
+}
+
+static INLINE vmask veq_cvt_vo_vi_vi(vint a, vint b) {
+  return vreinterpretq_u32_s64(vmovl_s32(vreinterpret_s32_u32(vceq_s32(a, b))));
+}
+
+#else
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, const int c) {
+  return vshlq_n_s32(x, c);
+}
+static INLINE vint vsll_vi_vi_i(vint x, const int c) {
+  return vshl_n_s32(x, c);
+}
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, const int c) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c));
+}
+static INLINE vint vsrl_vi_vi_i(vint x, const int c) {
+  return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(x), c));
+}
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, const int c) {
+  return vshrq_n_s32(x, c);
+}
+static INLINE vint vsra_vi_vi_i(vint x, const int c) {
+  return vshr_n_s32(x, c);
+}
+#endif
+
+
+
+// Comparison returning masks
+static INLINE vmask veq_vm_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
+static INLINE vmask vgt_vm_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }
+// Comparison returning integers
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vreinterpretq_s32_u32(vcgeq_s32(x, y));
+}
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
+  return vreinterpretq_s32_u32(vceqq_s32(x, y));
+}
+
+// Conditional select
+static INLINE vint2 vsel_vi2_vm_vi2_vi2(vmask m, vint2 x, vint2 y) {
+  return vbslq_s32(m, x, y);
+}
+
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+/* -------------------------------------------------------------------------- */
+
+/****************************************/
+/* Double precision FP operations */
+/****************************************/
+// Broadcast
+static INLINE vdouble vcast_vd_d(double f) { return vdupq_n_f64(f); }
+
+// Add, Sub, Mul, Reciprocal 1/x, Division, Square root
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) {
+  return vaddq_f64(x, y);
+}
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) {
+  return vsubq_f64(x, y);
+}
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) {
+  return vmulq_f64(x, y);
+}
+static INLINE vdouble vrec_vd_vd(vdouble d) {
+  return vdivq_f64(vcast_vd_d(1.0f), d);
+}
+static INLINE vdouble vdiv_vd_vd_vd(vdouble n, vdouble d) {
+  return vdivq_f64(n, d);
+}
+static INLINE vdouble vsqrt_vd_vd(vdouble d) { return vsqrtq_f64(d); }
+
+// |x|, -x
+static INLINE vdouble vabs_vd_vd(vdouble f) { return vabsq_f64(f); }
+static INLINE vdouble vneg_vd_vd(vdouble f) { return vnegq_f64(f); }
+
+// max, min
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) {
+  return vmaxq_f64(x, y);
+}
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) {
+  return vminq_f64(x, y);
+}
+
+// max number, min number
+static INLINE vdouble vmaxnum_vd_vd_vd(vdouble x, vdouble y) {
+  return vmaxnmq_f64(x, y);
+}
+static INLINE vdouble vminnum_vd_vd_vd(vdouble x, vdouble y) {
+  return vminnmq_f64(x, y);
+}
+
+// Multiply accumulate: z = z + x * y
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
+  return vfmaq_f64(z, x, y);
+}
+
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
+  return vfmsq_f64(z, x, y);
+}
+
+static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z + x * y
+  return vfmaq_f64(z, x, y);
+}
+
+static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // z - x * y
+  return vfmsq_f64(z, x, y);
+}
+
+//[z = x * y - z]
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) {
+  return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
+}
+
+static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { // x * y - z
+  return vneg_vd_vd(vfmanp_vd_vd_vd_vd(x, y, z));
+}
+
+static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z + x * y
+  return vfmaq_f32(z, x, y);
+}
+
+static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // z - x * y
+  return vfmsq_f32(z, x, y);
+}
+
+static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { // x * y - z
+  return vneg_vf_vf(vfmanp_vf_vf_vf_vf(x, y, z));
+}
+
+/* Comparisons */
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) {
+  return vreinterpretq_u32_u64(vceqq_f64(x, y));
+}
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) {
+  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(x, y)));
+}
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) {
+  return vreinterpretq_u32_u64(vcltq_f64(x, y));
+}
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) {
+  return vreinterpretq_u32_u64(vcgtq_f64(x, y));
+}
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) {
+  return vreinterpretq_u32_u64(vcleq_f64(x, y));
+}
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) {
+  return vreinterpretq_u32_u64(vcgeq_f64(x, y));
+}
+
+// Conditional select
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
+  return vbslq_f64(vreinterpretq_u64_u32(mask), x, y);
+}
+
+#if 1
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+#else
+// This implementation is slower on the current CPU models (as of May 2017.)
+// I(Naoki Shibata) expect that on future CPU models with hardware similar to Super Shuffle Engine, this implementation will be faster.
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double d0, double d1) {
+  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
+          (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 });
+
+  uint8x16_t tab = (uint8x16_t) (float64x2_t) { d0, d1 };
+  return (vdouble) vqtbl1q_u8(tab, idx);
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  uint8x16_t idx = vbslq_u8(vreinterpretq_u8_u32(o0), (uint8x16_t) { 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 },
+          vbslq_u8(vreinterpretq_u8_u32(o1), (uint8x16_t) { 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15 },
+             vbslq_u8(vreinterpretq_u8_u32(o2), (uint8x16_t) { 16, 17, 18, 19, 20, 21, 22, 23, 16, 17, 18, 19, 20, 21, 22, 23 },
+                (uint8x16_t) { 24, 25, 26, 27, 28, 29, 30, 31, 24, 25, 26, 27, 28, 29, 30, 31 })));
+
+  uint8x16x2_t tab = { { (uint8x16_t) (float64x2_t) { d0, d1 }, (uint8x16_t) (float64x2_t) { d2, d3 } } };
+  return (vdouble) vqtbl2q_u8(tab, idx);
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
+}
+#endif
+
+static INLINE vdouble vrint_vd_vd(vdouble d) { return vrndnq_f64(d); }
+static INLINE vfloat vrint_vf_vf(vfloat d) { return vrndnq_f32(d); }
+
+/****************************************/
+/* int <--> float conversions           */
+/****************************************/
+static INLINE vint vtruncate_vi_vd(vdouble vf) {
+  return vmovn_s64(vcvtq_s64_f64(vf));
+}
+static INLINE vdouble vcast_vd_vi(vint vi) {
+  return vcvtq_f64_s64(vmovl_s32(vi));
+}
+static INLINE vint vcast_vi_i(int i) { return vdup_n_s32(i); }
+static INLINE vint vrint_vi_vd(vdouble d) {
+  return vqmovn_s64(vcvtq_s64_f64(vrndaq_f64(d)));
+}
+
+/***************************************/
+/* Integer operations */
+/***************************************/
+
+// Add, Sub, Neg (-x)
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return vadd_s32(x, y); }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return vsub_s32(x, y); }
+static INLINE vint vneg_vi_vi(vint e) { return vneg_s32(e); }
+
+// Logical operations
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return vand_s32(x, y); }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return vbic_s32(y, x); }
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return vorr_s32(x, y); }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return veor_s32(x, y); }
+
+// Comparison returning masks
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
+  return vcombine_u32(vceq_s32(x, y), vdup_n_u32(0));
+}
+
+// Conditional select
+static INLINE vint vsel_vi_vm_vi_vi(vmask m, vint x, vint y) {
+  return vbsl_s32(vget_low_u32(m), x, y);
+}
+
+/***************************************/
+/* Predicates                          */
+/***************************************/
+static INLINE vopmask visinf_vo_vd(vdouble d) {
+  const float64x2_t inf = vdupq_n_f64(INFINITY);
+  const float64x2_t neg_inf = vdupq_n_f64(-INFINITY);
+  uint64x2_t cmp = vorrq_u64(vceqq_f64(d, inf), vceqq_f64(d, neg_inf));
+  return vreinterpretq_u32_u64(cmp);
+}
+
+static INLINE vopmask visnan_vo_vd(vdouble d) {
+  return vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(d, d)));
+}
+
+static INLINE vopmask vispinf_vo_vd(vdouble d) {
+  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(INFINITY)));
+}
+
+static INLINE vopmask visminf_vo_vd(vdouble d) {
+  return vreinterpretq_u32_u64(vceqq_f64(d, vdupq_n_f64(-INFINITY)));
+}
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
+  return vbslq_f32(mask, x, y);
+}
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) {
+  return vceqq_f32(x, y);
+}
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) {
+  return vmvnq_u32(vceqq_f32(x, y));
+}
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) {
+  return vcltq_f32(x, y);
+}
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) {
+  return vcleq_f32(x, y);
+}
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) {
+  return vcgtq_f32(x, y);
+}
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) {
+  return vcgeq_f32(x, y);
+}
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
+  return vceqq_s32(x, y);
+}
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
+  return vcgeq_s32(x, y);
+}
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
+  return vcombine_u32(vcge_s32(x, y), vdup_n_u32(0));
+}
+static INLINE vopmask visinf_vo_vf(vfloat d) {
+  return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf));
+}
+static INLINE vopmask vispinf_vo_vf(vfloat d) {
+  return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf));
+}
+static INLINE vopmask visminf_vo_vf(vfloat d) {
+  return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf));
+}
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask m) {
+  return vuzpq_u32(m, m).val[0];
+}
+static INLINE vopmask vcast_vo64_vo32(vopmask m) {
+  return vzipq_u32(m, m).val[0];
+}
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) {
+  return vandq_u32(x, y);
+}
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) {
+  return vbicq_u32(y, x);
+}
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) {
+  return vorrq_u32(x, y);
+}
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) {
+  return veorq_u32(x, y);
+}
+
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
+  return vbslq_s32(m, x, y);
+}
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
+  return vandq_s32(vreinterpretq_s32_u32(x), y);
+}
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) {
+  return vbicq_s32(y, vreinterpretq_s32_u32(x));
+}
+static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) {
+  return vbic_s32(y, vget_low_s32(vreinterpretq_s32_u32(x)));
+}
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) {
+  return vandq_u32(x, y);
+}
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) {
+  return vandq_u32(x, y);
+}
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) {
+  return vbicq_u32(y, x);
+}
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) {
+  return vbicq_u32(y, x);
+}
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) {
+  return vorrq_u32(x, y);
+}
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) {
+  return vorrq_u32(x, y);
+}
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) {
+  return veorq_u32(x, y);
+}
+
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vrndq_f32(vd); }
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) {
+  return vreinterpretq_u32_u64(vdupq_n_u64((0xffffffff & (uint64_t)i1) | (((uint64_t)i0) << 32)));
+}
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
+  return vreinterpretq_u32_u64(vceqq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
+}
+
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
+  return vreinterpretq_u32_s64(vaddq_s64(vreinterpretq_s64_u32(x), vreinterpretq_s64_u32(y)));
+}
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
+  return vbsl_s32(vget_low_u32(m), x, y);
+}
+
+// Logical operations
+static INLINE vint vand_vi_vo_vi(vopmask x, vint y) {
+  return vand_s32(vreinterpret_s32_u32(vget_low_u32(x)), y);
+}
+
+static INLINE vint2 vcastu_vi2_vi(vint vi) {
+  return vreinterpretq_s32_u32(vrev64q_u32(vreinterpretq_u32_u64(vmovl_u32(vreinterpret_u32_s32(vi)))));
+}
+static INLINE vint vcastu_vi_vi2(vint2 vi2) {
+  return vreinterpret_s32_u32(vmovn_u64(vreinterpretq_u64_u32(vrev64q_u32(vreinterpretq_u32_s32(vi2)))));
+}
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) {
+  return vreinterpretq_f64_s32(vi);
+}
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vrndq_f64(vd); }
+
+//
+
+#define PNMASK ((vdouble) { +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0 })
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) { return (float64x2_t)vcombine_u64(vget_high_u64((uint64x2_t)d0), vget_low_u64((uint64x2_t)d0)); }
+static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { vstore_v_p_vd(ptr, v); }
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
+static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
+  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
+  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
+}
+
+// TODO
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+  return 0;
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  return 0;
+}
diff --git a/lib/kernel/sleef/arch/helperavx.h b/lib/kernel/sleef/arch/helperavx.h
new file mode 100644
index 0000000..2320161
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperavx.h
@@ -0,0 +1,539 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#if CONFIG == 1
+
+#if !defined(__AVX__)
+#error Please specify -mavx.
+#endif
+
+#elif CONFIG == 4
+
+#define ENABLE_FMA_DP
+#define ENABLE_FMA_SP
+
+#if !defined(__AVX__) || !defined(__FMA4__)
+#error Please specify -mavx and -mfma4.
+#endif
+
+#else
+#error CONFIG macro invalid or not defined
+#endif
+
+#define ENABLE_DP
+#define LOG2VECTLENDP 2
+#define VECTLENDP (1 << LOG2VECTLENDP)
+
+#define ENABLE_SP
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+
+#define FULL_FP_ROUNDING
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef __m256i vmask;
+typedef __m256i vopmask;
+
+typedef __m256d vdouble;
+typedef __m128i vint;
+
+typedef __m256 vfloat;
+typedef struct { __m128i x, y; } vint2;
+
+//
+
+static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
+
+static INLINE int vtestallones_i_vo32(vopmask g) {
+  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
+}
+
+static INLINE int vtestallones_i_vo64(vopmask g) {
+  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
+}
+
+//
+
+static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm);  }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) {
+  vint2 r;
+  r.x = _mm256_castsi256_si128(vreinterpret_vm_vd(vd));
+  r.y = _mm256_extractf128_si256(vreinterpret_vm_vd(vd), 1);
+  return r;
+}
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) {
+  vmask m = _mm256_castsi128_si256(vi.x);
+  m = _mm256_insertf128_si256(m, vi.y, 1);
+  return vreinterpret_vd_vm(m);
+}
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) {
+  vint2 r;
+  r.x = _mm_loadu_si128((__m128i *) p     );
+  r.y = _mm_loadu_si128((__m128i *)(p + 4));
+  return r;
+}
+
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
+  _mm_storeu_si128((__m128i *) p     , v.x);
+  _mm_storeu_si128((__m128i *)(p + 4), v.y);
+}
+
+static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
+static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask o) {
+  return _mm256_castsi128_si256(_mm256_cvtpd_epi32(_mm256_and_pd(vreinterpret_vd_vm(o), _mm256_set1_pd(-1.0))));
+}
+
+static INLINE vopmask vcast_vo64_vo32(vopmask o) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(_mm256_cvtepi32_pd(_mm256_castsi256_si128(o)), _mm256_set1_pd(-1.0), _CMP_EQ_OQ));
+}
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
+static INLINE vint2 vcastu_vi2_vi(vint vi) {
+  vint2 r;
+  r.x = _mm_and_si128(_mm_shuffle_epi32(vi, 0x40), _mm_set_epi32(-1, 0, -1, 0));
+  r.y = _mm_and_si128(_mm_shuffle_epi32(vi, 0xc8), _mm_set_epi32(-1, 0, -1, 0));
+  return r;
+}
+
+static INLINE vint vcastu_vi_vi2(vint2 vi) {
+  return _mm_or_si128(_mm_and_si128(_mm_shuffle_epi32(vi.x, 0x0d), _mm_set_epi32( 0,  0, -1, -1)),
+          _mm_and_si128(_mm_shuffle_epi32(vi.y, 0xd0), _mm_set_epi32(-1, -1,  0,  0)));
+}
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) {
+  return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
+}
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(vreinterpret_vd_vm(vxor_vm_vm_vm(vxor_vm_vm_vm(x, y), vreinterpret_vm_vd(_mm256_set1_pd(1.0)))), _mm256_set1_pd(1.0), _CMP_EQ_OQ));
+}
+
+static INLINE vint2 vsrl64_vi2_vi(vint2 x, int i) {
+  vint2 vi = { _mm_srli_epi64(x.x, i), _mm_srli_epi64(x.y, i) };
+  return vi;
+}
+
+static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_add_epi64(x.x, y.x), _mm_add_epi64(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_sub_epi64(x.x, y.x), _mm_sub_epi64(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vcast_vi2_i64(long x) {
+  vint2 vi = { _mm_set1_epi64x(x), _mm_set1_epi64x(x) };
+  return vi;
+}
+
+
+//
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
+static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
+static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
+static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
+static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
+
+#if CONFIG == 1
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+#else
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
+static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
+static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_macc_pd(x, y, z); }
+static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_msub_pd(x, y, z); }
+static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmacc_pd(x, y, z); }
+static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_nmsub_pd(x, y, z); }
+#endif
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
+
+//
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
+static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
+
+static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
+
+static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
+
+static INLINE vopmask veq_cvt_vo_vi_vi(vint x, vint y) {
+  return _mm256_cvtps_pd(_mm_and_ps(_mm_cmpeq_epi32(x, y), _mm_set_ps1(-0.0f)));
+}
+static INLINE vopmask vgt_cvt_vo_vi_vi(vint x, vint y) {
+  return _mm256_cvtps_pd(_mm_and_ps(_mm_cmpgt_epi32(x, y), _mm_set_ps1(-0.0f)));
+}
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask o, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(o)); }
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
+
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask vispinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask visminf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask visnan_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
+}
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
+
+#if defined(_MSC_VER)
+// This function is needed when debugging on MSVC.
+static INLINE double vcast_d_vd(vdouble v) {
+  double a[VECTLENDP];
+  vstoreu_v_p_vd(a, v);
+  return a[0];
+}
+#endif
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) {
+  vint2 r;
+  r.x = _mm256_castsi256_si128(vm);
+  r.y = _mm256_extractf128_si256(vm, 1);
+  return r;
+}
+
+static INLINE vmask vcast_vm_vi2(vint2 vi) {
+  vmask m = _mm256_castsi128_si256(vi.x);
+  m = _mm256_insertf128_si256(m, vi.y, 1);
+  return m;
+}
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
+static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
+static INLINE vint2 vcast_vi2_i(int i) { vint2 r; r.x = r.y = _mm_set1_epi32(i); return r; }
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
+
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
+static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
+static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
+static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
+
+#if CONFIG == 1
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
+#else
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
+static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
+static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_macc_ps(x, y, z); }
+static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_msub_ps(x, y, z); }
+static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmacc_ps(x, y, z); }
+static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_nmsub_ps(x, y, z); }
+#endif
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_add_epi32(x.x, y.x), _mm_add_epi32(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_sub_epi32(x.x, y.x), _mm_sub_epi32(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vneg_vi2_vi2(vint2 e) {
+  vint2 vi = { _mm_sub_epi32(_mm_set1_epi32(0), e.x), _mm_sub_epi32(_mm_set1_epi32(0), e.y) };
+  return vi;
+}
+
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_and_si128(x.x, y.x), _mm_and_si128(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_andnot_si128(x.x, y.x), _mm_andnot_si128(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_or_si128(x.x, y.x), _mm_or_si128(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 vi = { _mm_xor_si128(x.x, y.x), _mm_xor_si128(x.y, y.y) };
+  return vi;
+}
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
+  vint2 vi = { _mm_slli_epi32(x.x, c), _mm_slli_epi32(x.y, c) };
+  return vi;
+}
+
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
+  vint2 vi = { _mm_srli_epi32(x.x, c), _mm_srli_epi32(x.y, c) };
+  return vi;
+}
+
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) {
+  vint2 vi = { _mm_srai_epi32(x.x, c), _mm_srai_epi32(x.y, c) };
+  return vi;
+}
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) {
+  vint2 r;
+  r.x = _mm_cmpeq_epi32(x.x, y.x);
+  r.y = _mm_cmpeq_epi32(x.y, y.y);
+  return vcast_vm_vi2(r);
+}
+
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) {
+  vint2 r;
+  r.x = _mm_cmpgt_epi32(x.x, y.x);
+  r.y = _mm_cmpgt_epi32(x.y, y.y);
+  return vcast_vm_vi2(r);
+}
+
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 r;
+  r.x = _mm_cmpeq_epi32(x.x, y.x);
+  r.y = _mm_cmpeq_epi32(x.y, y.y);
+  return r;
+}
+
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
+  vint2 r;
+  r.x = _mm_cmpgt_epi32(x.x, y.x);
+  r.y = _mm_cmpgt_epi32(x.y, y.y);
+  return r;
+}
+
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
+  vint2 n = vcast_vi2_vm(m);
+  vint2 r = { _mm_blendv_epi8(y.x, x.x, n.x), _mm_blendv_epi8(y.y, x.y, n.y) };
+  return r;
+}
+
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
+  vint2 ix = vcast_vi2_vm(x), iy = vcast_vi2_vm(y), iz;
+  iz.x = _mm_add_epi64(ix.x, iy.x);
+  iz.y = _mm_add_epi64(ix.y, iy.y);
+  return vcast_vm_vi2(iz);
+}
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf)); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+//
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+  vdouble gt = _mm256_cmp_pd(x, lim, _CMP_GT_OQ);
+  return _mm256_testz_pd(gt, gt);
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  vfloat gt = _mm256_cmp_ps(x, lim, _CMP_GT_OQ);
+  return _mm256_testz_ps(gt, gt);
+}
+
+
+//
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
+
+#ifdef _MSC_VER
+// This function is needed when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) {
+  float a[VECTLENSP];
+  vstoreu_v_p_vf(a, v);
+  return a[0];
+}
+#endif
+//
+
+#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
+
+#if CONFIG == 1
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+#else
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
+#endif
+
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) { return  _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
+static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
+  _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
+  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
+}
+
+//
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
+static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
+  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
diff --git a/lib/kernel/sleef/arch/helperavx2.h b/lib/kernel/sleef/arch/helperavx2.h
new file mode 100644
index 0000000..895ded6
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperavx2.h
@@ -0,0 +1,397 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#if CONFIG == 1
+
+#ifndef __AVX2__
+#error Please specify -mavx2.
+#endif
+
+#else
+#error CONFIG macro invalid or not defined
+#endif
+
+#define ENABLE_DP
+#define LOG2VECTLENDP 2
+#define VECTLENDP (1 << LOG2VECTLENDP)
+#define ENABLE_FMA_DP
+
+#define ENABLE_SP
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+#define ENABLE_FMA_SP
+
+#define FULL_FP_ROUNDING
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef __m256i vmask;
+typedef __m256i vopmask;
+
+typedef __m256d vdouble;
+typedef __m128i vint;
+
+typedef __m256 vfloat;
+typedef __m256i vint2;
+
+//
+
+static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
+
+static INLINE int vtestallones_i_vo32(vopmask g) {
+  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
+}
+
+static INLINE int vtestallones_i_vo64(vopmask g) {
+  return _mm_test_all_ones(_mm_and_si128(_mm256_extractf128_si256(g, 0), _mm256_extractf128_si256(g, 1)));
+}
+
+//
+
+static INLINE vdouble vcast_vd_d(double d) { return _mm256_set1_pd(d); }
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm256_castsi256_pd(vm);  }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm256_castpd_si256(vd); }
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm256_castsi256_pd(vi); }
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm256_storeu_si256((__m256i *)p, v); }
+static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
+static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm256_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask o) {
+  return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(0, 0, 0, 0, 6, 4, 2, 0));
+}
+
+static INLINE vopmask vcast_vo64_vo32(vopmask o) {
+  return _mm256_permutevar8x32_epi32(o, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
+}
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm256_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm256_cvttpd_epi32(vd); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm256_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm256_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm256_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm256_cvtepi32_pd(vi); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
+
+static INLINE vint2 vcastu_vi2_vi(vint vi) {
+  return _mm256_slli_epi64(_mm256_cvtepi32_epi64(vi), 32);
+}
+
+static INLINE vint vcastu_vi_vi2(vint2 vi) {
+  return _mm_or_si128(_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm256_castsi256_si128(vi)), _mm_set1_ps(0), 0x0d)),
+         _mm_castps_si128(_mm_shuffle_ps(_mm_set1_ps(0), _mm_castsi128_ps(_mm256_extractf128_si256(vi, 1)), 0xd0)));
+}
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) {
+  return _mm256_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1);
+}
+
+static INLINE vint2 vsrl64_vi2_vi(vint2 x, int i) { return _mm256_srli_epi64(x, i); }
+static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi64(x, y); }
+static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi64(x, y); }
+static INLINE vint2 vcast_vi2_i64(long x) { return _mm256_set1_epi64x(x); }
+
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm256_cmpeq_epi64(x, y); }
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm256_add_epi64(x, y); }
+
+//
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_add_pd(x, y); }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm256_sub_pd(x, y); }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm256_mul_pd(x, y); }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm256_div_pd(x, y); }
+static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm256_div_pd(_mm256_set1_pd(1), x); }
+static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm256_sqrt_pd(x); }
+static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm256_andnot_pd(_mm256_set1_pd(-0.0), d); }
+static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm256_xor_pd(_mm256_set1_pd(-0.0), d); }
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm256_max_pd(x, y); }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm256_min_pd(x, y); }
+
+static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
+static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmadd_pd(x, y, z); }
+static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fmsub_pd(x, y, z); }
+static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmadd_pd(x, y, z); }
+static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm256_fnmsub_pd(x, y, z); }
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_EQ_OQ)); }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_NEQ_UQ)); }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LT_OQ)); }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_LE_OQ)); }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GT_OQ)); }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm256_cmp_pd(x, y, _CMP_GE_OQ)); }
+
+//
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vandnot_vi_vo_vi(vopmask m, vint y) { return _mm_andnot_si128(_mm256_castsi256_si128(m), y); }
+static INLINE vint vand_vi_vo_vi(vopmask m, vint y) { return _mm_and_si128(_mm256_castsi256_si128(m), y); }
+
+static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
+
+static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpeq_epi32(x, y)); }
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm256_castsi128_si256(_mm_cmpgt_epi32(x, y)); }
+
+static INLINE vopmask veq_cvt_vo_vi_vi(vint x, vint y) { return _mm256_cvtepi32_epi64(_mm_cmpeq_epi32(x, y)); }
+static INLINE vopmask vgt_cvt_vo_vi_vi(vint x, vint y) { return _mm256_cvtepi32_epi64(_mm_cmpgt_epi32(x, y)); }
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, _mm256_castsi256_si128(m)); }
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm256_blendv_pd(y, x, _mm256_castsi256_pd(o)); }
+static INLINE vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) { return _mm256_permutevar_pd(_mm256_set_pd(v1, v0, v1, v0), o); }
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  __m256i v = _mm256_castpd_si256(vsel_vd_vo_vd_vd(o0, _mm256_castsi256_pd(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)),
+						   vsel_vd_vo_vd_vd(o1, _mm256_castsi256_pd(_mm256_set_epi32(3, 2, 3, 2, 3, 2, 3, 2)),
+								    vsel_vd_vo_vd_vd(o2, _mm256_castsi256_pd(_mm256_set_epi32(5, 4, 5, 4, 5, 4, 5, 4)),
+										     _mm256_castsi256_pd(_mm256_set_epi32(7, 6, 7, 6, 7, 6, 7, 6))))));
+  return _mm256_castsi256_pd(_mm256_permutevar8x32_epi32(_mm256_castpd_si256(_mm256_set_pd(d3, d2, d1, d0)), v));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
+}
+
+static INLINE vopmask visinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(vabs_vd_vd(d), _mm256_set1_pd(INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask vispinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask visminf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(d, _mm256_set1_pd(-INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask visnan_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm256_cmp_pd(d, d, _CMP_NEQ_UQ));
+}
+
+#if defined(_MSC_VER)
+// This function is needed when debugging on MSVC.
+static INLINE double vcast_d_vd(vdouble v) {
+  double s[4];
+  _mm256_storeu_pd(s, v);
+  return s[0];
+}
+#endif
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return _mm256_load_pd(ptr); }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm256_loadu_pd(ptr); }
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm256_store_pd(ptr, v); }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm256_storeu_pd(ptr, v); }
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvtps_epi32(vf)); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm256_cvttps_epi32(vf)); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm256_cvtepi32_ps(vcast_vm_vi2(vi)); }
+static INLINE vfloat vcast_vf_f(float f) { return _mm256_set1_ps(f); }
+static INLINE vint2 vcast_vi2_i(int i) { return _mm256_set1_epi32(i); }
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm256_castps_si256(vf); }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm256_castsi256_ps(vm); }
+
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_add_ps(x, y); }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm256_sub_ps(x, y); }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm256_mul_ps(x, y); }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm256_div_ps(x, y); }
+static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm256_sqrt_ps(x); }
+static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
+static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
+static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm256_max_ps(x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm256_min_ps(x, y); }
+
+static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
+static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmadd_ps(x, y, z); }
+static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fmsub_ps(x, y, z); }
+static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmadd_ps(x, y, z); }
+static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm256_fnmsub_ps(x, y, z); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_EQ_OQ)); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_NEQ_UQ)); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LT_OQ)); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_LE_OQ)); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GT_OQ)); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm256_cmp_ps(x, y, _CMP_GE_OQ)); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_add_epi32(x, y); }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_sub_epi32(x, y); }
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
+
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_and_si256(x, y); }
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_andnot_si256(x, y); }
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_or_si256(x, y); }
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_xor_si256(x, y); }
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm256_slli_epi32(x, c); }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm256_srli_epi32(x, c); }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm256_srai_epi32(x, c); }
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpeq_epi32(x, y); }
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm256_cmpgt_epi32(x, y); }
+
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
+  return _mm256_blendv_epi8(y, x, m);
+}
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm256_blendv_ps(y, x, _mm256_castsi256_ps(o)); }
+
+// At this point, the following three functions are implemented in a generic way,
+// but I will try target-specific optimization later on.
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf)); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+  vdouble gt = _mm256_cmp_pd(x, lim, _CMP_GT_OQ);
+  return _mm256_testz_pd(gt, gt);
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  vfloat gt = _mm256_cmp_ps(x, lim, _CMP_GT_OQ);
+  return _mm256_testz_ps(gt, gt);
+}
+
+#ifdef _MSC_VER
+// This function is needed when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) {
+  float s[8];
+  _mm256_storeu_ps(s, v);
+  return s[0];
+}
+#endif
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return _mm256_load_ps(ptr); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm256_loadu_ps(ptr); }
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm256_store_ps(ptr, v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm256_storeu_ps(ptr, v); }
+
+//
+
+#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm256_addsub_pd(x, y); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm256_addsub_ps(x, y); }
+
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) { return  _mm256_shuffle_pd(d0, d0, (0 << 3) | (1 << 2) | (0 << 1) | (1 << 0)); }
+static INLINE vdouble vreva2_vd_vd(vdouble d0) { d0 = _mm256_permute2f128_pd(d0, d0, 1); return _mm256_shuffle_pd(d0, d0, (1 << 3) | (0 << 2) | (1 << 1) | (0 << 0)); }
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm256_stream_pd(ptr, v); }
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  _mm_store_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
+  _mm_store_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  _mm_stream_pd(&ptr[(offset + step * 0)*2], _mm256_extractf128_pd(v, 0));
+  _mm_stream_pd(&ptr[(offset + step * 1)*2], _mm256_extractf128_pd(v, 1));
+}
+
+//
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm256_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
+static INLINE vfloat vreva2_vf_vf(vfloat d0) { d0 = _mm256_permute2f128_ps(d0, d0, 1); return _mm256_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm256_stream_ps(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 0))));
+  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), _mm_castsi128_pd(_mm_castps_si128(_mm256_extractf128_ps(v, 1))));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
diff --git a/lib/kernel/sleef/arch/helperavx2_128.h b/lib/kernel/sleef/arch/helperavx2_128.h
new file mode 100644
index 0000000..a433144
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperavx2_128.h
@@ -0,0 +1,370 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#if CONFIG == 1
+
+#ifndef __AVX2__
+#error Please specify -mavx2.
+#endif
+
+#else
+#error CONFIG macro invalid or not defined
+#endif
+
+#define ENABLE_DP
+#define LOG2VECTLENDP 1
+#define VECTLENDP (1 << LOG2VECTLENDP)
+#define ENABLE_FMA_DP
+
+#define ENABLE_SP
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+#define ENABLE_FMA_SP
+
+#define FULL_FP_ROUNDING
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef __m128i vmask;
+typedef __m128i vopmask;
+
+typedef __m128d vdouble;
+typedef __m128i vint;
+
+typedef __m128  vfloat;
+typedef __m128i vint2;
+
+//
+
+static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
+
+static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
+static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
+
+//
+
+static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm);  }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm_castpd_si128(vd); }
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm_castsi128_pd(vi); }
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i const *)p); }
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
+static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
+static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_and_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_andnot_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_or_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return vreinterpret_vm_vd(_mm_xor_pd(vreinterpret_vd_vm(x), vreinterpret_vd_vm(y))); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
+static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set1_epi32(i); }
+
+static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
+
+static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
+
+static INLINE vint2 vsrl64_vi2_vi(vint2 x, int i) { return _mm_srli_epi64(x, i); }
+static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi64(x, y); }
+static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi64(x, y); }
+static INLINE vint2 vcast_vi2_i64(long x) { return _mm_set1_epi64x(x); }
+
+//
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
+static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
+static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
+static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
+static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
+
+static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
+static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmadd_pd(x, y, z); }
+static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fmsub_pd(x, y, z); }
+static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmadd_pd(x, y, z); }
+static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_fnmsub_pd(x, y, z); }
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_EQ_OQ)); }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_NEQ_UQ)); }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LT_OQ)); }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_LE_OQ)); }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GT_OQ)); }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return vreinterpret_vm_vd(_mm_cmp_pd(x, y, _CMP_GE_OQ)); }
+
+//
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
+
+static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
+
+static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
+
+static INLINE vopmask veq_cvt_vo_vi_vi(vint x, vint y) { return _mm_shuffle_epi32(_mm_cmpeq_epi32(x, y), 0xf5); }
+static INLINE vopmask vgt_cvt_vo_vi_vi(vint x, vint y) { return _mm_shuffle_epi32(_mm_cmpgt_epi32(x, y), 0xf5); }
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(o)); }
+
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmp_pd(vabs_vd_vd(d), _mm_set1_pd(INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask vispinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask visminf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmp_pd(d, _mm_set1_pd(-INFINITY), _CMP_EQ_OQ));
+}
+
+static INLINE vopmask visnan_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmp_pd(d, d, _CMP_NEQ_UQ));
+}
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
+
+#if defined(_MSC_VER)
+// This function is needed when debugging on MSVC.
+static INLINE double vcast_d_vd(vdouble v) {
+  double a[VECTLENDP];
+  vstoreu_v_p_vd(a, v);
+  return a[0];
+}
+#endif
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvtps_epi32(vf)); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm(_mm_cvttps_epi32(vf)); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
+static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
+static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
+
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return vreinterpret_vf_vm(vcast_vm_vi2(vi)); }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return vcast_vi2_vm(vreinterpret_vm_vf(vf)); }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
+static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
+static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
+static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
+static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
+
+static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
+static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmadd_ps(x, y, z); }
+static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fmsub_ps(x, y, z); }
+static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmadd_ps(x, y, z); }
+static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_fnmsub_ps(x, y, z); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_EQ_OQ)); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_NEQ_UQ)); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LT_OQ)); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_LE_OQ)); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GT_OQ)); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmp_ps(x, y, _CMP_GE_OQ)); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi32(x, y); }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi32(x, y); }
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
+
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_and_si128(x, y); }
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_andnot_si128(x, y); }
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_or_si128(x, y); }
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(vcast_vi2_vm(x), y); }
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm_srai_epi32(x, c); }
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
+
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
+  return _mm_blendv_epi8(y, x, m);
+}
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(o)); }
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf)); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+  vdouble gt = _mm_cmp_pd(x, lim, _CMP_GT_OQ);
+  return _mm_testz_pd(gt, gt);
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  vfloat gt = _mm_cmp_ps(x, lim, _CMP_GT_OQ);
+  return _mm_testz_ps(gt, gt);
+}
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
+
+#ifdef _MSC_VER
+// This function is needed when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) {
+  float a[VECTLENSP];
+  vstoreu_v_p_vf(a, v);
+  return a[0];
+}
+#endif
+
+//
+
+#define PNMASK ((vdouble) { +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0 })
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
+
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vmla_vd_vd_vd_vd(x, y, vnegpos_vd_vd(z)); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmla_vf_vf_vf_vf(x, y, vnegpos_vf_vf(z)); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
+static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
+
+//
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
+static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+}
diff --git a/lib/kernel/sleef/arch/helperavx512f.h b/lib/kernel/sleef/arch/helperavx512f.h
new file mode 100644
index 0000000..2a866e9
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperavx512f.h
@@ -0,0 +1,499 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#if CONFIG == 1
+
+#ifndef __AVX512F__
+#error Please specify -mavx512f.
+#endif
+
+#else
+#error CONFIG macro invalid or not defined
+#endif
+
+#define ENABLE_DP
+#define LOG2VECTLENDP 3
+#define VECTLENDP (1 << LOG2VECTLENDP)
+#define ENABLE_FMA_DP
+
+#define ENABLE_SP
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+#define ENABLE_FMA_SP
+
+#define FULL_FP_ROUNDING
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef __m512i vmask;
+typedef __mmask16 vopmask;
+
+typedef __m512d vdouble;
+typedef __m256i vint;
+
+typedef __m512 vfloat;
+typedef __m512i vint2;
+
+//
+
+static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
+
+#ifdef __INTEL_COMPILER
+static INLINE int vtestallones_i_vo64(vopmask g) { return _mm512_mask2int(g) == 0xff; }
+static INLINE int vtestallones_i_vo32(vopmask g) { return _mm512_mask2int(g) == 0xffff; }
+#else
+static INLINE int vtestallones_i_vo64(vopmask g) { return g == 0xff; }
+static INLINE int vtestallones_i_vo32(vopmask g) { return g == 0xffff; }
+#endif
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) { return _mm512_loadu_si512((__m512i const *)p); }
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { return _mm512_storeu_si512((__m512i *)p, v); }
+static vint vloadu_vi_p(int32_t *p) { return _mm256_loadu_si256((__m256i const *)p); }
+static void vstoreu_v_p_vi(int32_t *p, vint v) { return _mm256_storeu_si256((__m256i *)p, v); }
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm512_and_si512(x, y); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm512_andnot_si512(x, y); }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm512_or_si512(x, y); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm512_xor_si512(x, y); }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kand(x, y); }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kandn(x, y); }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kor(x, y); }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm512_kxor(x, y); }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(_mm512_set1_epi32(0), o, m, m); }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_and_epi64(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
+static INLINE vmask vor_vm_vo64_vm(vopmask o, vmask m) { return _mm512_mask_or_epi64(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m); }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)); }
+static INLINE vmask vor_vm_vo32_vm(vopmask o, vmask m) { return _mm512_mask_or_epi32(m, o, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1)); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask o) { return o; }
+static INLINE vopmask vcast_vo64_vo32(vopmask o) { return o; }
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) {
+  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+}
+
+static INLINE vint vtruncate_vi_vd(vdouble vd) {
+  return _mm512_cvt_roundpd_epi32(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+}
+
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm512_cvtepi32_pd(vi); }
+static INLINE vint vcast_vi_i(int i) { return _mm256_set1_epi32(i); }
+
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) {
+  __m256d hi = _mm512_extractf64x4_pd(vd, 1), lo = _mm512_extractf64x4_pd(vd, 0);
+  hi = _mm256_round_pd(hi, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+  lo = _mm256_round_pd(lo, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+  return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1);
+}
+
+static INLINE vdouble vrint_vd_vd(vdouble vd) {
+  __m256d hi = _mm512_extractf64x4_pd(vd, 1), lo = _mm512_extractf64x4_pd(vd, 0);
+  hi = _mm256_round_pd(hi, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+  lo = _mm256_round_pd(lo, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+  return _mm512_insertf64x4(_mm512_castpd256_pd512(lo), hi, 1);
+}
+
+static INLINE vint2 vcastu_vi2_vi(vint vi) {
+  return _mm512_maskz_permutexvar_epi32(0xaaaa, _mm512_set_epi32(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), _mm512_castsi256_si512(vi));
+}
+
+static INLINE vint vcastu_vi_vi2(vint2 vi) {
+  return _mm512_castsi512_si256(_mm512_maskz_permutexvar_epi32(0x00ff, _mm512_set_epi32(0, 0, 0, 0, 0, 0, 0, 0, 15, 13, 11, 9, 7, 5, 3, 1), vi));
+}
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm512_set_epi32(i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1, i0, i1); }
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm512_cmp_epi64_mask(x, y, _MM_CMPINT_EQ); }
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm512_add_epi64(x, y); }
+
+//
+
+static INLINE vdouble vcast_vd_d(double d) { return _mm512_set1_pd(d); }
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (__m512i)vd; }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (__m512d)vm; }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (__m512i)vd; }
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (__m512d)vi; }
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm512_add_pd(x, y); }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm512_sub_pd(x, y); }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm512_mul_pd(x, y); }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm512_div_pd(x, y); }
+static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm512_div_pd(_mm512_set1_pd(1), x); }
+static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm512_sqrt_pd(x); }
+static INLINE vdouble vabs_vd_vd(vdouble d) { return (__m512d)_mm512_andnot_si512((__m512i)_mm512_set1_pd(-0.0), (__m512i)d); }
+static INLINE vdouble vneg_vd_vd(vdouble d) { return (__m512d)_mm512_xor_si512((__m512i)_mm512_set1_pd(-0.0), (__m512i)d); }
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm512_max_pd(x, y); }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm512_min_pd(x, y); }
+
+static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
+static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmadd_pd(x, y, z); }
+static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmsub_pd(x, y, z); }
+static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmadd_pd(x, y, z); }
+static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fnmsub_pd(x, y, z); }
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_EQ_OQ); }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_NEQ_UQ); }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LT_OQ); }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_LE_OQ); }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GT_OQ); }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm512_cmp_pd_mask(x, y, _CMP_GE_OQ); }
+
+//
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm256_add_epi32(x, y); }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm256_sub_epi32(x, y); }
+static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm256_and_si256(x, y); }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm256_andnot_si256(x, y); }
+
+static INLINE vint vandnot_vi_vo_vi(vopmask o, vint y) {
+  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_castsi256_si512(y), o, _mm512_set1_epi32(0), _mm512_set1_epi32(0)));
+}
+static INLINE vint vand_vi_vo_vi(vopmask o, vint y) {
+  return _mm512_castsi512_si256(_mm512_mask_and_epi32(_mm512_set1_epi32(0), o, _mm512_castsi256_si512(y), _mm512_castsi256_si512(y)));
+}
+
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm256_or_si256(x, y); }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm256_xor_si256(x, y); }
+static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm256_slli_epi32(x, c); }
+static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm256_srli_epi32(x, c); }
+static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm256_srai_epi32(x, c); }
+
+static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm256_cmpeq_epi32(x, y); }
+static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm256_cmpgt_epi32(x, y); }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) {
+  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(x), _mm512_castsi256_si512(y), _MM_CMPINT_EQ);
+}
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) {
+  return _mm512_cmp_epi32_mask(_mm512_castsi256_si512(y), _mm512_castsi256_si512(x), _MM_CMPINT_LT);
+}
+
+static INLINE vopmask veq_cvt_vo_vi_vi(vint x, vint y) {
+  return _mm512_cmp_epi64_mask(_mm512_cvtepi32_epi64(x), _mm512_cvtepi32_epi64(y), _MM_CMPINT_EQ);
+}
+static INLINE vopmask vgt_cvt_vo_vi_vi(vint x, vint y) {
+  return _mm512_cmp_epi64_mask(_mm512_cvtepi32_epi64(y), _mm512_cvtepi32_epi64(x), _MM_CMPINT_LT);
+}
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask mask, vdouble x, vdouble y) {
+  return _mm512_mask_blend_pd(mask, y, x);
+}
+
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+#if 1
+// Probably this is faster
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  __m512i v = _mm512_castpd_si512(vsel_vd_vo_vd_vd(o0, _mm512_castsi512_pd(_mm512_set_epi64(0, 0, 0, 0, 0, 0, 0, 0)),
+						   vsel_vd_vo_vd_vd(o1, _mm512_castsi512_pd(_mm512_set_epi64(1, 1, 1, 1, 1, 1, 1, 1)),
+								    vsel_vd_vo_vd_vd(o2, _mm512_castsi512_pd(_mm512_set_epi64(2, 2, 2, 2, 2, 2, 2, 2)),
+										     _mm512_castsi512_pd(_mm512_set_epi64(3, 3, 3, 3, 3, 3, 3, 3))))));
+  return _mm512_permutexvar_pd(v, _mm512_castpd256_pd512(_mm256_set_pd(d3, d2, d1, d0)));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o1, d0, d1, d2, d2);
+}
+#else
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+#endif
+
+static INLINE vopmask visinf_vo_vd(vdouble d) {
+  return _mm512_cmp_pd_mask(vabs_vd_vd(d), _mm512_set1_pd(INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vopmask vispinf_vo_vd(vdouble d) {
+  return _mm512_cmp_pd_mask(d, _mm512_set1_pd(INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vopmask visminf_vo_vd(vdouble d) {
+  return _mm512_cmp_pd_mask(d, _mm512_set1_pd(-INFINITY), _CMP_EQ_OQ);
+}
+
+static INLINE vopmask visnan_vo_vd(vdouble d) {
+  return _mm512_cmp_pd_mask(d, d, _CMP_NEQ_UQ);
+}
+
+static INLINE vint vilogbk_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
+
+// vilogb2k_vi_vd is similar to vilogbk_vi_vd, but the argument has to
+// be a normalized FP value.
+static INLINE vint vilogb2k_vi_vd(vdouble d) { return vrint_vi_vd(_mm512_getexp_pd(d)); }
+
+static INLINE vdouble vgetexp_vd_vd(vdouble d) { return _mm512_getexp_pd(d); }
+static INLINE vfloat vgetexp_vf_vf(vfloat d) { return _mm512_getexp_ps(d); }
+
+static INLINE vdouble vgetmant_vd_vd(vdouble d) { return _mm512_getmant_pd(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
+static INLINE vfloat vgetmant_vf_vf(vfloat d) { return _mm512_getmant_ps(d, _MM_MANT_NORM_p75_1p5, _MM_MANT_SIGN_nan); }
+
+#if defined(__clang__)
+#define vfixup_vd_vd_vd_vi2_i(a, b, c, imm) ({ _mm512_fixupimm_pd((a), (b), (c), (imm)); })
+#define vfixup_vf_vf_vf_vi2_i(a, b, c, imm) ({ _mm512_fixupimm_ps((a), (b), (c), (imm)); })
+#else
+static INLINE vdouble vfixup_vd_vd_vd_vi2_i(vdouble a, vdouble b, vint2 c, int imm) { return _mm512_fixupimm_pd(a, b, c, imm); }
+static INLINE vfloat vfixup_vf_vf_vf_vi2_i(vfloat a, vfloat b, vint2 c, int imm) { return _mm512_fixupimm_ps(a, b, c, imm); }
+#endif
+
+#if defined(_MSC_VER)
+// This function is needed when debugging on MSVC.
+static INLINE double vcast_d_vd(vdouble v) {
+  double s[VECTLENDP];
+  _mm512_storeu_pd(s, v);
+  return s[0];
+}
+#endif
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return _mm512_load_pd(ptr); }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm512_loadu_pd(ptr); }
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm512_store_pd(ptr, v); }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm512_storeu_pd(ptr, v); }
+
+//
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
+  return _mm512_castsi512_si256(_mm512_mask_blend_epi32(m, _mm512_castsi256_si512(y), _mm512_castsi256_si512(x)));
+}
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
+
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm512_cvtepi32_ps((vmask)vcast_vm_vi2(vi)); }
+static INLINE vfloat vcast_vf_f(float f) { return _mm512_set1_ps(f); }
+static INLINE vint2 vcast_vi2_i(int i) { return _mm512_set1_epi32(i); }
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm512_cvtps_epi32(vf)); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcast_vi2_vm((vmask)_mm512_cvttps_epi32(vf)); }
+
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) {
+  __m256 hi = (__m256)_mm512_extractf64x4_pd((__m512d)vd, 1), lo = (__m256)_mm512_extractf64x4_pd((__m512d)vd, 0);
+  hi = _mm256_round_ps(hi, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+  lo = _mm256_round_ps(lo, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC);
+  return (__m512)_mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)lo), (__m256d)hi, 1);
+}
+static INLINE vfloat vrint_vf_vf(vfloat vd) {
+  __m256 hi = (__m256)_mm512_extractf64x4_pd((__m512d)vd, 1), lo = (__m256)_mm512_extractf64x4_pd((__m512d)vd, 0);
+  hi = _mm256_round_ps(hi, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+  lo = _mm256_round_ps(lo, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC);
+  return (__m512)_mm512_insertf64x4(_mm512_castpd256_pd512((__m256d)lo), (__m256d)hi, 1);
+}
+
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
+
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm512_add_ps(x, y); }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm512_sub_ps(x, y); }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm512_mul_ps(x, y); }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm512_div_ps(x, y); }
+static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm512_sqrt_ps(x); }
+static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
+static INLINE vfloat vneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)d); }
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
+static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm512_max_ps(x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm512_min_ps(x, y); }
+
+static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
+static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmadd_ps(x, y, z); }
+static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmsub_ps(x, y, z); }
+static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmadd_ps(x, y, z); }
+static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fnmsub_ps(x, y, z); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_EQ_OQ); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_NEQ_UQ); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LT_OQ); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_LE_OQ); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GT_OQ); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return _mm512_cmp_ps_mask(x, y, _CMP_GE_OQ); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi32(x, y); }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi32(x, y); }
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_and_si512(x, y); }
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_andnot_si512(x, y); }
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_or_si512(x, y); }
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_xor_si512(x, y); }
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask o, vint2 m) {
+  return _mm512_mask_and_epi32(_mm512_set1_epi32(0), o, m, m);
+}
+
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask o, vint2 m) {
+  return _mm512_mask_and_epi32(m, o, _mm512_set1_epi32(0), _mm512_set1_epi32(0));
+}
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return _mm512_slli_epi32(x, c); }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return _mm512_srli_epi32(x, c); }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return _mm512_srai_epi32(x, c); }
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpeq_epi32_mask(x, y); }
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm512_cmpgt_epi32_mask(x, y); }
+
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) {
+  __mmask16 m = _mm512_cmp_epi32_mask(x, y, _MM_CMPINT_EQ);
+  return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
+}
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) {
+  __mmask16 m = _mm512_cmp_epi32_mask(y, x, _MM_CMPINT_LT);
+  return _mm512_mask_and_epi32(_mm512_set1_epi32(0), m, _mm512_set1_epi32(-1), _mm512_set1_epi32(-1));
+}
+
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
+  return _mm512_mask_blend_epi32(m, y, x);
+}
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) {
+  return _mm512_mask_blend_ps(m, y, x);
+}
+
+static INLINE vint2 vsrl64_vi2_vi(vint2 x, int i) { return _mm512_srli_epi64(x, i); }
+static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_add_epi64(x, y); }
+static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm512_sub_epi64(x, y); }
+static INLINE vint2 vcast_vi2_i64(long x) { return _mm512_set1_epi64(x); }
+
+// At this point, the following three functions are implemented in a generic way,
+// but I will try target-specific optimization later on.
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf)); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+static INLINE vint2 vilogbk_vi2_vf(vfloat d) { return vrint_vi2_vf(_mm512_getexp_ps(d)); }
+
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+  return vgt_vo_vd_vd(x, lim) == 0;
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  return vgt_vo_vf_vf(x, lim) == 0;
+}
+
+
+#ifdef _MSC_VER
+// This function is needed when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) {
+  float s[VECTLENSP];
+  _mm512_storeu_ps(s, v);
+  return s[0];
+}
+#endif
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return _mm512_load_ps(ptr); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm512_loadu_ps(ptr); }
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm512_store_ps(ptr, v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm512_storeu_ps(ptr, v); }
+
+//
+
+#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 })
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)vxor_vm_vm_vm((vmask)d, (vmask)PNMASK); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)vxor_vm_vm_vm((vmask)d, (vmask)NPMASK); }
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
+
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm512_fmaddsub_pd(x, y, z); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm512_fmaddsub_ps(x, y, z); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble vd) {
+  return (vdouble)_mm512_permutexvar_epi32(_mm512_set_epi32(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2), (__m512i)vd);
+}
+
+static INLINE vdouble vreva2_vd_vd(vdouble vd) {
+  return (vdouble)_mm512_permutexvar_epi32(_mm512_set_epi32(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12), (__m512i)vd);
+}
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm512_stream_pd(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  _mm_store_pd(&ptr[(offset + step * 0)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 0));
+  _mm_store_pd(&ptr[(offset + step * 1)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 1));
+  _mm_store_pd(&ptr[(offset + step * 2)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 2));
+  _mm_store_pd(&ptr[(offset + step * 3)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 3));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  _mm_stream_pd(&ptr[(offset + step * 0)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 0));
+  _mm_stream_pd(&ptr[(offset + step * 1)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 1));
+  _mm_stream_pd(&ptr[(offset + step * 2)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 2));
+  _mm_stream_pd(&ptr[(offset + step * 3)*2], (__m128d)_mm512_extractf32x4_ps((__m512)v, 3));
+}
+
+//
+
+static INLINE vfloat vrev21_vf_vf(vfloat vf) {
+  return (vfloat)_mm512_permutexvar_epi32(_mm512_set_epi32(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1), (__m512i)vf);
+}
+
+static INLINE vfloat vreva2_vf_vf(vfloat vf) {
+  return (vfloat)_mm512_permutexvar_epi32(_mm512_set_epi32(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), (__m512i)vf);
+}
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm512_stream_ps(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), (__m128d)_mm512_extractf32x4_ps(v, 0));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), (__m128d)_mm512_extractf32x4_ps(v, 0));
+  _mm_storel_pd((double *)(ptr+(offset + step * 2)*2), (__m128d)_mm512_extractf32x4_ps(v, 1));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 3)*2), (__m128d)_mm512_extractf32x4_ps(v, 1));
+  _mm_storel_pd((double *)(ptr+(offset + step * 4)*2), (__m128d)_mm512_extractf32x4_ps(v, 2));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 5)*2), (__m128d)_mm512_extractf32x4_ps(v, 2));
+  _mm_storel_pd((double *)(ptr+(offset + step * 6)*2), (__m128d)_mm512_extractf32x4_ps(v, 3));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 7)*2), (__m128d)_mm512_extractf32x4_ps(v, 3));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
diff --git a/lib/kernel/sleef/arch/helperneon32.h b/lib/kernel/sleef/arch/helperneon32.h
new file mode 100644
index 0000000..64ac93e
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperneon32.h
@@ -0,0 +1,244 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#ifndef __ARM_NEON
+#error Please specify -mfpu=neon.
+#endif
+
+#ifdef __aarch64__
+#warning This implementation is for AARCH32.
+#endif
+
+#define ENABLE_SP
+#define LOG2VECTLENSP 2
+#define VECTLENSP (1 << LOG2VECTLENSP)
+
+#define ISANAME "AARCH32 NEON"
+#define DFTPRIORITY 10
+
+#define ENABLE_RECSQRT_SP
+
+#include <arm_neon.h>
+
+typedef uint32x4_t vmask;
+typedef uint32x4_t vopmask;
+
+//typedef int32x4_t vint;
+
+typedef float32x4_t vfloat;
+typedef int32x4_t vint2;
+
+//
+
+static INLINE void vprefetch_v_p(const void *ptr) { }
+
+static INLINE int vtestallones_i_vo32(vopmask g) {
+  uint32x2_t x0 = vand_u32(vget_low_u32(g), vget_high_u32(g));
+  uint32x2_t x1 = vpmin_u32(x0, x0);
+  return vget_lane_u32(x1, 0);
+}
+
+static vfloat vloaduf(float *p) { return vld1q_f32(p); }
+static void vstoreuf(float *p, vfloat v) { vst1q_f32(p, v); }
+
+static vint2 vloadu_vi2_p(int32_t *p) { return vld1q_s32(p); }
+static void vstoreu_p_vi2(int32_t *p, vint2 v) { vst1q_s32(p, v); }
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return vandq_u32(x, y); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return vbicq_u32(y, x); }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return vorrq_u32(x, y); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return veorq_u32(x, y); }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return vandq_u32(x, y); }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return vbicq_u32(y, x); }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return vorrq_u32(x, y); }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return veorq_u32(x, y); }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
+static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return vandq_u32(x, y); }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return vbicq_u32(y, x); }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return vorrq_u32(x, y); }
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return veorq_u32(x, y); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask m) { return vuzpq_u32(m, m).val[0]; }
+static INLINE vopmask vcast_vo64_vo32(vopmask m) { return vzipq_u32(m, m).val[0]; }
+
+//
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) { return (vmask)vdupq_n_u64((uint64_t)i0 | (((uint64_t)i1) << 32)); }
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
+  uint32x4_t t = vceqq_u32(x, y);
+  return vandq_u32(t, vrev64q_u32(t));
+}
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
+static INLINE vint2 vrint_vi2_vf(vfloat d) {
+  return vcvtq_s32_f32(vaddq_f32(d, (float32x4_t)vorrq_u32(vandq_u32((uint32x4_t)d, (uint32x4_t)vdupq_n_f32(-0.0f)), (uint32x4_t)vdupq_n_f32(0.5f))));
+}
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return vcvtq_s32_f32(vf); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return vcvtq_f32_s32(vi); }
+
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
+
+static INLINE vfloat vcast_vf_f(float f) { return vdupq_n_f32(f); }
+static INLINE vint2 vcast_vi2_i(int i) { return vdupq_n_s32(i); }
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return (vfloat)vm; }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return vaddq_f32(x, y); }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return vsubq_f32(x, y); }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return vmulq_f32(x, y); }
+
+static INLINE vfloat vdiv_vf_vf_vf(vfloat n, vfloat d) {
+  float32x4_t x = vrecpeq_f32(d);
+  x = vmulq_f32(x, vrecpsq_f32(d, x));
+  float32x4_t t = vmulq_f32(n, x);
+  return vmlsq_f32(vaddq_f32(t, t), vmulq_f32(t, x), d);
+}
+
+static INLINE vfloat vrec_vf_vf(vfloat d) {
+  float32x4_t x = vrecpeq_f32(d);
+  x = vmulq_f32(x, vrecpsq_f32(d, x));
+  return vmlsq_f32(vaddq_f32(x, x), vmulq_f32(x, x), d);
+}
+
+static INLINE vfloat vsqrt_vf_vf(vfloat d) {
+  float32x4_t x = vrsqrteq_f32(d);
+  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
+  float32x4_t u = vmulq_f32(x, d);
+  u = vmlaq_f32(u, vmlsq_f32(d, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
+  return (float32x4_t)vbicq_u32((uint32x4_t)u, vceqq_f32(d, vdupq_n_f32(0.0f)));
+}
+
+static INLINE vfloat vrecsqrt_vf_vf(vfloat d) {
+  float32x4_t x = vrsqrteq_f32(d);
+  x = vmulq_f32(x, vrsqrtsq_f32(d, vmulq_f32(x, x)));
+  return vmlaq_f32(x, vmlsq_f32(vdupq_n_f32(1), x, vmulq_f32(x, d)), vmulq_f32(x, vdupq_n_f32(0.5)));
+}
+
+static INLINE vfloat vabs_vf_vf(vfloat f) { return vabsq_f32(f); }
+static INLINE vfloat vneg_vf_vf(vfloat f) { return vnegq_f32(f); }
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlaq_f32(z, x, y); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vmlsq_f32(z, x, y); }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vmaxq_f32(x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vminq_f32(x, y); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vceqq_f32(x, y); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vmvnq_u32(vceqq_f32(x, y)); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vcltq_f32(x, y); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vcleq_f32(x, y); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vcgtq_f32(x, y); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vcgeq_f32(x, y); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vaddq_s32(x, y); }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsubq_s32(x, y); }
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vnegq_s32(e); }
+
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vandq_s32(x, y); }
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vbicq_s32(y, x); }
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vorrq_s32(x, y); }
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return veorq_s32(x, y); }
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vandq_u32(x, (vopmask)y); }
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)vbicq_u32((vopmask)y, x); }
+
+#if defined(__clang__)
+#define vsll_vi2_vi2_i(x, c) vshlq_n_s32(x, c)
+#define vsrl_vi2_vi2_i(x, c) vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(x), c))
+#define vsra_vi2_vi2_i(x, c) vshrq_n_s32(x, c)
+#else
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return (int32x4_t) vshlq_n_u32((uint32x4_t)x, c); }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return (int32x4_t) vshrq_n_u32((uint32x4_t)x, c); }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vshrq_n_s32(x, c); }
+#endif
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return vceqq_s32(x, y); }
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return vcgeq_s32(x, y); }
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vceqq_s32(x, y); }
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return (vint2)vcgeq_s32(x, y); }
+
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return (vint2)vbslq_u32(m, (vmask)x, (vmask)y); }
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
+  return (vfloat)vbslq_u32(mask, (vmask)x, (vmask)y);
+}
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf)); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+// This function is needed when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) {
+  float p[4];
+  vst1q_f32 (p, v);
+  return p[0];
+}
+
+static INLINE int vavailability_i(int name) {
+  if (name != 2) return 0;
+  return vcast_f_vf(vadd_vf_vf_vf(vcast_vf_f(name), vcast_vf_f(name))) != 0.0;
+}
+
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return vld1q_f32(__builtin_assume_aligned(ptr, 16)); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return vld1q_f32(ptr); }
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { vst1q_f32(__builtin_assume_aligned(ptr, 16), v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { vst1q_f32(ptr, v); }
+
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)PNMASKf); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)vxor_vm_vm_vm((vmask)d, (vmask)NPMASKf); }
+
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat d0, vfloat d1) { return vadd_vf_vf_vf(d0, vnegpos_vf_vf(d1)); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) { return vrev64q_f32(d0); }
+static INLINE vfloat vreva2_vf_vf(vfloat d0) { return vcombine_f32(vget_high_f32(d0), vget_low_f32(d0)); }
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { vstore_v_p_vf(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
+  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  vst1_f32((float *)(ptr+(offset + step * 0)*2), vget_low_f32(v));
+  vst1_f32((float *)(ptr+(offset + step * 1)*2), vget_high_f32(v));
+}
+
+// TODO
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  return 0;
+}
diff --git a/lib/kernel/sleef/arch/helperpurec.h b/lib/kernel/sleef/arch/helperpurec.h
new file mode 100644
index 0000000..1cd220b
--- /dev/null
+++ b/lib/kernel/sleef/arch/helperpurec.h
@@ -0,0 +1,540 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#include <stdint.h>
+#include <math.h>
+#include "misc.h"
+
+#ifndef CONFIG
+#error CONFIG macro not defined
+#endif
+
+#define ENABLE_DP
+#define ENABLE_SP
+
+#define LOG2VECTLENDP CONFIG
+#define VECTLENDP (1 << LOG2VECTLENDP)
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+
+#define DFTPRIORITY LOG2VECTLENDP
+#define ISANAME "Pure C Array"
+
+typedef union {
+  uint32_t u[VECTLENDP*2];
+  uint64_t x[VECTLENDP];
+  double d[VECTLENDP];
+  float f[VECTLENDP*2];
+  int32_t i[VECTLENDP*2];
+} versatileVector;
+
+typedef versatileVector vmask;
+typedef versatileVector vopmask;
+typedef versatileVector vdouble;
+typedef versatileVector vint;
+typedef versatileVector vfloat;
+typedef versatileVector vint2;
+
+typedef union {
+  uint8_t u[sizeof(long double)*VECTLENDP];
+  long double ld[VECTLENDP];
+} longdoubleVector;
+
+typedef longdoubleVector vmaskl;
+typedef longdoubleVector vlongdouble;
+
+#ifdef Sleef_quad2_DEFINED
+typedef union {
+  uint8_t u[sizeof(Sleef_quad)*VECTLENDP];
+  Sleef_quad q[VECTLENDP];
+} quadVector;
+
+typedef quadVector vmaskq;
+typedef quadVector vquad;
+#endif
+
+//
+
+static INLINE int vavailability_i(int name) { return -1; }
+static INLINE void vprefetch_v_p(const void *ptr) { }
+
+static INLINE int vtestallones_i_vo64(vopmask g) {
+  int ret = 1; for(int i=0;i<VECTLENDP;i++) ret = ret && g.x[i]; return ret;
+}
+
+static INLINE int vtestallones_i_vo32(vopmask g) {
+  int ret = 1; for(int i=0;i<VECTLENSP;i++) ret = ret && g.u[i]; return ret;
+}
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) {
+  vint2 vi;
+  for(int i=0;i<VECTLENSP;i++) vi.i[i] = p[i];
+  return vi;
+}
+
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
+  for(int i=0;i<VECTLENSP;i++) p[i] = v.i[i];
+}
+
+static vint vloadu_vi_p(int32_t *p) {
+  vint vi;
+  for(int i=0;i<VECTLENDP;i++) vi.i[i] = p[i];
+  return vi;
+}
+
+static void vstoreu_v_p_vi(int32_t *p, vint v) {
+  for(int i=0;i<VECTLENDP;i++) p[i] = v.i[i];
+}
+
+//
+
+static INLINE vopmask vcast_vo32_vo64(vopmask m) {
+  vopmask ret;
+  for(int i=0;i<VECTLENDP;i++) ret.u[i] = m.u[i*2+1];
+  for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.u[i] = 0;
+  return ret;
+}
+
+static INLINE vopmask vcast_vo64_vo32(vopmask m) {
+  vopmask ret;
+  for(int i=0;i<VECTLENDP;i++) ret.u[i*2] = ret.u[i*2+1] = m.u[i];
+  return ret;
+}
+
+static INLINE vmask vcast_vm_i_i(int h, int l) {
+  vmask ret;
+  for(int i=0;i<VECTLENDP;i++) {
+    ret.u[i*2+0] = l;
+    ret.u[i*2+1] = h;
+  }
+  return ret;
+}
+
+static INLINE vint2 vcastu_vi2_vi(vint vi) {
+  vint2 ret;
+  for(int i=0;i<VECTLENDP;i++) {
+    ret.i[i*2+0] = 0;
+    ret.i[i*2+1] = vi.i[i];
+  }
+  return ret;
+}
+
+static INLINE vint vcastu_vi_vi2(vint2 vi2) {
+  vint ret;
+  for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i*2+1];
+  return ret;
+}
+
+static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
+  vint ret;
+  for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi2.i[i];
+  return ret;
+}
+
+static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
+  vint2 ret;
+  for(int i=0;i<VECTLENDP;i++) ret.i[i] = vi.i[i];
+  for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret.i[i] = 0;
+  return ret;
+}
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) {
+  vdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r.d[i*2+0] = d0.d[i*2+1];
+    r.d[i*2+1] = d0.d[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vdouble vreva2_vd_vd(vdouble d0) {
+  vdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r.d[i*2+0] = d0.d[(VECTLENDP/2-1-i)*2+0];
+    r.d[i*2+1] = d0.d[(VECTLENDP/2-1-i)*2+1];
+  }
+  return r;
+}
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) {
+  vfloat r;
+  for(int i=0;i<VECTLENSP/2;i++) {
+    r.f[i*2+0] = d0.f[i*2+1];
+    r.f[i*2+1] = d0.f[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vfloat vreva2_vf_vf(vfloat d0) {
+  vfloat r;
+  for(int i=0;i<VECTLENSP/2;i++) {
+    r.f[i*2+0] = d0.f[(VECTLENSP/2-1-i)*2+0];
+    r.f[i*2+1] = d0.f[(VECTLENSP/2-1-i)*2+1];
+  }
+  return r;
+}
+
+static INLINE vdouble vcast_vd_d(double d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = d; return ret; }
+
+//
+
+static INLINE vopmask vand_vo_vo_vo   (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] &  y.u[i]; return ret; }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
+static INLINE vopmask vor_vo_vo_vo    (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] |  y.u[i]; return ret; }
+static INLINE vopmask vxor_vo_vo_vo   (vopmask x, vopmask y) { vopmask ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^  y.u[i]; return ret; }
+
+static INLINE vmask vand_vm_vm_vm     (vmask x, vmask y)     { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] &  y.u[i]; return ret; }
+static INLINE vmask vandnot_vm_vm_vm  (vmask x, vmask y)     { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
+static INLINE vmask vor_vm_vm_vm      (vmask x, vmask y)     { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] |  y.u[i]; return ret; }
+static INLINE vmask vxor_vm_vm_vm     (vmask x, vmask y)     { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^  y.u[i]; return ret; }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y)      { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] &  y.u[i]; return ret; }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y)   { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y)       { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] |  y.u[i]; return ret; }
+static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y)      { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^  y.u[i]; return ret; }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y)      { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] &  y.u[i]; return ret; }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y)   { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = y.u[i] & ~x.u[i]; return ret; }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y)       { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] |  y.u[i]; return ret; }
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y)      { vmask   ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = x.u[i] ^  y.u[i]; return ret; }
+
+//
+
+static INLINE vdouble vsel_vd_vo_vd_vd   (vopmask o, vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
+static INLINE vint2   vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y)     { vint2 ret;   for(int i=0;i<VECTLENDP*2;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
+
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+
+static INLINE vdouble vcast_vd_vi(vint vi) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = vi.i[i]; return ret; }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = (int)vd.d[i]; return ret; }
+static INLINE vint vrint_vi_vd(vdouble vd) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = vd.d[i] > 0 ? (int)(vd.d[i] + 0.5) : (int)(vd.d[i] - 0.5); return ret; }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
+static INLINE vint vcast_vi_i(int j) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = j; return ret; }
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] == y.x[i] ? -1 : 0; return ret; }
+static INLINE vmask veq64_vo_vm_vm(vmask x, vmask y) { vmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.x[i] + y.x[i]; return ret; }
+
+//
+
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { union { vdouble vd; vmask vm; } cnv; cnv.vd = vd; return cnv.vm; }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { union { vdouble vd; vint2 vi2; } cnv; cnv.vd = vd; return cnv.vi2; }
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { union { vint2 vi2; vdouble vd; } cnv; cnv.vi2 = vi; return cnv.vd; }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { union { vmask vm; vdouble vd; } cnv; cnv.vm = vm; return cnv.vd; }
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] + y.d[i]; return ret; }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] - y.d[i]; return ret; }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i]; return ret; }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] / y.d[i]; return ret; }
+static INLINE vdouble vrec_vd_vd(vdouble x)               { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = 1.0 / x.d[i];    return ret; }
+
+static INLINE vdouble vabs_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.x[i] & 0x7fffffffffffffffULL; return ret; }
+static INLINE vdouble vneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = -d.d[i]; return ret; }
+static INLINE vdouble vmla_vd_vd_vd_vd  (vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] + z.d[i]; return ret; }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] * y.d[i] - z.d[i]; return ret; }
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] > y.d[i] ? x.d[i] : y.d[i]; return ret; }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = x.d[i] < y.d[i] ? x.d[i] : y.d[i]; return ret; }
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ?  d.d[i] : -d.d[i]; return ret; }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? -d.d[i] :  d.d[i]; return ret; }
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = (i & 1) == 0 ? x.d[i] - y.d[i] : x.d[i] + y.d[i]; return ret; }
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] == y.d[i] ? -1 : 0; return ret; }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] != y.d[i] ? -1 : 0; return ret; }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] <  y.d[i] ? -1 : 0; return ret; }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] <= y.d[i] ? -1 : 0; return ret; }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] >  y.d[i] ? -1 : 0; return ret; }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = x.d[i] >= y.d[i] ? -1 : 0; return ret; }
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
+static INLINE vint vneg_vi_vi   (vint x)         { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = -x.i[i];         return ret; }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y)    { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] &  y.i[i]; return ret; }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
+static INLINE vint vor_vi_vi_vi(vint x, vint y)     { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] |  y.i[i]; return ret; }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y)    { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] ^  y.i[i]; return ret; }
+
+static INLINE vint vand_vi_vo_vi(vopmask x, vint y)    { return vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
+static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(x), y); }
+
+static INLINE vint vsll_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] << c; return ret; }
+static INLINE vint vsrl_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
+static INLINE vint vsra_vi_vi_i(vint x, int c) { vint ret; for(int i=0;i<VECTLENDP;i++) ret.i[i] = x.i[i] >> c; return ret; }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.u[i] = x.i[i] >  y.i[i] ? -1 : 0; return ret; }
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
+  union { vopmask vo; vint2 vi2; } cnv;
+  cnv.vo = m;
+  return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), x),
+		      vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2(cnv.vi2), y));
+}
+
+static INLINE vopmask visinf_vo_vd(vdouble d)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = (d.d[i] == INFINITY || d.d[i] == -INFINITY) ? -1 : 0; return ret; }
+static INLINE vopmask vispinf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == INFINITY ? -1 : 0; return ret; }
+static INLINE vopmask visminf_vo_vd(vdouble d) { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] == -INFINITY ? -1 : 0; return ret; }
+static INLINE vopmask visnan_vo_vd(vdouble d)  { vopmask ret; for(int i=0;i<VECTLENDP;i++) ret.x[i] = d.d[i] != d.d[i] ? -1 : 0; return ret; }
+
+static INLINE vdouble vsqrt_vd_vd(vdouble d) { vdouble ret; for(int i=0;i<VECTLENDP;i++) ret.d[i] = sqrt(d.d[i]); return ret; }
+
+#if defined(_MSC_VER)
+// This function is needed when debugging on MSVC.
+static INLINE double vcast_d_vd(vdouble v) { return v.d[0]; }
+#endif
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { vdouble vd; for(int i=0;i<VECTLENDP;i++) vd.d[i] = ptr[i]; return vd; }
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { for(int i=0;i<VECTLENDP;i++) ptr[i] = v.d[i]; }
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  for(int i=0;i<VECTLENDP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v.d[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v.d[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { union { vint2 vi2; vmask vm; } cnv; cnv.vm = vm; return cnv.vi2; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { union { vint2 vi2; vmask vm; } cnv; cnv.vi2 = vi; return cnv.vm; }
+
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = vi.i[i]; return ret; }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = (int)vf.f[i]; return ret; }
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = vf.f[i] > 0 ? (int)(vf.f[i] + 0.5) : (int)(vf.f[i] - 0.5); return ret; }
+static INLINE vint2 vcast_vi2_i(int j) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = j; return ret; }
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
+
+static INLINE vfloat vcast_vf_f(float f) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = f; return ret; }
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { union { vfloat vf; vmask vm; } cnv; cnv.vf = vf; return cnv.vm; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { union { vfloat vf; vmask vm; } cnv; cnv.vm = vm; return cnv.vf; }
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { union { vfloat vf; vint2 vi2; } cnv; cnv.vi2 = vi; return cnv.vf; }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { union { vfloat vf; vint2 vi2; } cnv; cnv.vf = vf; return cnv.vi2; }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] + y.f[i]; return ret; }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] - y.f[i]; return ret; }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i]; return ret; }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] / y.f[i]; return ret; }
+static INLINE vfloat vrec_vf_vf   (vfloat x)           { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = 1.0    / x.f[i]; return ret; }
+
+static INLINE vfloat vabs_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] & 0x7fffffff; return ret; }
+static INLINE vfloat vneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = -x.f[i]; return ret; }
+static INLINE vfloat vmla_vf_vf_vf_vf  (vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] + z.f[i]; return ret; }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] * y.f[i] - z.f[i]; return ret; }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] > y.f[i] ? x.f[i] : y.f[i]; return ret; }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = x.f[i] < y.f[i] ? x.f[i] : y.f[i]; return ret; }
+
+static INLINE vfloat vposneg_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ?  x.f[i] : -x.f[i]; return ret; }
+static INLINE vfloat vnegpos_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? -x.f[i] :  x.f[i]; return ret; }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = (i & 1) == 0 ? x.f[i] - y.f[i] : x.f[i] + y.f[i]; return ret; }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y)  { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] == y.f[i]) ? -1 : 0); return ret; }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] != y.f[i]) ? -1 : 0); return ret; }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y)  { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] <  y.f[i]) ? -1 : 0); return ret; }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y)  { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] <= y.f[i]) ? -1 : 0); return ret; }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y)  { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] >  y.f[i]) ? -1 : 0); return ret; }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y)  { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = ((x.f[i] >= y.f[i]) ? -1 : 0); return ret; }
+
+static INLINE vint vadd_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] + y.i[i]; return ret; }
+static INLINE vint vsub_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] - y.i[i]; return ret; }
+static INLINE vint vneg_vi2_vi2(vint x)             { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = -x.i[i]; return ret; }
+
+static INLINE vint vand_vi2_vi2_vi2(vint x, vint y)    { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] &  y.i[i]; return ret; }
+static INLINE vint vandnot_vi2_vi2_vi2(vint x, vint y) { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = y.i[i] & ~x.i[i]; return ret; }
+static INLINE vint vor_vi2_vi2_vi2(vint x, vint y)     { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] |  y.i[i]; return ret; }
+static INLINE vint vxor_vi2_vi2_vi2(vint x, vint y)    { vint ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] ^  y.i[i]; return ret; }
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (o.u[i] & x.u[i]) | (y.u[i] & ~o.u[i]); return ret; }
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) {
+  union { vopmask vo; vint2 vi2; } cnv;
+  cnv.vo = x;
+  return vand_vi2_vi2_vi2(cnv.vi2, y);
+}
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi2_vi2_vi2(x, y); }
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] << c; return ret; }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = ((uint32_t)x.i[i]) >> c; return ret; }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { vint2 ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] >> c; return ret; }
+
+static INLINE vopmask visinf_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = (d.f[i] == INFINITYf || d.f[i] == -INFINITYf) ? -1 : 0; return ret; }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == INFINITYf ? -1 : 0; return ret; }
+static INLINE vopmask visminf_vo_vf(vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] == -INFINITYf ? -1 : 0; return ret; }
+static INLINE vopmask visnan_vo_vf (vfloat d) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = d.f[i] != d.f[i] ? -1 : 0; return ret; }
+
+static INLINE vopmask veq_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
+static INLINE vopmask vgt_vo_vi2_vi2 (vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.u[i] = x.i[i] >  y.i[i] ? -1 : 0; return ret; }
+static INLINE vint2   veq_vi2_vi2_vi2(vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] == y.i[i] ? -1 : 0; return ret; }
+static INLINE vint2   vgt_vi2_vi2_vi2(vint2 x, vint2 y) { vopmask ret; for(int i=0;i<VECTLENSP;i++) ret.i[i] = x.i[i] >  y.i[i] ? -1 : 0; return ret; }
+
+static INLINE vfloat vsqrt_vf_vf(vfloat x) { vfloat ret; for(int i=0;i<VECTLENSP;i++) ret.f[i] = sqrtf(x.f[i]); return ret; }
+
+#ifdef _MSC_VER
+// This function is needed when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) { return v.f[0]; }
+#endif
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
+static INLINE vfloat vloadu_vf_p(const float *ptr) {
+  vfloat vf;
+  for(int i=0;i<VECTLENSP;i++) vf.f[i] = ptr[i];
+  return vf;
+}
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
+  for(int i=0;i<VECTLENSP;i++) ptr[i] = v.f[i];
+}
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  for(int i=0;i<VECTLENSP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v.f[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v.f[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
+
+//
+
+static INLINE vlongdouble vcast_vl_l(long double d) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = d; return ret; }
+
+static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
+  vlongdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r.ld[i*2+0] = d0.ld[i*2+1];
+    r.ld[i*2+1] = d0.ld[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
+  vlongdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r.ld[i*2+0] = d0.ld[(VECTLENDP/2-1-i)*2+0];
+    r.ld[i*2+1] = d0.ld[(VECTLENDP/2-1-i)*2+1];
+  }
+  return r;
+}
+
+static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] + y.ld[i]; return ret; }
+static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] - y.ld[i]; return ret; }
+static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = x.ld[i] * y.ld[i]; return ret; }
+
+static INLINE vlongdouble vneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = -x.ld[i]; return ret; }
+static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? x.ld[i] - y.ld[i] : x.ld[i] + y.ld[i]; return ret; }
+static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
+static INLINE vlongdouble vposneg_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ?  x.ld[i] : -x.ld[i]; return ret; }
+static INLINE vlongdouble vnegpos_vl_vl(vlongdouble x) { vlongdouble ret; for(int i=0;i<VECTLENDP;i++) ret.ld[i] = (i & 1) == 0 ? -x.ld[i] :  x.ld[i]; return ret; }
+
+static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
+static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
+  vlongdouble vd;
+  for(int i=0;i<VECTLENDP;i++) vd.ld[i] = ptr[i];
+  return vd;
+}
+
+static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
+static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
+  for(int i=0;i<VECTLENDP;i++) ptr[i] = v.ld[i];
+}
+static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
+  for(int i=0;i<VECTLENDP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v.ld[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v.ld[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
+
+#ifdef Sleef_quad2_DEFINED
+static INLINE vquad vcast_vq_q(Sleef_quad d) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = d; return ret; }
+
+static INLINE vquad vrev21_vq_vq(vquad d0) {
+  vquad r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r.q[i*2+0] = d0.q[i*2+1];
+    r.q[i*2+1] = d0.q[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vquad vreva2_vq_vq(vquad d0) {
+  vquad r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r.q[i*2+0] = d0.q[(VECTLENDP/2-1-i)*2+0];
+    r.q[i*2+1] = d0.q[(VECTLENDP/2-1-i)*2+1];
+  }
+  return r;
+}
+
+static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] + y.q[i]; return ret; }
+static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] - y.q[i]; return ret; }
+static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = x.q[i] * y.q[i]; return ret; }
+
+static INLINE vquad vneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = -x.q[i]; return ret; }
+static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? x.q[i] - y.q[i] : x.q[i] + y.q[i]; return ret; }
+static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
+static INLINE vquad vposneg_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ?  x.q[i] : -x.q[i]; return ret; }
+static INLINE vquad vnegpos_vq_vq(vquad x) { vquad ret; for(int i=0;i<VECTLENDP;i++) ret.q[i] = (i & 1) == 0 ? -x.q[i] :  x.q[i]; return ret; }
+
+static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
+static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
+  vquad vd;
+  for(int i=0;i<VECTLENDP;i++) vd.q[i] = ptr[i];
+  return vd;
+}
+
+static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
+static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
+  for(int i=0;i<VECTLENDP;i++) ptr[i] = v.q[i];
+}
+static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
+  for(int i=0;i<VECTLENDP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v.q[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v.q[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
+#endif
diff --git a/lib/kernel/sleef/arch/helpers.h b/lib/kernel/sleef/arch/helpers.h
new file mode 100644
index 0000000..bb0c4a8
--- /dev/null
+++ b/lib/kernel/sleef/arch/helpers.h
@@ -0,0 +1,127 @@
+/* OpenCL built-in library: SLEEF helpers.h
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/************************/
+#if defined(PURE_C)
+
+  #ifdef DORENAME
+    #include "rename.h"
+  #endif
+
+#elif defined(VEC128)
+
+  #ifdef DORENAME
+    #include "rename_vec128.h"
+  #endif
+
+  #ifdef __ARM_NEON
+    #define CONFIG 1
+    #ifdef __aarch64__
+      #define ENABLE_ADVSIMD
+      #include "helperadvsimd.h"
+    #else
+      #define ENABLE_NEON32
+      #include "helperneon32.h"
+    #endif
+
+  #elif defined(__AVX2__)
+    #define CONFIG 1
+    #define ENABLE_AVX2
+    #include "helperavx2_128.h"
+
+  #elif defined(__AVX__) && defined(__FMA4__)
+    #define CONFIG 5
+    #include "helpersse2.h"
+
+  #elif defined(__SSE4_1__)
+    #define CONFIG 4
+    #define ENABLE_SSE4
+    #include "helpersse2.h"
+
+  #elif defined(__SSE3__)
+    #define CONFIG 3
+    #define ENABLE_SSE2
+    #include "helpersse2.h"
+
+  #elif defined(__SSE2__)
+    #define CONFIG 2
+    #define ENABLE_SSE2
+    #include "helpersse2.h"
+
+  #else
+    #error 128bit vectors unavailable
+  #endif
+
+#elif defined(VEC256)
+
+  #ifdef DORENAME
+    #include "rename_vec256.h"
+  #endif
+
+  #if defined(__AVX2__)
+    #define CONFIG 1
+    #define ENABLE_AVX2
+    #include "helperavx2.h"
+
+  #elif defined(__FMA4__)
+    #define CONFIG 4
+    #define ENABLE_FMA4
+    #define ENABLE_AVX
+    #include "helperavx.h"
+
+  #elif defined(__AVX__)
+    #define CONFIG 1
+    #define ENABLE_AVX
+    #include "helperavx.h"
+
+  #else
+    #error 256bit vectors unavailable
+  #endif
+
+#elif defined(VEC512)
+
+  #ifdef DORENAME
+    #include "rename_vec512.h"
+  #endif
+
+  #ifdef __AVX512F__
+    #define CONFIG 1
+    #define ENABLE_AVX512F
+    #include "helperavx512f.h"
+  #else
+    #error 512bit vectors unavailable
+  #endif
+
+#else
+#error Please specify valid vector size with -DVECxxx
+#endif
+
+/* TODO this one is completely untested. */
+
+#ifdef ENABLE_VECEXT
+#define CONFIG 1
+#include "helpervecext.h"
+#ifdef DORENAME
+#include "renamevecext.h"
+#endif
+#endif
diff --git a/lib/kernel/sleef/arch/helpersse2.h b/lib/kernel/sleef/arch/helpersse2.h
new file mode 100644
index 0000000..99ac898
--- /dev/null
+++ b/lib/kernel/sleef/arch/helpersse2.h
@@ -0,0 +1,440 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#if CONFIG == 2
+
+#if !defined(__SSE2__)
+#error Please specify -msse2.
+#endif
+
+#elif CONFIG == 3
+
+#if !defined(__SSE2__) || !defined(__SSE3__)
+#error Please specify -msse2 and -msse3
+#endif
+
+#elif CONFIG == 4
+
+#if !defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__)
+#error Please specify -msse2, -msse3 and -msse4.1
+#endif
+
+#elif CONFIG == 5
+
+#define ENABLE_FMA_DP
+#define ENABLE_FMA_SP
+
+#if !defined(__SSE2__) || !defined(__SSE3__) || !defined(__SSE4_1__) || !defined(__FMA4__)
+#error Please specify -msse2, -msse3, -msse4.1 and -mfma4
+#endif
+
+
+#else
+#error CONFIG macro invalid or not defined
+#endif
+
+#define ENABLE_DP
+#define LOG2VECTLENDP 1
+#define VECTLENDP (1 << LOG2VECTLENDP)
+
+#define ENABLE_SP
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef __m128i vmask;
+typedef __m128i vopmask;
+
+typedef __m128d vdouble;
+typedef __m128i vint;
+
+typedef __m128  vfloat;
+typedef __m128i vint2;
+
+//
+
+static INLINE void vprefetch_v_p(const void *ptr) { _mm_prefetch(ptr, _MM_HINT_T0); }
+
+static INLINE int vtestallones_i_vo32(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
+static INLINE int vtestallones_i_vo64(vopmask g) { return _mm_movemask_epi8(g) == 0xFFFF; }
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) { _mm_storeu_si128((__m128i *)p, v); }
+
+static vint vloadu_vi_p(int32_t *p) { return _mm_loadu_si128((__m128i *)p); }
+static void vstoreu_v_p_vi(int32_t *p, vint v) { _mm_storeu_si128((__m128i *)p, v); }
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return _mm_and_si128(x, y); }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return _mm_or_si128(x, y); }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return _mm_and_si128(x, y); }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return _mm_andnot_si128(x, y); }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return _mm_or_si128(x, y); }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return _mm_xor_si128(x, y); }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
+static INLINE vmask vandnot_vm_vo64_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
+static INLINE vmask vxor_vm_vo64_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return _mm_and_si128(x, y); }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return _mm_or_si128(x, y); }
+static INLINE vmask vandnot_vm_vo32_vm(vmask x, vmask y) { return _mm_andnot_si128(x, y); }
+static INLINE vmask vxor_vm_vo32_vm(vmask x, vmask y) { return _mm_xor_si128(x, y); }
+
+static INLINE vopmask vcast_vo32_vo64(vopmask m) { return _mm_shuffle_epi32(m, 0x08); }
+static INLINE vopmask vcast_vo64_vo32(vopmask m) { return _mm_shuffle_epi32(m, 0x50); }
+
+//
+
+static INLINE vint vrint_vi_vd(vdouble vd) { return _mm_cvtpd_epi32(vd); }
+static INLINE vint vtruncate_vi_vd(vdouble vd) { return _mm_cvttpd_epi32(vd); }
+static INLINE vdouble vcast_vd_vi(vint vi) { return _mm_cvtepi32_pd(vi); }
+static INLINE vint vcast_vi_i(int i) { return _mm_set_epi32(0, 0, i, i); }
+static INLINE vint2 vcastu_vi2_vi(vint vi) { return _mm_and_si128(_mm_shuffle_epi32(vi, 0x73), _mm_set_epi32(-1, 0, -1, 0)); }
+static INLINE vint vcastu_vi_vi2(vint2 vi) { return _mm_shuffle_epi32(vi, 0x0d); }
+
+static INLINE vint2 vsrl64_vi2_vi(vint2 x, int i) { return _mm_srli_epi64(x, i); }
+static INLINE vint2 vadd64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_add_epi64(x, y); }
+static INLINE vint2 vsub64_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_sub_epi64(x, y); }
+static INLINE vint2 vcast_vi2_i64(long x) { return _mm_set1_epi64x(x); }
+
+#ifdef __SSE4_1__
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return _mm_round_pd(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vf) { return _mm_round_ps(vf, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return _mm_round_ps(vd, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) { return _mm_cmpeq_epi64(x, y); }
+#define FULL_FP_ROUNDING
+#else
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
+  vmask t = _mm_cmpeq_epi32(x, y);
+  return vand_vm_vm_vm(t, _mm_shuffle_epi32(t, 0xb1));
+}
+#endif
+
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) { return _mm_add_epi64(x, y); }
+
+static INLINE vmask vcast_vm_i_i(int i0, int i1) { return _mm_set_epi32(i0, i1, i0, i1); }
+
+//
+
+static INLINE vdouble vcast_vd_d(double d) { return _mm_set1_pd(d); }
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return _mm_castpd_si128(vd); }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return _mm_castpd_si128(vd); }
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return _mm_castsi128_pd(vi); }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return _mm_castsi128_pd(vm); }
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_add_pd(x, y); }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return _mm_sub_pd(x, y); }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return _mm_mul_pd(x, y); }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return _mm_div_pd(x, y); }
+static INLINE vdouble vrec_vd_vd(vdouble x) { return _mm_div_pd(_mm_set1_pd(1), x); }
+static INLINE vdouble vsqrt_vd_vd(vdouble x) { return _mm_sqrt_pd(x); }
+static INLINE vdouble vabs_vd_vd(vdouble d) { return _mm_andnot_pd(_mm_set1_pd(-0.0), d); }
+static INLINE vdouble vneg_vd_vd(vdouble d) { return _mm_xor_pd(_mm_set1_pd(-0.0), d); }
+
+#if CONFIG == 5
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_macc_pd(x, y, z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_msub_pd(x, y, z); }
+static INLINE vdouble vmlanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_nmacc_pd(x, y, z); }
+static INLINE vdouble vfma_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_macc_pd(x, y, z); }
+static INLINE vdouble vfmapp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_macc_pd(x, y, z); }
+static INLINE vdouble vfmapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_msub_pd(x, y, z); }
+static INLINE vdouble vfmanp_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_nmacc_pd(x, y, z); }
+static INLINE vdouble vfmann_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return _mm_nmsub_pd(x, y, z); }
+#else
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsub_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+#endif
+
+
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return _mm_max_pd(x, y); }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return _mm_min_pd(x, y); }
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpeq_pd(x, y)); }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpneq_pd(x, y)); }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmplt_pd(x, y)); }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmple_pd(x, y)); }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpgt_pd(x, y)); }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return _mm_castpd_si128(_mm_cmpge_pd(x, y)); }
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return _mm_add_epi32(x, y); }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return _mm_sub_epi32(x, y); }
+static INLINE vint vneg_vi_vi(vint e) { return vsub_vi_vi_vi(vcast_vi_i(0), e); }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return _mm_andnot_si128(x, y); }
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return _mm_or_si128(x, y); }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return _mm_xor_si128(x, y); }
+
+static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return _mm_and_si128(x, y); }
+static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return _mm_andnot_si128(x, y); }
+
+static INLINE vint vsll_vi_vi_i(vint x, int c) { return _mm_slli_epi32(x, c); }
+static INLINE vint vsrl_vi_vi_i(vint x, int c) { return _mm_srli_epi32(x, c); }
+static INLINE vint vsra_vi_vi_i(vint x, int c) { return _mm_srai_epi32(x, c); }
+
+static INLINE vint veq_vi_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
+
+static INLINE vopmask veq_cvt_vo_vi_vi(vint x, vint y) { return _mm_shuffle_epi32(_mm_cmpeq_epi32(x, y), 0xf5); }
+static INLINE vopmask vgt_cvt_vo_vi_vi(vint x, vint y) { return _mm_shuffle_epi32(_mm_cmpgt_epi32(x, y), 0xf5); }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return _mm_cmpgt_epi32(x, y); }
+
+#ifdef __SSE4_1__
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return _mm_blendv_epi8(y, x, m); }
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask m, vdouble x, vdouble y) { return _mm_blendv_pd(y, x, _mm_castsi128_pd(m)); }
+#else
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) { return vor_vm_vm_vm(vand_vm_vm_vm(m, x), vandnot_vm_vm_vm(m, y)); }
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask opmask, vdouble x, vdouble y) {
+  return vreinterpret_vd_vm(vor_vm_vm_vm(vand_vm_vm_vm(opmask, vreinterpret_vm_vd(x)), vandnot_vm_vm_vm(opmask, vreinterpret_vm_vd(y))));
+}
+#endif
+
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmpeq_pd(vabs_vd_vd(d), _mm_set1_pd(INFINITY)));
+}
+
+static INLINE vopmask vispinf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(INFINITY)));
+}
+
+static INLINE vopmask visminf_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmpeq_pd(d, _mm_set1_pd(-INFINITY)));
+}
+
+static INLINE vopmask visnan_vo_vd(vdouble d) {
+  return vreinterpret_vm_vd(_mm_cmpneq_pd(d, d));
+}
+
+//
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return _mm_load_pd(ptr); }
+static INLINE vdouble vloadu_vd_p(const double *ptr) { return _mm_loadu_pd(ptr); }
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { _mm_store_pd(ptr, v); }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) { _mm_storeu_pd(ptr, v); }
+
+#if defined(_MSC_VER)
+// This function is needed when debugging on MSVC.
+static INLINE double vcast_d_vd(vdouble v) {
+  double a[VECTLENDP];
+  vstoreu_v_p_vd(a, v);
+  return a[0];
+}
+#endif
+
+//
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return vi; }
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return _mm_cvtps_epi32(vf); }
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) { return _mm_cvttps_epi32(vf); }
+static INLINE vfloat vcast_vf_vi2(vint2 vi) { return _mm_cvtepi32_ps(vcast_vm_vi2(vi)); }
+static INLINE vfloat vcast_vf_f(float f) { return _mm_set1_ps(f); }
+static INLINE vint2 vcast_vi2_i(int i) { return _mm_set1_epi32(i); }
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return _mm_castps_si128(vf); }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return _mm_castsi128_ps(vm); }
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vm) { return _mm_castsi128_ps(vm); }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return _mm_castps_si128(vf); }
+
+#ifndef __SSE4_1__
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
+static INLINE vfloat vrint_vf_vf(vfloat vf) { return vcast_vf_vi2(vrint_vi2_vf(vf)); }
+#endif
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_add_ps(x, y); }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return _mm_sub_ps(x, y); }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return _mm_mul_ps(x, y); }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return _mm_div_ps(x, y); }
+static INLINE vfloat vrec_vf_vf(vfloat x) { return vdiv_vf_vf_vf(vcast_vf_f(1.0f), x); }
+static INLINE vfloat vsqrt_vf_vf(vfloat x) { return _mm_sqrt_ps(x); }
+static INLINE vfloat vabs_vf_vf(vfloat f) { return vreinterpret_vf_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))); }
+static INLINE vfloat vneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(d))); }
+
+#if CONFIG == 5
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_macc_ps(x, y, z); }
+static INLINE vfloat vmlapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_msub_ps(x, y, z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_nmacc_ps(x, y, z); }
+static INLINE vfloat vfma_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_macc_ps(x, y, z); }
+static INLINE vfloat vfmapp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_macc_ps(x, y, z); }
+static INLINE vfloat vfmapn_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_msub_ps(x, y, z); }
+static INLINE vfloat vfmanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_nmacc_ps(x, y, z); }
+static INLINE vfloat vfmann_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return _mm_nmsub_ps(x, y, z); }
+
+#else
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsub_vf_vf_vf(z, vmul_vf_vf_vf(x, y)); }
+#endif
+
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return _mm_max_ps(x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return _mm_min_ps(x, y); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpeq_ps(x, y)); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpneq_ps(x, y)); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmplt_ps(x, y)); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmple_ps(x, y)); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpgt_ps(x, y)); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return vreinterpret_vm_vf(_mm_cmpge_ps(x, y)); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return vadd_vi_vi_vi(x, y); }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return vsub_vi_vi_vi(x, y); }
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return vsub_vi2_vi2_vi2(vcast_vi2_i(0), e); }
+
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return vand_vi_vi_vi(x, y); }
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return vandnot_vi_vi_vi(x, y); }
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return vor_vi_vi_vi(x, y); }
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return vxor_vi_vi_vi(x, y); }
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return vand_vi_vo_vi(x, y); }
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return vandnot_vi_vo_vi(x, y); }
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) { return vsll_vi_vi_i(x, c); }
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) { return vsrl_vi_vi_i(x, c); }
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return vsra_vi_vi_i(x, c); }
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpeq_epi32(x, y); }
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return _mm_cmpgt_epi32(x, y); }
+
+#ifdef __SSE4_1__
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) { return _mm_blendv_epi8(y, x, m); }
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask m, vfloat x, vfloat y) { return _mm_blendv_ps(y, x, _mm_castsi128_ps(m)); }
+#else
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask m, vint2 x, vint2 y) {
+  return vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(m, x), vandnot_vi2_vi2_vi2(m, y));
+}
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask mask, vfloat x, vfloat y) {
+  return vreinterpret_vf_vm(vor_vm_vm_vm(vand_vm_vm_vm(mask, vreinterpret_vm_vf(x)), vandnot_vm_vm_vm(mask, vreinterpret_vm_vf(y))));
+}
+#endif
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(INFINITYf)); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(INFINITYf)); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return veq_vo_vf_vf(d, vcast_vf_f(-INFINITYf)); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return vneq_vo_vf_vf(d, d); }
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+#ifdef __SSE4_1__
+  vopmask gt = vgt_vo_vd_vd(x, lim);
+  return _mm_testz_si128(gt, gt);
+#else
+  return 0;
+#endif
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+#ifdef __SSE4_1__
+  vopmask gt = vgt_vo_vf_vf(x, lim);
+  return _mm_testz_si128(gt, gt);
+#else
+  return 0;
+#endif
+}
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return _mm_load_ps(ptr); }
+static INLINE vfloat vloadu_vf_p(const float *ptr) { return _mm_loadu_ps(ptr); }
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { _mm_store_ps(ptr, v); }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) { _mm_storeu_ps(ptr, v); }
+
+#ifdef _MSC_VER
+// This function is useful when debugging on MSVC.
+static INLINE float vcast_f_vf(vfloat v) {
+  float a[VECTLENSP];
+  vstoreu_v_p_vf(a, v);
+  return a[0];
+}
+#endif
+//
+
+#define PNMASK ((vdouble) { +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0 })
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
+
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(PNMASK))); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(NPMASK))); }
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(PNMASKf))); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(d), vreinterpret_vm_vf(NPMASKf))); }
+
+#ifdef __SSE3__
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return _mm_addsub_pd(x, y); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return _mm_addsub_ps(x, y); }
+#else
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
+#endif
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) { return _mm_shuffle_pd(d0, d0, 1); }
+static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
+
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { _mm_stream_pd(ptr, v); }
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vstore_v_p_vd((double *)(&ptr[2*offset]), v); }
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { _mm_stream_pd((double *)(&ptr[2*offset]), v); }
+
+//
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (2 << 6) | (3 << 4) | (0 << 2) | (1 << 0)); }
+static INLINE vfloat vreva2_vf_vf(vfloat d0) { return _mm_shuffle_ps(d0, d0, (1 << 6) | (0 << 4) | (3 << 2) | (2 << 0)); }
+
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { _mm_stream_ps(ptr, v); }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  _mm_storel_pd((double *)(ptr+(offset + step * 0)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+  _mm_storeh_pd((double *)(ptr+(offset + step * 1)*2), vreinterpret_vd_vm(vreinterpret_vm_vf(v)));
+}
diff --git a/lib/kernel/sleef/arch/helpervecext.h b/lib/kernel/sleef/arch/helpervecext.h
new file mode 100644
index 0000000..035095f
--- /dev/null
+++ b/lib/kernel/sleef/arch/helpervecext.h
@@ -0,0 +1,877 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#include <stdint.h>
+#include "misc.h"
+
+#ifndef CONFIG
+#error CONFIG macro not defined
+#endif
+
+#define ENABLE_DP
+#define ENABLE_SP
+
+#define LOG2VECTLENDP CONFIG
+#define VECTLENDP (1 << LOG2VECTLENDP)
+#define LOG2VECTLENSP (LOG2VECTLENDP+1)
+#define VECTLENSP (1 << LOG2VECTLENSP)
+
+#define DFTPRIORITY LOG2VECTLENDP
+
+// GCC 4 has a bug that prevents long-double functions from compiling
+#if defined(__clang__) || (defined(__GNUC__) && __GNUC__ >= 5)
+#define ENABLE_LONGDOUBLE
+#endif
+
+#if defined(__clang__)
+#define ISANAME "Clang Vector Extension"
+
+typedef uint32_t vmask __attribute__((ext_vector_type(VECTLENDP*2)));
+typedef uint32_t vopmask __attribute__((ext_vector_type(VECTLENDP*2)));
+
+typedef double vdouble __attribute__((ext_vector_type(VECTLENDP)));
+typedef int32_t vint __attribute__((ext_vector_type(VECTLENDP)));
+
+typedef float vfloat __attribute__((ext_vector_type(VECTLENDP*2)));
+typedef int32_t vint2 __attribute__((ext_vector_type(VECTLENDP*2)));
+
+#ifdef ENABLE_LONGDOUBLE
+typedef uint8_t vmaskl __attribute__((ext_vector_type(sizeof(long double)*VECTLENDP)));
+typedef long double vlongdouble __attribute__((ext_vector_type(VECTLENDP)));
+#endif
+
+#ifdef Sleef_quad2_DEFINED
+typedef uint8_t vmaskq __attribute__((ext_vector_type(sizeof(Sleef_quad)*VECTLENDP)));
+#ifdef ENABLE_LONGDOUBLE
+typedef long double vquad __attribute__((ext_vector_type(VECTLENDP)));
+#endif
+#endif
+#elif defined(__GNUC__)
+#define ISANAME "GCC Vector Extension"
+
+typedef uint32_t vmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
+typedef uint32_t vopmask __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
+
+typedef double vdouble __attribute__((vector_size(sizeof(double)*VECTLENDP)));
+typedef int32_t vint __attribute__((vector_size(sizeof(int32_t)*VECTLENDP)));
+
+typedef float vfloat __attribute__((vector_size(sizeof(float)*VECTLENDP*2)));
+typedef int32_t vint2 __attribute__((vector_size(sizeof(int32_t)*VECTLENDP*2)));
+
+#ifdef ENABLE_LONGDOUBLE
+typedef uint8_t vmaskl __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
+typedef long double vlongdouble __attribute__((vector_size(sizeof(long double)*VECTLENDP)));
+#endif
+
+#ifdef Sleef_quad2_DEFINED
+typedef uint8_t vmaskq __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
+typedef Sleef_quad vquad __attribute__((vector_size(sizeof(Sleef_quad)*VECTLENDP)));
+#endif
+#endif
+
+//
+
+#if VECTLENDP == 2
+static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], 0, 0 }; }
+static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1] }; }
+
+static INLINE vint vcast_vi_i(int i) { return (vint) { i, i }; }
+static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i }; }
+static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f }; }
+static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d }; }
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d }; }
+#endif
+#ifdef Sleef_quad2_DEFINED
+static INLINE vquad vcast_vq_q(Sleef_quad d) { return (vquad) { d, d }; }
+#endif
+
+static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h }; }
+static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1] }; }
+static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3] }; }
+
+static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1] }; }
+static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], 0, 0 }; }
+
+static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0] }; }
+static INLINE vdouble vreva2_vd_vd(vdouble vd) { return vd; }
+static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2] }; }
+static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[2], vd[3], vd[0], vd[1] }; }
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0] }; }
+static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return vd; }
+static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1] }; }
+static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1] }; }
+#endif
+
+#ifdef Sleef_quad2_DEFINED
+static INLINE vquad vrev21_vq_vq(vquad vd) { return (vquad) { vd[1], vd[0] }; }
+static INLINE vquad vreva2_vq_vq(vquad vd) { return vd; }
+static INLINE vquad vposneg_vq_vq(vquad vd) { return (vquad) { +vd[0], -vd[1] }; }
+static INLINE vquad vnegpos_vq_vq(vquad vd) { return (vquad) { -vd[0], +vd[1] }; }
+#endif
+
+#define PNMASK ((vdouble) { +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0 })
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
+
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f })
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
+#elif VECTLENDP == 4
+static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], 0, 0, 0, 0 }; }
+static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3] }; }
+
+static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i }; }
+static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i }; }
+static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f }; }
+static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d }; }
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d }; }
+#endif
+
+static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h }; }
+static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3] }; }
+static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7] }; }
+
+static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3] }; }
+static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], 0, 0, 0, 0 }; }
+
+#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0 })
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
+
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2] }; }
+static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[2], vd[3], vd[0], vd[1] }; }
+static INLINE vfloat vrev21_vf_vf(vfloat vd) { return (vfloat) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
+static INLINE vfloat vreva2_vf_vf(vfloat vd) { return (vfloat) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2] }; }
+static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[2], vd[3], vd[0], vd[1] }; }
+static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3] }; }
+static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3] }; }
+#endif
+#elif VECTLENDP == 8
+static INLINE vopmask vcast_vo32_vo64(vopmask m) { return (vopmask){ m[1], m[3], m[5], m[7], m[9], m[11], m[13], m[15], 0, 0, 0, 0, 0, 0, 0, 0 }; }
+static INLINE vopmask vcast_vo64_vo32(vopmask m) { return (vopmask){ m[0], m[0], m[1], m[1], m[2], m[2], m[3], m[3], m[4], m[4], m[5], m[5], m[6], m[6], m[7], m[7] }; }
+
+static INLINE vint vcast_vi_i(int i) { return (vint) { i, i, i, i, i, i, i, i }; }
+static INLINE vint2 vcast_vi2_i(int i) { return (vint2) { i, i, i, i, i, i, i, i, i, i, i, i, i, i, i, i }; }
+static INLINE vfloat vcast_vf_f(float f) { return (vfloat) { f, f, f, f, f, f, f, f, f, f, f, f, f, f, f, f }; }
+static INLINE vdouble vcast_vd_d(double d) { return (vdouble) { d, d, d, d, d, d, d, d }; }
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vcast_vl_l(long double d) { return (vlongdouble) { d, d, d, d, d, d, d, d }; }
+#endif
+
+static INLINE vmask vcast_vm_i_i(int h, int l) { return (vmask){ l, h, l, h, l, h, l, h, l, h, l, h, l, h, l, h }; }
+static INLINE vint2 vcastu_vi2_vi(vint vi) { return (vint2){ 0, vi[0], 0, vi[1], 0, vi[2], 0, vi[3], 0, vi[4], 0, vi[5], 0, vi[6], 0, vi[7] }; }
+static INLINE vint vcastu_vi_vi2(vint2 vi2) { return (vint){ vi2[1], vi2[3], vi2[5], vi2[7], vi2[9], vi2[11], vi2[13], vi2[15] }; }
+
+static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) { return (vint){ vi2[0], vi2[1], vi2[2], vi2[3], vi2[4], vi2[5], vi2[6], vi2[7] }; }
+static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) { return (vint2){ vi[0], vi[1], vi[2], vi[3], vi[4], vi[5], vi[6], vi[7], 0, 0, 0, 0, 0, 0, 0, 0 }; }
+
+#define PNMASK ((vdouble) { +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0 })
+#define NPMASK ((vdouble) { -0.0, +0.0, -0.0, +0.0, -0.0, +0.0, -0.0, +0.0 })
+static INLINE vdouble vposneg_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)PNMASK); }
+static INLINE vdouble vnegpos_vd_vd(vdouble d) { return (vdouble)((vmask)d ^ (vmask)NPMASK); }
+
+#define PNMASKf ((vfloat) { +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f })
+#define NPMASKf ((vfloat) { -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f, -0.0f, +0.0f })
+static INLINE vfloat vposneg_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)PNMASKf); }
+static INLINE vfloat vnegpos_vf_vf(vfloat d) { return (vfloat)((vmask)d ^ (vmask)NPMASKf); }
+
+static INLINE vdouble vrev21_vd_vd(vdouble vd) { return (vdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
+static INLINE vdouble vreva2_vd_vd(vdouble vd) { return (vdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
+static INLINE vfloat vrev21_vf_vf(vfloat vd) {
+  return (vfloat) {
+    vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6],
+      vd[9], vd[8], vd[11], vd[10], vd[13], vd[12], vd[15], vd[14] };
+}
+static INLINE vfloat vreva2_vf_vf(vfloat vd) {
+  return (vfloat) {
+    vd[14], vd[15], vd[12], vd[13], vd[10], vd[11], vd[8], vd[9],
+      vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1]};
+}
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vrev21_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[1], vd[0], vd[3], vd[2], vd[5], vd[4], vd[7], vd[6] }; }
+static INLINE vlongdouble vreva2_vl_vl(vlongdouble vd) { return (vlongdouble) { vd[6], vd[7], vd[4], vd[5], vd[2], vd[3], vd[0], vd[1] }; }
+static INLINE vlongdouble vposneg_vl_vl(vlongdouble vd) { return (vlongdouble) { +vd[0], -vd[1], +vd[2], -vd[3], +vd[4], -vd[5], +vd[6], -vd[7] }; }
+static INLINE vlongdouble vnegpos_vl_vl(vlongdouble vd) { return (vlongdouble) { -vd[0], +vd[1], -vd[2], +vd[3], -vd[4], +vd[5], -vd[6], +vd[7] }; }
+#endif
+#else
+static INLINE vint vcast_vi_i(int k) {
+  vint ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = k;
+  return ret;
+}
+
+static INLINE vint2 vcast_vi2_i(int k) {
+  vint2 ret;
+  for(int i=0;i<VECTLENSP;i++) ret[i] = k;
+  return ret;
+}
+
+static INLINE vdouble vcast_vd_d(double d) {
+  vdouble ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = d;
+  return ret;
+}
+
+static INLINE vfloat vcast_vf_f(float f) {
+  vfloat ret;
+  for(int i=0;i<VECTLENSP;i++) ret[i] = f;
+  return ret;
+}
+
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vcast_vl_l(long double d) {
+  vlongdouble ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = d;
+  return ret;
+}
+#endif
+
+static INLINE vopmask vcast_vo32_vo64(vopmask m) {
+  vopmask ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = m[i*2+1];
+  for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
+  return ret;
+}
+
+static INLINE vopmask vcast_vo64_vo32(vopmask m) {
+  vopmask ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i*2] = ret[i*2+1] = m[i];
+  return ret;
+}
+
+static INLINE vmask vcast_vm_i_i(int h, int l) {
+  vmask ret;
+  for(int i=0;i<VECTLENDP;i++) {
+    ret[i*2+0] = l;
+    ret[i*2+1] = h;
+  }
+  return ret;
+}
+
+static INLINE vint2 vcastu_vi2_vi(vint vi) {
+  vint2 ret;
+  for(int i=0;i<VECTLENDP;i++) {
+    ret[i*2+0] = 0;
+    ret[i*2+1] = vi[i];
+  }
+  return ret;
+}
+
+static INLINE vint vcastu_vi_vi2(vint2 vi2) {
+  vint ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i*2+1];
+  return ret;
+}
+
+static INLINE vint vreinterpretFirstHalf_vi_vi2(vint2 vi2) {
+  vint ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = vi2[i];
+  return ret;
+}
+
+static INLINE vint2 vreinterpretFirstHalf_vi2_vi(vint vi) {
+  vint2 ret;
+  for(int i=0;i<VECTLENDP;i++) ret[i] = vi[i];
+  for(int i=VECTLENDP;i<VECTLENDP*2;i++) ret[i] = 0;
+  return ret;
+}
+
+static INLINE vdouble vrev21_vd_vd(vdouble d0) {
+  vdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = d0[i*2+1];
+    r[i*2+1] = d0[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vdouble vreva2_vd_vd(vdouble d0) {
+  vdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
+    r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
+  }
+  return r;
+}
+
+static INLINE vfloat vrev21_vf_vf(vfloat d0) {
+  vfloat r;
+  for(int i=0;i<VECTLENSP/2;i++) {
+    r[i*2+0] = d0[i*2+1];
+    r[i*2+1] = d0[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vfloat vreva2_vf_vf(vfloat d0) {
+  vfloat r;
+  for(int i=0;i<VECTLENSP/2;i++) {
+    r[i*2+0] = d0[(VECTLENSP/2-1-i)*2+0];
+    r[i*2+1] = d0[(VECTLENSP/2-1-i)*2+1];
+  }
+  return r;
+}
+
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vrev21_vl_vl(vlongdouble d0) {
+  vlongdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = d0[i*2+1];
+    r[i*2+1] = d0[i*2+0];
+  }
+  return r;
+}
+
+static INLINE vlongdouble vreva2_vl_vl(vlongdouble d0) {
+  vlongdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = d0[(VECTLENDP/2-1-i)*2+0];
+    r[i*2+1] = d0[(VECTLENDP/2-1-i)*2+1];
+  }
+  return r;
+}
+#endif
+
+static INLINE vdouble vposneg_vd_vd(vdouble d0) {
+  vdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = +d0[i*2+0];
+    r[i*2+1] = -d0[i*2+1];
+  }
+  return r;
+}
+
+static INLINE vdouble vnegpos_vd_vd(vdouble d0) {
+  vdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = -d0[i*2+0];
+    r[i*2+1] = +d0[i*2+1];
+  }
+  return r;
+}
+
+static INLINE vfloat vposneg_vf_vf(vfloat d0) {
+  vfloat r;
+  for(int i=0;i<VECTLENSP/2;i++) {
+    r[i*2+0] = +d0[i*2+0];
+    r[i*2+1] = -d0[i*2+1];
+  }
+  return r;
+}
+
+static INLINE vfloat vnegpos_vf_vf(vfloat d0) {
+  vfloat r;
+  for(int i=0;i<VECTLENSP/2;i++) {
+    r[i*2+0] = -d0[i*2+0];
+    r[i*2+1] = +d0[i*2+1];
+  }
+  return r;
+}
+
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vposneg_vl_vl(vlongdouble d0) {
+  vlongdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = +d0[i*2+0];
+    r[i*2+1] = -d0[i*2+1];
+  }
+  return r;
+}
+
+static INLINE vlongdouble vnegpos_vl_vl(vlongdouble d0) {
+  vlongdouble r;
+  for(int i=0;i<VECTLENDP/2;i++) {
+    r[i*2+0] = -d0[i*2+0];
+    r[i*2+1] = +d0[i*2+1];
+  }
+  return r;
+}
+#endif
+#endif
+
+//
+
+static INLINE int vavailability_i(int name) { return -1; }
+static INLINE void vprefetch_v_p(const void *ptr) { }
+
+static INLINE int vtestallones_i_vo64(vopmask g) {
+  int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
+}
+
+static INLINE int vtestallones_i_vo32(vopmask g) {
+  int ret = 1; for(int i=0;i<VECTLENDP*2;i++) ret = ret && g[i]; return ret;
+}
+
+//
+
+static vint2 vloadu_vi2_p(int32_t *p) {
+  vint2 vi;
+  for(int i=0;i<VECTLENSP;i++) vi[i] = p[i];
+  return vi;
+}
+
+static void vstoreu_v_p_vi2(int32_t *p, vint2 v) {
+  for(int i=0;i<VECTLENSP;i++) p[i] = v[i];
+}
+
+static vint vloadu_vi_p(int32_t *p) {
+  vint vi;
+  for(int i=0;i<VECTLENDP;i++) vi[i] = p[i];
+  return vi;
+}
+
+static void vstoreu_v_p_vi(int32_t *p, vint v) {
+  for(int i=0;i<VECTLENDP;i++) p[i] = v[i];
+}
+
+//
+
+static INLINE vmask vand_vm_vm_vm(vmask x, vmask y) { return x & y; }
+static INLINE vmask vandnot_vm_vm_vm(vmask x, vmask y) { return y & ~x; }
+static INLINE vmask vor_vm_vm_vm(vmask x, vmask y) { return x | y; }
+static INLINE vmask vxor_vm_vm_vm(vmask x, vmask y) { return x ^ y; }
+
+static INLINE vopmask vand_vo_vo_vo(vopmask x, vopmask y) { return x & y; }
+static INLINE vopmask vandnot_vo_vo_vo(vopmask x, vopmask y) { return y & ~x; }
+static INLINE vopmask vor_vo_vo_vo(vopmask x, vopmask y) { return x | y; }
+static INLINE vopmask vxor_vo_vo_vo(vopmask x, vopmask y) { return x ^ y; }
+
+static INLINE vmask vand_vm_vo64_vm(vopmask x, vmask y) { return x & y; }
+static INLINE vmask vandnot_vm_vo64_vm(vopmask x, vmask y) { return y & ~x; }
+static INLINE vmask vor_vm_vo64_vm(vopmask x, vmask y) { return x | y; }
+static INLINE vmask vxor_vm_vo64_vm(vopmask x, vmask y) { return x ^ y; }
+
+static INLINE vmask vand_vm_vo32_vm(vopmask x, vmask y) { return x & y; }
+static INLINE vmask vandnot_vm_vo32_vm(vopmask x, vmask y) { return y & ~x; }
+static INLINE vmask vor_vm_vo32_vm(vopmask x, vmask y) { return x | y; }
+static INLINE vmask vxor_vm_vo32_vm(vopmask x, vmask y) { return x ^ y; }
+
+//
+
+static INLINE vdouble vsel_vd_vo_vd_vd(vopmask o, vdouble x, vdouble y) { return (vdouble)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
+static INLINE vint2 vsel_vi2_vo_vi2_vi2(vopmask o, vint2 x, vint2 y) { return (vint2)(((vmask)o & (vmask)x) | ((vmask)y & ~(vmask)o)); }
+
+static INLINE CONST vdouble vsel_vd_vo_d_d(vopmask o, double v1, double v0) {
+  return vsel_vd_vo_vd_vd(o, vcast_vd_d(v1), vcast_vd_d(v0));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_d_d(o1, d1, d2));
+}
+
+static INLINE vdouble vsel_vd_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vd_vo_vd_vd(o0, vcast_vd_d(d0), vsel_vd_vo_vd_vd(o1, vcast_vd_d(d1), vsel_vd_vo_d_d(o2, d2, d3)));
+}
+
+static INLINE vdouble vcast_vd_vi(vint vi) {
+#if defined(__clang__)
+  return __builtin_convertvector(vi, vdouble);
+#else
+  vdouble vd;
+  for(int i=0;i<VECTLENDP;i++) vd[i] = vi[i];
+  return vd;
+#endif
+}
+static INLINE vint vtruncate_vi_vd(vdouble vd) {
+#if defined(__clang__)
+  return __builtin_convertvector(vd, vint);
+#else
+  vint vi;
+  for(int i=0;i<VECTLENDP;i++) vi[i] = vd[i];
+  return vi;
+#endif
+}
+static INLINE vint vrint_vi_vd(vdouble vd) { return vtruncate_vi_vd(vsel_vd_vo_vd_vd((vopmask)(vd < 0.0), vd - 0.5, vd + 0.5)); }
+static INLINE vdouble vtruncate_vd_vd(vdouble vd) { return vcast_vd_vi(vtruncate_vi_vd(vd)); }
+static INLINE vdouble vrint_vd_vd(vdouble vd) { return vcast_vd_vi(vrint_vi_vd(vd)); }
+
+static INLINE vopmask veq64_vo_vm_vm(vmask x, vmask y) {
+#if defined(__clang__)
+  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
+#else
+  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
+#endif
+  return (vopmask)((vi64)x == (vi64)y);
+}
+
+static INLINE vmask vadd64_vm_vm_vm(vmask x, vmask y) {
+#if defined(__clang__)
+  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
+#else
+  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
+#endif
+  return (vmask)((vi64)x + (vi64)y);
+}
+
+//
+
+static INLINE vmask vreinterpret_vm_vd(vdouble vd) { return (vmask)vd; }
+static INLINE vint2 vreinterpret_vi2_vd(vdouble vd) { return (vint2)vd; }
+static INLINE vdouble vreinterpret_vd_vi2(vint2 vi) { return (vdouble)vi; }
+static INLINE vdouble vreinterpret_vd_vm(vmask vm) { return (vdouble)vm; }
+
+static INLINE vdouble vadd_vd_vd_vd(vdouble x, vdouble y) { return x + y; }
+static INLINE vdouble vsub_vd_vd_vd(vdouble x, vdouble y) { return x - y; }
+static INLINE vdouble vmul_vd_vd_vd(vdouble x, vdouble y) { return x * y; }
+static INLINE vdouble vdiv_vd_vd_vd(vdouble x, vdouble y) { return x / y; }
+static INLINE vdouble vrec_vd_vd(vdouble x) { return 1.0 / x; }
+
+static INLINE vdouble vabs_vd_vd(vdouble d) { return (vdouble)((vmask)d & ~(vmask)vcast_vd_d(-0.0)); }
+static INLINE vdouble vneg_vd_vd(vdouble d) { return -d; }
+static INLINE vdouble vmla_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y + z; }
+static INLINE vdouble vmlapn_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return x * y - z; }
+static INLINE vdouble vmax_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x > y), x, y); }
+static INLINE vdouble vmin_vd_vd_vd(vdouble x, vdouble y) { return vsel_vd_vo_vd_vd((vopmask)(x < y), x, y); }
+
+static INLINE vdouble vsubadd_vd_vd_vd(vdouble x, vdouble y) { return vadd_vd_vd_vd(x, vnegpos_vd_vd(y)); }
+static INLINE vdouble vmlsubadd_vd_vd_vd_vd(vdouble x, vdouble y, vdouble z) { return vsubadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z); }
+
+static INLINE vopmask veq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x == y); }
+static INLINE vopmask vneq_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x != y); }
+static INLINE vopmask vlt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x < y); }
+static INLINE vopmask vle_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x <= y); }
+static INLINE vopmask vgt_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x > y); }
+static INLINE vopmask vge_vo_vd_vd(vdouble x, vdouble y) { return (vopmask)(x >= y); }
+
+static INLINE vint vadd_vi_vi_vi(vint x, vint y) { return x + y; }
+static INLINE vint vsub_vi_vi_vi(vint x, vint y) { return x - y; }
+static INLINE vint vneg_vi_vi(vint e) { return -e; }
+
+static INLINE vint vand_vi_vi_vi(vint x, vint y) { return x & y; }
+static INLINE vint vandnot_vi_vi_vi(vint x, vint y) { return y & ~x; }
+static INLINE vint vor_vi_vi_vi(vint x, vint y) { return x | y; }
+static INLINE vint vxor_vi_vi_vi(vint x, vint y) { return x ^ y; }
+
+static INLINE vint vand_vi_vo_vi(vopmask x, vint y) { return vreinterpretFirstHalf_vi_vi2((vint2)x) & y; }
+static INLINE vint vandnot_vi_vo_vi(vopmask x, vint y) { return y & ~vreinterpretFirstHalf_vi_vi2((vint2)x); }
+
+static INLINE vint vsll_vi_vi_i(vint x, int c) {
+#if defined(__clang__)
+  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
+#else
+  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
+#endif
+  return (vint)(((vu)x) << c);
+}
+
+static INLINE vint vsrl_vi_vi_i(vint x, int c) {
+#if defined(__clang__)
+  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP)));
+#else
+  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP)));
+#endif
+  return (vint)(((vu)x) >> c);
+}
+
+static INLINE vint vsra_vi_vi_i(vint x, int c) { return x >> c; }
+
+static INLINE vint veq_vi_vi_vi(vint x, vint y) { return x == y; }
+static INLINE vint vgt_vi_vi_vi(vint x, vint y) { return x > y; }
+
+static INLINE vopmask veq_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x == y); }
+static INLINE vopmask vgt_vo_vi_vi(vint x, vint y) { return (vopmask)vreinterpretFirstHalf_vi2_vi(x > y);}
+
+static INLINE vint vsel_vi_vo_vi_vi(vopmask m, vint x, vint y) {
+  return vor_vi_vi_vi(vand_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), x),
+          vandnot_vi_vi_vi(vreinterpretFirstHalf_vi_vi2((vint2)m), y));
+}
+
+static INLINE vopmask visinf_vo_vd(vdouble d) { return (vopmask)(vabs_vd_vd(d) == INFINITY); }
+static INLINE vopmask vispinf_vo_vd(vdouble d) { return (vopmask)(d == INFINITY); }
+static INLINE vopmask visminf_vo_vd(vdouble d) { return (vopmask)(d == -INFINITY); }
+static INLINE vopmask visnan_vo_vd(vdouble d) { return (vopmask)(d != d); }
+
+static INLINE vdouble vsqrt_vd_vd(vdouble d) {
+#if defined(__clang__)
+  typedef int64_t vi64 __attribute__((ext_vector_type(VECTLENDP)));
+#else
+  typedef int64_t vi64 __attribute__((vector_size(sizeof(int64_t)*VECTLENDP)));
+#endif
+
+  vdouble q = vcast_vd_d(1);
+
+  vopmask o = (vopmask)(d < 8.636168555094445E-78);
+  d = (vdouble)((o & (vmask)(d * 1.157920892373162E77)) | (~o & (vmask)d));
+
+  q = (vdouble)((o & (vmask)vcast_vd_d(2.9387358770557188E-39)) | (~o & (vmask)vcast_vd_d(1)));
+
+  q = (vdouble)vor_vm_vm_vm(vlt_vo_vd_vd(d, vcast_vd_d(0)), (vmask)q);
+
+  vdouble x = (vdouble)(0x5fe6ec85e7de30daLL - ((vi64)(d + 1e-320) >> 1));
+  x = x * (  3 - d * x * x);
+  x = x * ( 12 - d * x * x);
+  x = x * (768 - d * x * x);
+  x *= 1.0 / (1 << 13);
+  x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;
+
+  return x * q;
+}
+
+static INLINE double vcast_d_vd(vdouble v) { return v[0]; }
+static INLINE float vcast_f_vf(vfloat v) { return v[0]; }
+
+static INLINE vdouble vload_vd_p(const double *ptr) { return *(vdouble *)ptr; }
+static INLINE vdouble vloadu_vd_p(const double *ptr) {
+  vdouble vd;
+  for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
+  return vd;
+}
+
+static INLINE void vstore_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
+static INLINE void vstoreu_v_p_vd(double *ptr, vdouble v) {
+  for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
+}
+static INLINE void vstream_v_p_vd(double *ptr, vdouble v) { *(vdouble *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) {
+  for(int i=0;i<VECTLENDP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vd(double *ptr, int offset, int step, vdouble v) { vscatter2_v_p_i_i_vd(ptr, offset, step, v); }
+
+//
+
+static INLINE vfloat vsel_vf_vo_vf_vf(vopmask o, vfloat x, vfloat y) { return (vfloat)(((vmask)o & (vmask)x) | (~(vmask)o & (vmask)y)); }
+
+static INLINE CONST vfloat vsel_vf_vo_f_f(vopmask o, float v1, float v0) {
+  return vsel_vf_vo_vf_vf(o, vcast_vf_f(v1), vcast_vf_f(v0));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_f_f_f(vopmask o0, vopmask o1, float d0, float d1, float d2) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_f_f(o1, d1, d2));
+}
+
+static INLINE vfloat vsel_vf_vo_vo_vo_f_f_f_f(vopmask o0, vopmask o1, vopmask o2, float d0, float d1, float d2, float d3) {
+  return vsel_vf_vo_vf_vf(o0, vcast_vf_f(d0), vsel_vf_vo_vf_vf(o1, vcast_vf_f(d1), vsel_vf_vo_f_f(o2, d2, d3)));
+}
+
+static INLINE vint2 vcast_vi2_vm(vmask vm) { return (vint2)vm; }
+static INLINE vmask vcast_vm_vi2(vint2 vi) { return (vmask)vi; }
+
+static INLINE vfloat vcast_vf_vi2(vint2 vi) {
+#if defined(__clang__)
+  return __builtin_convertvector(vi, vfloat);
+#else
+  vfloat vf;
+  for(int i=0;i<VECTLENDP*2;i++) vf[i] = vi[i];
+  return vf;
+#endif
+}
+
+static INLINE vint2 vtruncate_vi2_vf(vfloat vf) {
+#if defined(__clang__)
+  return __builtin_convertvector(vf, vint2);
+#else
+  vint2 vi;
+  for(int i=0;i<VECTLENDP*2;i++) vi[i] = vf[i];
+  return vi;
+#endif
+}
+
+static INLINE vint2 vrint_vi2_vf(vfloat vf) { return vtruncate_vi2_vf(vsel_vf_vo_vf_vf((vopmask)(vf < 0), vf - 0.5f, vf + 0.5)); }
+static INLINE vfloat vtruncate_vf_vf(vfloat vd) { return vcast_vf_vi2(vtruncate_vi2_vf(vd)); }
+static INLINE vfloat vrint_vf_vf(vfloat vd) { return vcast_vf_vi2(vrint_vi2_vf(vd)); }
+
+static INLINE vmask vreinterpret_vm_vf(vfloat vf) { return (vmask)vf; }
+static INLINE vfloat vreinterpret_vf_vm(vmask vm) { return (vfloat)vm; }
+static INLINE vfloat vreinterpret_vf_vi2(vint2 vi) { return (vfloat)vi; }
+static INLINE vint2 vreinterpret_vi2_vf(vfloat vf) { return (vint2)vf; }
+
+static INLINE vfloat vadd_vf_vf_vf(vfloat x, vfloat y) { return x + y; }
+static INLINE vfloat vsub_vf_vf_vf(vfloat x, vfloat y) { return x - y; }
+static INLINE vfloat vmul_vf_vf_vf(vfloat x, vfloat y) { return x * y; }
+static INLINE vfloat vdiv_vf_vf_vf(vfloat x, vfloat y) { return x / y; }
+static INLINE vfloat vrec_vf_vf(vfloat x) { return 1.0f / x; }
+
+static INLINE vfloat vabs_vf_vf(vfloat f) { return (vfloat)vandnot_vm_vm_vm((vmask)vcast_vf_f(-0.0f), (vmask)f); }
+static INLINE vfloat vneg_vf_vf(vfloat d) { return -d; }
+static INLINE vfloat vmla_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return x*y+z; }
+static INLINE vfloat vmlanp_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return z-x*y; }
+static INLINE vfloat vmax_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x > y), x, y); }
+static INLINE vfloat vmin_vf_vf_vf(vfloat x, vfloat y) { return vsel_vf_vo_vf_vf((vopmask)(x < y), x, y); }
+
+static INLINE vfloat vsubadd_vf_vf_vf(vfloat x, vfloat y) { return vadd_vf_vf_vf(x, vnegpos_vf_vf(y)); }
+static INLINE vfloat vmlsubadd_vf_vf_vf_vf(vfloat x, vfloat y, vfloat z) { return vsubadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z); }
+
+static INLINE vopmask veq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x == y); }
+static INLINE vopmask vneq_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x != y); }
+static INLINE vopmask vlt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x < y); }
+static INLINE vopmask vle_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x <= y); }
+static INLINE vopmask vgt_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x > y); }
+static INLINE vopmask vge_vo_vf_vf(vfloat x, vfloat y) { return (vopmask)(x >= y); }
+
+static INLINE vint2 vadd_vi2_vi2_vi2(vint2 x, vint2 y) { return x + y; }
+static INLINE vint2 vsub_vi2_vi2_vi2(vint2 x, vint2 y) { return x - y; }
+static INLINE vint2 vneg_vi2_vi2(vint2 e) { return -e; }
+
+static INLINE vint2 vand_vi2_vi2_vi2(vint2 x, vint2 y) { return x & y; }
+static INLINE vint2 vandnot_vi2_vi2_vi2(vint2 x, vint2 y) { return  y & ~x; }
+static INLINE vint2 vor_vi2_vi2_vi2(vint2 x, vint2 y) { return x | y; }
+static INLINE vint2 vxor_vi2_vi2_vi2(vint2 x, vint2 y) { return x ^ y; }
+
+static INLINE vint2 vand_vi2_vo_vi2(vopmask x, vint2 y) { return (vint2)x & y; }
+static INLINE vint2 vandnot_vi2_vo_vi2(vopmask x, vint2 y) { return y & ~(vint2)x; }
+
+static INLINE vint2 vsll_vi2_vi2_i(vint2 x, int c) {
+#if defined(__clang__)
+  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
+#else
+  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
+#endif
+  return (vint2)(((vu)x) << c);
+}
+static INLINE vint2 vsrl_vi2_vi2_i(vint2 x, int c) {
+#if defined(__clang__)
+  typedef uint32_t vu __attribute__((ext_vector_type(VECTLENDP*2)));
+#else
+  typedef uint32_t vu __attribute__((vector_size(sizeof(uint32_t)*VECTLENDP*2)));
+#endif
+  return (vint2)(((vu)x) >> c);
+}
+static INLINE vint2 vsra_vi2_vi2_i(vint2 x, int c) { return x >> c; }
+
+static INLINE vopmask veq_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x == y); }
+static INLINE vopmask vgt_vo_vi2_vi2(vint2 x, vint2 y) { return (vopmask)(x > y); }
+static INLINE vint2 veq_vi2_vi2_vi2(vint2 x, vint2 y) { return x == y; }
+static INLINE vint2 vgt_vi2_vi2_vi2(vint2 x, vint2 y) { return x > y; }
+
+static INLINE vopmask visinf_vo_vf(vfloat d) { return (vopmask)(vabs_vf_vf(d) == INFINITYf); }
+static INLINE vopmask vispinf_vo_vf(vfloat d) { return (vopmask)(d == INFINITYf); }
+static INLINE vopmask visminf_vo_vf(vfloat d) { return (vopmask)(d == -INFINITYf); }
+static INLINE vopmask visnan_vo_vf(vfloat d) { return (vopmask)(d != d); }
+
+static INLINE vfloat vsqrt_vf_vf(vfloat d) {
+  vfloat q = vcast_vf_f(1);
+
+  vopmask o = (vopmask)(d < 5.4210108624275221700372640043497e-20f); // 2^-64
+  d = (vfloat)((o & (vmask)(d * vcast_vf_f(18446744073709551616.0f))) | (~o & (vmask)d)); // 2^64
+  q = (vfloat)((o & (vmask)vcast_vf_f(0.00000000023283064365386962890625f)) | (~o & (vmask)vcast_vf_f(1))); // 2^-32
+  q = (vfloat)vor_vm_vm_vm(vlt_vo_vf_vf(d, vcast_vf_f(0)), (vmask)q);
+
+  vfloat x = (vfloat)(0x5f330de2 - (((vint2)d) >> 1));
+  x = x * ( 3.0f - d * x * x);
+  x = x * (12.0f - d * x * x);
+  x *= 0.0625f;
+  x = (d - (d * x) * (d * x)) * (x * 0.5) + d * x;
+
+  return x * q;
+}
+
+static INLINE vfloat vload_vf_p(const float *ptr) { return *(vfloat *)ptr; }
+static INLINE vfloat vloadu_vf_p(const float *ptr) {
+  vfloat vf;
+  for(int i=0;i<VECTLENSP;i++) vf[i] = ptr[i];
+  return vf;
+}
+
+static INLINE void vstore_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
+static INLINE void vstoreu_v_p_vf(float *ptr, vfloat v) {
+  for(int i=0;i<VECTLENSP;i++) ptr[i] = v[i];
+}
+static INLINE void vstream_v_p_vf(float *ptr, vfloat v) { *(vfloat *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) {
+  for(int i=0;i<VECTLENSP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vf(float *ptr, int offset, int step, vfloat v) { vscatter2_v_p_i_i_vf(ptr, offset, step, v); }
+
+//
+
+#ifdef ENABLE_LONGDOUBLE
+static INLINE vlongdouble vadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return x + y; }
+static INLINE vlongdouble vsub_vl_vl_vl(vlongdouble x, vlongdouble y) { return x - y; }
+static INLINE vlongdouble vmul_vl_vl_vl(vlongdouble x, vlongdouble y) { return x * y; }
+
+static INLINE vlongdouble vneg_vl_vl(vlongdouble d) { return -d; }
+static INLINE vlongdouble vsubadd_vl_vl_vl(vlongdouble x, vlongdouble y) { return vadd_vl_vl_vl(x, vnegpos_vl_vl(y)); }
+static INLINE vlongdouble vmlsubadd_vl_vl_vl_vl(vlongdouble x, vlongdouble y, vlongdouble z) { return vsubadd_vl_vl_vl(vmul_vl_vl_vl(x, y), z); }
+
+static INLINE vlongdouble vload_vl_p(const long double *ptr) { return *(vlongdouble *)ptr; }
+static INLINE vlongdouble vloadu_vl_p(const long double *ptr) {
+  vlongdouble vd;
+  for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
+  return vd;
+}
+
+static INLINE void vstore_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
+static INLINE void vstoreu_v_p_vl(long double *ptr, vlongdouble v) {
+  for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
+}
+static INLINE void vstream_v_p_vl(long double *ptr, vlongdouble v) { *(vlongdouble *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) {
+  for(int i=0;i<VECTLENDP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vl(long double *ptr, int offset, int step, vlongdouble v) { vscatter2_v_p_i_i_vl(ptr, offset, step, v); }
+#endif
+
+#ifdef Sleef_quad2_DEFINED
+static INLINE vquad vadd_vq_vq_vq(vquad x, vquad y) { return x + y; }
+static INLINE vquad vsub_vq_vq_vq(vquad x, vquad y) { return x - y; }
+static INLINE vquad vmul_vq_vq_vq(vquad x, vquad y) { return x * y; }
+
+static INLINE vquad vneg_vq_vq(vquad d) { return -d; }
+static INLINE vquad vsubadd_vq_vq_vq(vquad x, vquad y) { return vadd_vq_vq_vq(x, vnegpos_vq_vq(y)); }
+static INLINE vquad vmlsubadd_vq_vq_vq_vq(vquad x, vquad y, vquad z) { return vsubadd_vq_vq_vq(vmul_vq_vq_vq(x, y), z); }
+
+static INLINE vquad vload_vq_p(const Sleef_quad *ptr) { return *(vquad *)ptr; }
+static INLINE vquad vloadu_vq_p(const Sleef_quad *ptr) {
+  vquad vd;
+  for(int i=0;i<VECTLENDP;i++) vd[i] = ptr[i];
+  return vd;
+}
+
+static INLINE void vstore_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
+static INLINE void vstoreu_v_p_vq(Sleef_quad *ptr, vquad v) {
+  for(int i=0;i<VECTLENDP;i++) ptr[i] = v[i];
+}
+static INLINE void vstream_v_p_vq(Sleef_quad *ptr, vquad v) { *(vquad *)ptr = v; }
+
+static INLINE void vscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) {
+  for(int i=0;i<VECTLENDP/2;i++) {
+    *(ptr+(offset + step * i)*2 + 0) = v[i*2+0];
+    *(ptr+(offset + step * i)*2 + 1) = v[i*2+1];
+  }
+}
+
+static INLINE void vsscatter2_v_p_i_i_vq(Sleef_quad *ptr, int offset, int step, vquad v) { vscatter2_v_p_i_i_vq(ptr, offset, step, v); }
+#endif
+
+
+// TODO
+
+static INLINE int vall_lte64_i_vd_vd(vdouble x, vdouble lim) {
+  return 0;
+}
+
+static INLINE int vall_lte32_i_vf_vf(vfloat x, vfloat lim) {
+  return 0;
+}
+
+// TODO SELECT
diff --git a/lib/kernel/sleef/arch/misc.h b/lib/kernel/sleef/arch/misc.h
new file mode 100644
index 0000000..6d2a345
--- /dev/null
+++ b/lib/kernel/sleef/arch/misc.h
@@ -0,0 +1,258 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+//
+
+#ifndef __MISC_H__
+#define __MISC_H__
+
+#ifndef M_PI
+#define M_PI 3.141592653589793238462643383279502884
+#endif
+
+#ifndef M_PIl
+#define M_PIl 3.141592653589793238462643383279502884L
+#endif
+
+#ifndef M_1_PI
+#define M_1_PI 0.318309886183790671537767526745028724
+#endif
+
+#ifndef M_1_PIl
+#define M_1_PIl 0.318309886183790671537767526745028724L
+#endif
+
+#ifndef M_2_PI
+#define M_2_PI 0.636619772367581343075535053490057448
+#endif
+
+#ifndef M_2_PIl
+#define M_2_PIl 0.636619772367581343075535053490057448L
+#endif
+
+//
+
+/*
+  PI_A to PI_D are constants that satisfy the following two conditions.
+
+  * For PI_A, PI_B and PI_C, the last 28 bits are zero.
+  * PI_A + PI_B + PI_C + PI_D is close to PI as much as possible.
+
+  The argument of a trig function is multiplied by 1/PI, and the
+  integral part is divided into two parts, each has at most 28
+  bits. So, the maximum argument that could be correctly reduced
+  should be 2^(28*2-1) PI = 1.1e+17. However, due to internal
+  double precision calculation, the actual maximum argument that can
+  be correctly reduced is around 2^50 = 1.1e+15.
+ */
+
+#define PI_A 3.1415926218032836914
+#define PI_B 3.1786509424591713469e-08
+#define PI_C 1.2246467864107188502e-16
+#define PI_D 1.2736634327021899816e-24
+#define TRIGRANGEMAX 1e+15
+
+/*
+  PI_A2 and PI_B2 are constants that satisfy the following two conditions.
+
+  * The last 3 bits of PI_A2 are zero.
+  * PI_A2 + PI_B2 is close to PI as much as possible.
+
+  The argument of a trig function is multiplied by 1/PI, and the
+  integral part is multiplied by PI_A2. So, the maximum argument that
+  could be correctly reduced should be 2^(3-1) PI = 12.6. By testing,
+  we confirmed that it correctly reduces the argument up to around 15.
+ */
+
+#define PI_A2 3.141592653589793116
+#define PI_B2 1.2246467991473532072e-16
+#define TRIGRANGEMAX2 15
+
+#define M_2_PI_H 0.63661977236758138243
+#define M_2_PI_L -3.9357353350364971764e-17
+
+#define SQRT_DBL_MAX 1.3407807929942596355e+154
+
+#define TRIGRANGEMAX3 1e+9
+
+#define M_4_PI 1.273239544735162542821171882678754627704620361328125
+
+#define L2U .69314718055966295651160180568695068359375
+#define L2L .28235290563031577122588448175013436025525412068e-12
+#define R_LN2 1.442695040888963407359924681001892137426645954152985934135449406931
+
+#define L10U 0.30102999566383914498 // log 2 / log 10
+#define L10L 1.4205023227266099418e-13
+#define LOG10_2 3.3219280948873623478703194294893901758648313930
+
+#define L10Uf 0.3010253906f
+#define L10Lf 4.605038981e-06f
+
+//
+
+#define PI_Af 3.140625f
+#define PI_Bf 0.0009670257568359375f
+#define PI_Cf 6.2771141529083251953e-07f
+#define PI_Df 1.2154201256553420762e-10f
+#define PI_XDf 1.2141754268668591976e-10f
+#define PI_XEf 1.2446743939339977025e-13f
+#define TRIGRANGEMAXf 1e+7 // 39000
+
+#define PI_A2f 3.1414794921875f
+#define PI_B2f 0.00011315941810607910156f
+#define PI_C2f 1.9841872589410058936e-09f
+#define TRIGRANGEMAX2f 125.0f
+
+#define PI_A3f 3.14154052734375f
+#define PI_B3f 5.212612450122833252e-05f
+#define PI_C3f 1.2154188766544393729e-10f
+#define PI_D3f 1.2246402351402674302e-16f
+#define PI_E3f 6.5640073364868052239e-22f
+#define TRIGRANGEMAX3f 5e+9f
+
+#define TRIGRANGEMAX4f 8e+6f
+
+#define SQRT_FLT_MAX 18446743523953729536.0
+
+#define L2Uf 0.693145751953125f
+#define L2Lf 1.428606765330187045e-06f
+
+#define R_LN2f 1.442695040888963407359924681001892137426645954152985934135449406931f
+#define M_PIf ((float)M_PI)
+
+//
+
+#ifndef MIN
+#define MIN(x, y) ((x) < (y) ? (x) : (y))
+#endif
+
+#ifndef MAX
+#define MAX(x, y) ((x) > (y) ? (x) : (y))
+#endif
+
+#ifndef ABS
+#define ABS(x) ((x) < 0 ? -(x) : (x))
+#endif
+
+typedef long double longdouble;
+
+#ifndef Sleef_double2_DEFINED
+#define Sleef_double2_DEFINED
+typedef struct {
+  double x, y;
+} Sleef_double2;
+#endif
+
+#ifndef Sleef_float2_DEFINED
+#define Sleef_float2_DEFINED
+typedef struct {
+  float x, y;
+} Sleef_float2;
+#endif
+
+#ifndef Sleef_longdouble2_DEFINED
+#define Sleef_longdouble2_DEFINED
+typedef struct {
+  long double x, y;
+} Sleef_longdouble2;
+#endif
+
+#if defined(ENABLEFLOAT128) && !defined(Sleef_quad2_DEFINED)
+#define Sleef_quad2_DEFINED
+typedef __float128 Sleef_quad;
+typedef struct {
+  __float128 x, y;
+} Sleef_quad2;
+#endif
+
+//
+
+#if defined (__GNUC__) || defined (__clang__) || defined(__INTEL_COMPILER)
+
+#define INLINE __attribute__((always_inline))
+
+#ifndef __INTEL_COMPILER
+#define CONST const
+#else
+#define CONST __attribute__((const))
+#endif
+
+#if defined(__MINGW32__) || defined(__MINGW64__) || defined(__CYGWIN__)
+#define EXPORT __stdcall __declspec(dllexport)
+#else
+#define EXPORT
+#endif
+
+#ifdef INFINITY
+#undef INFINITY
+#endif
+
+#ifdef NAN
+#undef NAN
+#endif
+
+#define NAN __builtin_nan("")
+#define NANf __builtin_nanf("")
+#define NANl __builtin_nanl("")
+#define INFINITY __builtin_inf()
+#define INFINITYf __builtin_inff()
+#define INFINITYl __builtin_infl()
+
+#if defined(__INTEL_COMPILER)
+#define INFINITYq __builtin_inf()
+#define NANq __builtin_nan("")
+#else
+#define INFINITYq __builtin_infq()
+#define NANq (INFINITYq - INFINITYq)
+#endif
+
+#elif defined(_MSC_VER)
+
+#define INLINE __forceinline
+#define CONST
+#define EXPORT __declspec(dllexport)
+
+#if (defined(__GNUC__) || defined(__CLANG__)) && (defined(__i386__) || defined(__x86_64__))
+#include <x86intrin.h>
+#endif
+
+#define INFINITYf ((float)INFINITY)
+#define NANf ((float)NAN)
+#define INFINITYl ((long double)INFINITY)
+#define NANl ((long double)NAN)
+
+#if (defined(_M_AMD64) || defined(_M_X64))
+#ifndef __SSE2__
+#define __SSE2__
+#define __SSE3__
+#define __SSE4_1__
+#endif
+#elif _M_IX86_FP == 2
+#ifndef __SSE2__
+#define __SSE2__
+#define __SSE3__
+#define __SSE4_1__
+#endif
+#elif _M_IX86_FP == 1
+#ifndef __SSE__
+#define __SSE__
+#endif
+#endif
+
+static INLINE CONST int isinff(float x) { return x == INFINITYf || x == -INFINITYf; }
+static INLINE CONST int isinfl(long double x) { return x == INFINITYl || x == -INFINITYl; }
+static INLINE CONST int isnanf(float x) { return x != x; }
+static INLINE CONST int isnanl(long double x) { return x != x; }
+
+#endif // defined(_MSC_VER)
+
+#ifdef __APPLE__
+static INLINE CONST int isinff(float x) { return x == INFINITYf || x == -INFINITYf; }
+static INLINE CONST int isinfl(long double x) { return x == INFINITYl || x == -INFINITYl; }
+static INLINE CONST int isnanf(float x) { return x != x; }
+static INLINE CONST int isnanl(long double x) { return x != x; }
+#endif
+
+#endif // #ifndef __MISC_H__
diff --git a/lib/kernel/sleef/fma_test.c b/lib/kernel/sleef/fma_test.c
new file mode 100644
index 0000000..399be04
--- /dev/null
+++ b/lib/kernel/sleef/fma_test.c
@@ -0,0 +1,49 @@
+/************************/
+
+int main() {
+
+#if defined(PURE_C)
+
+  // glibc
+  return 0;
+
+#elif defined(VEC128)
+
+  #if defined(__aarch64__)
+    // __ARM_ARCH_ISA_A64
+    // ARM64 should always have FMA
+    return 0;
+  #elif defined(__ARM_NEON)
+    // TODO proper ARM detection
+    #error ARM32 FMA detection not implemented
+    return -1;
+  #elif defined(__AVX2__) || defined(__FMA4__)
+    return 0;
+  #else
+    #error FMA status unknown
+    return -1;
+  #endif
+
+#elif defined(VEC256)
+
+  #if defined(__AVX2__) || defined(__FMA4__)
+    return 0;
+  #else
+    #error FMA status unknown
+    return -1;
+  #endif
+
+#elif defined(VEC512)
+
+  #if defined(__AVX512F__)
+    return 0;
+  #else
+    #error FMA status unknown
+    return -1;
+  #endif
+
+#else
+  #error FMA status unknown
+  return -1;
+#endif
+}
diff --git a/lib/kernel/sleef/include/sleef.h b/lib/kernel/sleef/include/sleef.h
new file mode 100644
index 0000000..d09a58e
--- /dev/null
+++ b/lib/kernel/sleef/include/sleef.h
@@ -0,0 +1,890 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#ifdef __SLEEF_CL_H__
+#error You must include sleef_cl.h AFTER sleef.h
+#endif
+
+#ifndef __SLEEF_H__
+#define __SLEEF_H__
+
+#include <stddef.h>
+#include <stdint.h>
+
+#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER)
+#define CONST const
+#else
+#define CONST
+#endif
+
+#if (defined(__GNUC__) || defined(__CLANG__))                                 \
+    && (defined(__i386__) || defined(__x86_64__))
+#include <x86intrin.h>
+#endif
+
+#if (defined(_MSC_VER))
+#include <intrin.h>
+#endif
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+/* Function/type attributes supported by Clang/SPIR */
+#if __has_attribute(__always_inline__)
+#define _CL_ALWAYSINLINE __attribute__ ((__always_inline__))
+#else
+#define _CL_ALWAYSINLINE
+#endif
+#if __has_attribute(__noinline__)
+#define _CL_NOINLINE __attribute__ ((__noinline__))
+#else
+#define _CL_NOINLINE
+#endif
+#if __has_attribute(__overloadable__)
+#define _CL_OVERLOADABLE __attribute__ ((__overloadable__))
+#else
+#define _CL_OVERLOADABLE
+#endif
+#if __has_attribute(__const__)
+#define _CL_READNONE __attribute__ ((__const__))
+#else
+#define _CL_READNONE
+#endif
+#if __has_attribute(__pure__)
+#define _CL_READONLY __attribute__ ((__pure__))
+#else
+#define _CL_READONLY
+#endif
+#if __has_attribute(__unavailable__)
+#define _CL_UNAVAILABLE __attribute__ ((__unavailable__))
+#else
+#define _CL_UNAVAILABLE
+#endif
+
+#ifndef Sleef_double2_DEFINED
+#define Sleef_double2_DEFINED
+typedef struct
+{
+  double x, y;
+} Sleef_double2;
+#endif
+
+#ifndef Sleef_float2_DEFINED
+#define Sleef_float2_DEFINED
+typedef struct
+{
+  float x, y;
+} Sleef_float2;
+#endif
+
+double Sleef_sin_u35 (double);
+double Sleef_cos_u35 (double);
+Sleef_double2 Sleef_sincos_u35 (double);
+double Sleef_tan_u35 (double);
+double Sleef_asin_u35 (double);
+double Sleef_acos_u35 (double);
+double Sleef_atan_u35 (double);
+double Sleef_atan2_u35 (double, double);
+double Sleef_log_u35 (double);
+double Sleef_cbrt_u35 (double);
+double Sleef_sin_u10 (double);
+double Sleef_cos_u10 (double);
+Sleef_double2 Sleef_sincos_u10 (double);
+double Sleef_tan_u10 (double);
+double Sleef_asin_u10 (double);
+double Sleef_acos_u10 (double);
+double Sleef_atan_u10 (double);
+double Sleef_atan2_u10 (double, double);
+double Sleef_log_u10 (double);
+double Sleef_cbrt_u10 (double);
+double Sleef_exp_u10 (double);
+double Sleef_pow_u10 (double, double);
+double Sleef_sinh_u10 (double);
+double Sleef_cosh_u10 (double);
+double Sleef_tanh_u10 (double);
+double Sleef_asinh_u10 (double);
+double Sleef_acosh_u10 (double);
+double Sleef_atanh_u10 (double);
+double Sleef_exp2_u10 (double);
+double Sleef_exp10_u10 (double);
+double Sleef_expm1_u10 (double);
+double Sleef_log10_u10 (double);
+double Sleef_log1p_u10 (double);
+Sleef_double2 Sleef_sincospi_u05 (double);
+Sleef_double2 Sleef_sincospi_u35 (double);
+double Sleef_sinpi_u05 (double);
+double Sleef_cospi_u05 (double);
+double Sleef_ldexp (double, int);
+int Sleef_ilogb (double);
+double Sleef_fma (double, double, double);
+double Sleef_sqrt_u05 (double);
+double Sleef_hypot_u05 (double, double);
+double Sleef_hypot_u35 (double, double);
+double Sleef_fabs (double);
+double Sleef_copysign (double, double);
+double Sleef_fmax (double, double);
+double Sleef_fmin (double, double);
+double Sleef_fdim (double, double);
+double Sleef_trunc (double);
+double Sleef_floor (double);
+double Sleef_ceil (double);
+double Sleef_round (double);
+double Sleef_rint (double);
+double Sleef_nextafter (double, double);
+double Sleef_frfrexp (double);
+int Sleef_expfrexp (double);
+double Sleef_fmod (double, double);
+Sleef_double2 Sleef_modf (double);
+double Sleef_lgamma_u10 (double);
+Sleef_double2 Sleef_lgamma_r_u10 (double);
+double Sleef_tgamma_u10 (double);
+double Sleef_erf_u10 (double);
+double Sleef_erfc_u15 (double);
+
+float Sleef_sinf_u35 (float);
+float Sleef_cosf_u35 (float);
+Sleef_float2 Sleef_sincosf_u35 (float);
+float Sleef_tanf_u35 (float);
+float Sleef_asinf_u35 (float);
+float Sleef_acosf_u35 (float);
+float Sleef_atanf_u35 (float);
+float Sleef_atan2f_u35 (float, float);
+float Sleef_logf_u35 (float);
+float Sleef_cbrtf_u35 (float);
+float Sleef_sinf_u10 (float);
+float Sleef_cosf_u10 (float);
+Sleef_float2 Sleef_sincosf_u10 (float);
+float Sleef_tanf_u10 (float);
+float Sleef_asinf_u10 (float);
+float Sleef_acosf_u10 (float);
+float Sleef_atanf_u10 (float);
+float Sleef_atan2f_u10 (float, float);
+float Sleef_logf_u10 (float);
+float Sleef_cbrtf_u10 (float);
+float Sleef_expf_u10 (float);
+float Sleef_powf_u10 (float, float);
+float Sleef_sinhf_u10 (float);
+float Sleef_coshf_u10 (float);
+float Sleef_tanhf_u10 (float);
+float Sleef_asinhf_u10 (float);
+float Sleef_acoshf_u10 (float);
+float Sleef_atanhf_u10 (float);
+float Sleef_exp2f_u10 (float);
+float Sleef_exp10f_u10 (float);
+float Sleef_expm1f_u10 (float);
+float Sleef_log10f_u10 (float);
+float Sleef_log1pf_u10 (float);
+Sleef_float2 Sleef_sincospif_u05 (float);
+Sleef_float2 Sleef_sincospif_u35 (float);
+float Sleef_sinpif_u05 (float d);
+float Sleef_cospif_u05 (float d);
+float Sleef_ldexpf (float, int);
+int Sleef_ilogbf (float);
+float Sleef_fmaf (float, float, float);
+float Sleef_sqrtf_u05 (float);
+float Sleef_sqrtf_u35 (float);
+float Sleef_hypotf_u05 (float, float);
+float Sleef_hypotf_u35 (float, float);
+float Sleef_fabsf (float);
+float Sleef_copysignf (float, float);
+float Sleef_fmaxf (float, float);
+float Sleef_fminf (float, float);
+float Sleef_fdimf (float, float);
+float Sleef_truncf (float);
+float Sleef_floorf (float);
+float Sleef_ceilf (float);
+float Sleef_roundf (float);
+float Sleef_rintf (float);
+float Sleef_nextafterf (float, float);
+float Sleef_frfrexpf (float);
+int Sleef_expfrexpf (float);
+float Sleef_fmodf (float, float);
+Sleef_float2 Sleef_modff (float);
+float Sleef_lgammaf_u10 (float);
+Sleef_float2 Sleef_lgamma_rf_u10 (float);
+float Sleef_tgammaf_u10 (float);
+float Sleef_erff_u10 (float);
+float Sleef_erfcf_u15 (float);
+
+double Sleef_pown_u10 (double, int);
+float Sleef_pownf_u10 (float, int);
+double Sleef_powr_u10 (double, double);
+float Sleef_powrf_u10 (float, float);
+
+#ifdef __AVX512F__
+
+#define SLEEF_VEC_512_AVAILABLE
+
+typedef __m512 reg512f;
+typedef __m512d reg512d;
+typedef __m512i reg512i;
+
+#ifndef Sleef___m512d_2_DEFINED
+typedef struct
+{
+  __m512d x, y;
+} Sleef___m512d_2;
+#define Sleef___m512d_2_DEFINED
+#endif
+typedef Sleef___m512d_2 Sleef_reg512d_2;
+
+__m512d Sleef_sind8_u35_intrin (__m512d);
+__m512d Sleef_cosd8_u35_intrin (__m512d);
+Sleef___m512d_2 Sleef_sincosd8_u35_intrin (__m512d);
+__m512d Sleef_tand8_u35_intrin (__m512d);
+__m512d Sleef_asind8_u35_intrin (__m512d);
+__m512d Sleef_acosd8_u35_intrin (__m512d);
+__m512d Sleef_atand8_u35_intrin (__m512d);
+__m512d Sleef_atan2d8_u35_intrin (__m512d, __m512d);
+__m512d Sleef_logd8_u35_intrin (__m512d);
+__m512d Sleef_cbrtd8_u35_intrin (__m512d);
+__m512d Sleef_sind8_u10_intrin (__m512d);
+__m512d Sleef_cosd8_u10_intrin (__m512d);
+Sleef___m512d_2 Sleef_sincosd8_u10_intrin (__m512d);
+__m512d Sleef_tand8_u10_intrin (__m512d);
+__m512d Sleef_asind8_u10_intrin (__m512d);
+__m512d Sleef_acosd8_u10_intrin (__m512d);
+__m512d Sleef_atand8_u10_intrin (__m512d);
+__m512d Sleef_atan2d8_u10_intrin (__m512d, __m512d);
+__m512d Sleef_logd8_u10_intrin (__m512d);
+__m512d Sleef_cbrtd8_u10_intrin (__m512d);
+__m512d Sleef_expd8_u10_intrin (__m512d);
+__m512d Sleef_powd8_u10_intrin (__m512d, __m512d);
+__m512d Sleef_sinhd8_u10_intrin (__m512d);
+__m512d Sleef_coshd8_u10_intrin (__m512d);
+__m512d Sleef_tanhd8_u10_intrin (__m512d);
+__m512d Sleef_asinhd8_u10_intrin (__m512d);
+__m512d Sleef_acoshd8_u10_intrin (__m512d);
+__m512d Sleef_atanhd8_u10_intrin (__m512d);
+__m512d Sleef_exp2d8_u10_intrin (__m512d);
+__m512d Sleef_exp10d8_u10_intrin (__m512d);
+__m512d Sleef_expm1d8_u10_intrin (__m512d);
+__m512d Sleef_log10d8_u10_intrin (__m512d);
+__m512d Sleef_log1pd8_u10_intrin (__m512d);
+Sleef___m512d_2 Sleef_sincospid8_u05_intrin (__m512d);
+Sleef___m512d_2 Sleef_sincospid8_u35_intrin (__m512d);
+__m512d Sleef_sinpid8_u05_intrin (__m512d);
+__m512d Sleef_cospid8_u05_intrin (__m512d);
+__m512d Sleef_ldexpd8_intrin (__m512d, __m256i);
+__m256i Sleef_ilogbd8_intrin (__m512d);
+__m512d Sleef_fmad8_intrin (__m512d, __m512d, __m512d);
+__m512d Sleef_sqrtd8_u05_intrin (__m512d);
+__m512d Sleef_sqrtd8_u35_intrin (__m512d);
+__m512d Sleef_hypotd8_u05_intrin (__m512d, __m512d);
+__m512d Sleef_hypotd8_u35_intrin (__m512d, __m512d);
+__m512d Sleef_fabsd8_intrin (__m512d);
+__m512d Sleef_copysignd8_intrin (__m512d, __m512d);
+__m512d Sleef_fmaxd8_intrin (__m512d, __m512d);
+__m512d Sleef_fmind8_intrin (__m512d, __m512d);
+__m512d Sleef_fdimd8_intrin (__m512d, __m512d);
+__m512d Sleef_truncd8_intrin (__m512d);
+__m512d Sleef_floord8_intrin (__m512d);
+__m512d Sleef_ceild8_intrin (__m512d);
+__m512d Sleef_roundd8_intrin (__m512d);
+__m512d Sleef_rintd8_intrin (__m512d);
+__m512d Sleef_nextafterd8_intrin (__m512d, __m512d);
+__m512d Sleef_frfrexpd8_intrin (__m512d);
+__m512i Sleef_expfrexpd8_intrin (__m512d);
+__m512d Sleef_fmodd8_intrin (__m512d, __m512d);
+Sleef___m512d_2 Sleef_modfd8_intrin (__m512d);
+__m512d Sleef_lgammad8_u10_intrin (__m512d);
+Sleef___m512d_2 Sleef_lgamma_rd8_u10_intrin (__m512d);
+__m512d Sleef_tgammad8_u10_intrin (__m512d);
+__m512d Sleef_erfd8_u10_intrin (__m512d);
+__m512d Sleef_erfcd8_u15_intrin (__m512d);
+
+#ifndef Sleef___m512_2_DEFINED
+typedef struct
+{
+  __m512 x, y;
+} Sleef___m512_2;
+#define Sleef___m512_2_DEFINED
+#endif
+typedef Sleef___m512_2 Sleef_reg512f_2;
+
+__m512 Sleef_sinf16_u35_intrin (__m512);
+__m512 Sleef_cosf16_u35_intrin (__m512);
+Sleef___m512_2 Sleef_sincosf16_u35_intrin (__m512);
+__m512 Sleef_tanf16_u35_intrin (__m512);
+__m512 Sleef_asinf16_u35_intrin (__m512);
+__m512 Sleef_acosf16_u35_intrin (__m512);
+__m512 Sleef_atanf16_u35_intrin (__m512);
+__m512 Sleef_atan2f16_u35_intrin (__m512, __m512);
+__m512 Sleef_logf16_u35_intrin (__m512);
+__m512 Sleef_cbrtf16_u35_intrin (__m512);
+__m512 Sleef_sinf16_u10_intrin (__m512);
+__m512 Sleef_cosf16_u10_intrin (__m512);
+Sleef___m512_2 Sleef_sincosf16_u10_intrin (__m512);
+__m512 Sleef_tanf16_u10_intrin (__m512);
+__m512 Sleef_asinf16_u10_intrin (__m512);
+__m512 Sleef_acosf16_u10_intrin (__m512);
+__m512 Sleef_atanf16_u10_intrin (__m512);
+__m512 Sleef_atan2f16_u10_intrin (__m512, __m512);
+__m512 Sleef_logf16_u10_intrin (__m512);
+__m512 Sleef_cbrtf16_u10_intrin (__m512);
+__m512 Sleef_expf16_u10_intrin (__m512);
+__m512 Sleef_powf16_u10_intrin (__m512, __m512);
+__m512 Sleef_sinhf16_u10_intrin (__m512);
+__m512 Sleef_coshf16_u10_intrin (__m512);
+__m512 Sleef_tanhf16_u10_intrin (__m512);
+__m512 Sleef_asinhf16_u10_intrin (__m512);
+__m512 Sleef_acoshf16_u10_intrin (__m512);
+__m512 Sleef_atanhf16_u10_intrin (__m512);
+__m512 Sleef_exp2f16_u10_intrin (__m512);
+__m512 Sleef_exp10f16_u10_intrin (__m512);
+__m512 Sleef_expm1f16_u10_intrin (__m512);
+__m512 Sleef_log10f16_u10_intrin (__m512);
+__m512 Sleef_log1pf16_u10_intrin (__m512);
+Sleef___m512_2 Sleef_sincospif16_u05_intrin (__m512);
+Sleef___m512_2 Sleef_sincospif16_u35_intrin (__m512);
+__m512 Sleef_sinpif16_u05_intrin (__m512);
+__m512 Sleef_cospif16_u05_intrin (__m512);
+__m512 Sleef_ldexpf16_intrin (__m512, __m512i);
+__m512i Sleef_ilogbf16_intrin (__m512);
+__m512 Sleef_fmaf16_intrin (__m512, __m512, __m512);
+__m512 Sleef_sqrtf16_u05_intrin (__m512);
+__m512 Sleef_sqrtf16_u35_intrin (__m512);
+__m512 Sleef_hypotf16_u05_intrin (__m512, __m512);
+__m512 Sleef_hypotf16_u35_intrin (__m512, __m512);
+__m512 Sleef_fabsf16_intrin (__m512);
+__m512 Sleef_copysignf16_intrin (__m512, __m512);
+__m512 Sleef_fmaxf16_intrin (__m512, __m512);
+__m512 Sleef_fminf16_intrin (__m512, __m512);
+__m512 Sleef_fdimf16_intrin (__m512, __m512);
+__m512 Sleef_truncf16_intrin (__m512);
+__m512 Sleef_floorf16_intrin (__m512);
+__m512 Sleef_ceilf16_intrin (__m512);
+__m512 Sleef_roundf16_intrin (__m512);
+__m512 Sleef_rintf16_intrin (__m512);
+__m512 Sleef_nextafterf16_intrin (__m512, __m512);
+__m512 Sleef_frfrexpf16_intrin (__m512);
+__m512i Sleef_expfrexpf16_intrin (__m512);
+__m512 Sleef_fmodf16_intrin (__m512, __m512);
+Sleef___m512_2 Sleef_modff16_intrin (__m512);
+__m512 Sleef_lgammaf16_u10_intrin (__m512);
+Sleef___m512_2 Sleef_lgamma_rf16_u10_intrin (__m512);
+__m512 Sleef_tgammaf16_u10_intrin (__m512);
+__m512 Sleef_erff16_u10_intrin (__m512);
+__m512 Sleef_erfcf16_u15_intrin (__m512);
+
+__m512d Sleef_pownd8_u10_intrin (__m512d, __m256i);
+__m512 Sleef_pownf16_u10_intrin (__m512, __m512i);
+__m512d Sleef_powrd8_u10_intrin (__m512d, __m512d);
+__m512 Sleef_powrf16_u10_intrin (__m512, __m512);
+
+#endif
+
+#if defined(__AVX2__) || defined(__AVX__)
+
+#define SLEEF_VEC_256_AVAILABLE
+
+/*
+#ifndef __AVX2__
+
+typedef struct
+{
+  __m128i x, y;
+} __m256i;
+
+#endif
+*/
+
+typedef __m256 reg256f;
+typedef __m256d reg256d;
+typedef __m256i reg256i;
+
+#ifndef Sleef___m256d_2_DEFINED
+typedef struct
+{
+  __m256d x, y;
+} Sleef___m256d_2;
+#define Sleef___m256d_2_DEFINED
+#endif
+typedef Sleef___m256d_2 Sleef_reg256d_2;
+
+__m256d Sleef_sind4_u35_intrin (__m256d);
+__m256d Sleef_cosd4_u35_intrin (__m256d);
+Sleef___m256d_2 Sleef_sincosd4_u35_intrin (__m256d);
+__m256d Sleef_tand4_u35_intrin (__m256d);
+__m256d Sleef_asind4_u35_intrin (__m256d);
+__m256d Sleef_acosd4_u35_intrin (__m256d);
+__m256d Sleef_atand4_u35_intrin (__m256d);
+__m256d Sleef_atan2d4_u35_intrin (__m256d, __m256d);
+__m256d Sleef_logd4_u35_intrin (__m256d);
+__m256d Sleef_cbrtd4_u35_intrin (__m256d);
+__m256d Sleef_sind4_u10_intrin (__m256d);
+__m256d Sleef_cosd4_u10_intrin (__m256d);
+Sleef___m256d_2 Sleef_sincosd4_u10_intrin (__m256d);
+__m256d Sleef_tand4_u10_intrin (__m256d);
+__m256d Sleef_asind4_u10_intrin (__m256d);
+__m256d Sleef_acosd4_u10_intrin (__m256d);
+__m256d Sleef_atand4_u10_intrin (__m256d);
+__m256d Sleef_atan2d4_u10_intrin (__m256d, __m256d);
+__m256d Sleef_logd4_u10_intrin (__m256d);
+__m256d Sleef_cbrtd4_u10_intrin (__m256d);
+__m256d Sleef_expd4_u10_intrin (__m256d);
+__m256d Sleef_powd4_u10_intrin (__m256d, __m256d);
+__m256d Sleef_sinhd4_u10_intrin (__m256d);
+__m256d Sleef_coshd4_u10_intrin (__m256d);
+__m256d Sleef_tanhd4_u10_intrin (__m256d);
+__m256d Sleef_asinhd4_u10_intrin (__m256d);
+__m256d Sleef_acoshd4_u10_intrin (__m256d);
+__m256d Sleef_atanhd4_u10_intrin (__m256d);
+__m256d Sleef_exp2d4_u10_intrin (__m256d);
+__m256d Sleef_exp10d4_u10_intrin (__m256d);
+__m256d Sleef_expm1d4_u10_intrin (__m256d);
+__m256d Sleef_log10d4_u10_intrin (__m256d);
+__m256d Sleef_log1pd4_u10_intrin (__m256d);
+Sleef___m256d_2 Sleef_sincospid4_u05_intrin (__m256d);
+Sleef___m256d_2 Sleef_sincospid4_u35_intrin (__m256d);
+__m256d Sleef_sinpid4_u05_intrin (__m256d);
+__m256d Sleef_cospid4_u05_intrin (__m256d);
+__m256d Sleef_ldexpd4_intrin (__m256d, __m128i);
+__m128i Sleef_ilogbd4_intrin (__m256d);
+__m256d Sleef_fmad4_intrin (__m256d, __m256d, __m256d);
+__m256d Sleef_sqrtd4_u05_intrin (__m256d);
+__m256d Sleef_sqrtd4_u35_intrin (__m256d);
+__m256d Sleef_hypotd4_u05_intrin (__m256d, __m256d);
+__m256d Sleef_hypotd4_u35_intrin (__m256d, __m256d);
+__m256d Sleef_fabsd4_intrin (__m256d);
+__m256d Sleef_copysignd4_intrin (__m256d, __m256d);
+__m256d Sleef_fmaxd4_intrin (__m256d, __m256d);
+__m256d Sleef_fmind4_intrin (__m256d, __m256d);
+__m256d Sleef_fdimd4_intrin (__m256d, __m256d);
+__m256d Sleef_truncd4_intrin (__m256d);
+__m256d Sleef_floord4_intrin (__m256d);
+__m256d Sleef_ceild4_intrin (__m256d);
+__m256d Sleef_roundd4_intrin (__m256d);
+__m256d Sleef_rintd4_intrin (__m256d);
+__m256d Sleef_nextafterd4_intrin (__m256d, __m256d);
+__m256d Sleef_frfrexpd4_intrin (__m256d);
+__m256i Sleef_expfrexpd4_intrin (__m256d);
+__m256d Sleef_fmodd4_intrin (__m256d, __m256d);
+Sleef___m256d_2 Sleef_modfd4_intrin (__m256d);
+__m256d Sleef_lgammad4_u10_intrin (__m256d);
+Sleef___m256d_2 Sleef_lgamma_rd4_u10_intrin (__m256d);
+__m256d Sleef_tgammad4_u10_intrin (__m256d);
+__m256d Sleef_erfd4_u10_intrin (__m256d);
+__m256d Sleef_erfcd4_u15_intrin (__m256d);
+
+#ifndef Sleef___m256_2_DEFINED
+typedef struct
+{
+  __m256 x, y;
+} Sleef___m256_2;
+#define Sleef___m256_2_DEFINED
+#endif
+typedef Sleef___m256_2 Sleef_reg256f_2;
+
+__m256 Sleef_sinf8_u35_intrin (__m256);
+__m256 Sleef_cosf8_u35_intrin (__m256);
+Sleef___m256_2 Sleef_sincosf8_u35_intrin (__m256);
+__m256 Sleef_tanf8_u35_intrin (__m256);
+__m256 Sleef_asinf8_u35_intrin (__m256);
+__m256 Sleef_acosf8_u35_intrin (__m256);
+__m256 Sleef_atanf8_u35_intrin (__m256);
+__m256 Sleef_atan2f8_u35_intrin (__m256, __m256);
+__m256 Sleef_logf8_u35_intrin (__m256);
+__m256 Sleef_cbrtf8_u35_intrin (__m256);
+__m256 Sleef_sinf8_u10_intrin (__m256);
+__m256 Sleef_cosf8_u10_intrin (__m256);
+Sleef___m256_2 Sleef_sincosf8_u10_intrin (__m256);
+__m256 Sleef_tanf8_u10_intrin (__m256);
+__m256 Sleef_asinf8_u10_intrin (__m256);
+__m256 Sleef_acosf8_u10_intrin (__m256);
+__m256 Sleef_atanf8_u10_intrin (__m256);
+__m256 Sleef_atan2f8_u10_intrin (__m256, __m256);
+__m256 Sleef_logf8_u10_intrin (__m256);
+__m256 Sleef_cbrtf8_u10_intrin (__m256);
+__m256 Sleef_expf8_u10_intrin (__m256);
+__m256 Sleef_powf8_u10_intrin (__m256, __m256);
+__m256 Sleef_sinhf8_u10_intrin (__m256);
+__m256 Sleef_coshf8_u10_intrin (__m256);
+__m256 Sleef_tanhf8_u10_intrin (__m256);
+__m256 Sleef_asinhf8_u10_intrin (__m256);
+__m256 Sleef_acoshf8_u10_intrin (__m256);
+__m256 Sleef_atanhf8_u10_intrin (__m256);
+__m256 Sleef_exp2f8_u10_intrin (__m256);
+__m256 Sleef_exp10f8_u10_intrin (__m256);
+__m256 Sleef_expm1f8_u10_intrin (__m256);
+__m256 Sleef_log10f8_u10_intrin (__m256);
+__m256 Sleef_log1pf8_u10_intrin (__m256);
+Sleef___m256_2 Sleef_sincospif8_u05_intrin (__m256);
+Sleef___m256_2 Sleef_sincospif8_u35_intrin (__m256);
+__m256 Sleef_sinpif8_u05_intrin (__m256);
+__m256 Sleef_cospif8_u05_intrin (__m256);
+__m256 Sleef_ldexpf8_intrin (__m256, __m256i);
+__m256i Sleef_ilogbf8_intrin (__m256);
+__m256 Sleef_fmaf8_intrin (__m256, __m256, __m256);
+__m256 Sleef_sqrtf8_u05_intrin (__m256);
+__m256 Sleef_sqrtf8_u35_intrin (__m256);
+__m256 Sleef_hypotf8_u05_intrin (__m256, __m256);
+__m256 Sleef_hypotf8_u35_intrin (__m256, __m256);
+__m256 Sleef_fabsf8_intrin (__m256);
+__m256 Sleef_copysignf8_intrin (__m256, __m256);
+__m256 Sleef_fmaxf8_intrin (__m256, __m256);
+__m256 Sleef_fminf8_intrin (__m256, __m256);
+__m256 Sleef_fdimf8_intrin (__m256, __m256);
+__m256 Sleef_truncf8_intrin (__m256);
+__m256 Sleef_floorf8_intrin (__m256);
+__m256 Sleef_ceilf8_intrin (__m256);
+__m256 Sleef_roundf8_intrin (__m256);
+__m256 Sleef_rintf8_intrin (__m256);
+__m256 Sleef_nextafterf8_intrin (__m256, __m256);
+__m256 Sleef_frfrexpf8_intrin (__m256);
+__m256i Sleef_expfrexpf8_intrin (__m256);
+__m256 Sleef_fmodf8_intrin (__m256, __m256);
+Sleef___m256_2 Sleef_modff8_intrin (__m256);
+__m256 Sleef_lgammaf8_u10_intrin (__m256);
+Sleef___m256_2 Sleef_lgamma_rf8_u10_intrin (__m256);
+__m256 Sleef_tgammaf8_u10_intrin (__m256);
+__m256 Sleef_erff8_u10_intrin (__m256);
+__m256 Sleef_erfcf8_u15_intrin (__m256);
+
+__m256d Sleef_pownd4_u10_intrin (__m256d, __m128i);
+__m256 Sleef_pownf8_u10_intrin (__m256, __m256i);
+__m256d Sleef_powrd4_u10_intrin (__m256d, __m256d);
+__m256 Sleef_powrf8_u10_intrin (__m256, __m256);
+
+#endif
+
+#if defined(__SSE2__)
+
+#define SLEEF_VEC_128_AVAILABLE
+
+typedef __m128 reg128f;
+typedef __m128d reg128d;
+typedef __m128i reg128i;
+
+#ifndef Sleef___m128d_2_DEFINED
+typedef struct
+{
+  __m128d x, y;
+} Sleef___m128d_2;
+#define Sleef___m128d_2_DEFINED
+#endif
+typedef Sleef___m128d_2 Sleef_reg128d_2;
+
+__m128d Sleef_sind2_u35_intrin (__m128d);
+__m128d Sleef_cosd2_u35_intrin (__m128d);
+Sleef___m128d_2 Sleef_sincosd2_u35_intrin (__m128d);
+__m128d Sleef_tand2_u35_intrin (__m128d);
+__m128d Sleef_asind2_u35_intrin (__m128d);
+__m128d Sleef_acosd2_u35_intrin (__m128d);
+__m128d Sleef_atand2_u35_intrin (__m128d);
+__m128d Sleef_atan2d2_u35_intrin (__m128d, __m128d);
+__m128d Sleef_logd2_u35_intrin (__m128d);
+__m128d Sleef_cbrtd2_u35_intrin (__m128d);
+__m128d Sleef_sind2_u10_intrin (__m128d);
+__m128d Sleef_cosd2_u10_intrin (__m128d);
+Sleef___m128d_2 Sleef_sincosd2_u10_intrin (__m128d);
+__m128d Sleef_tand2_u10_intrin (__m128d);
+__m128d Sleef_asind2_u10_intrin (__m128d);
+__m128d Sleef_acosd2_u10_intrin (__m128d);
+__m128d Sleef_atand2_u10_intrin (__m128d);
+__m128d Sleef_atan2d2_u10_intrin (__m128d, __m128d);
+__m128d Sleef_logd2_u10_intrin (__m128d);
+__m128d Sleef_cbrtd2_u10_intrin (__m128d);
+__m128d Sleef_expd2_u10_intrin (__m128d);
+__m128d Sleef_powd2_u10_intrin (__m128d, __m128d);
+__m128d Sleef_sinhd2_u10_intrin (__m128d);
+__m128d Sleef_coshd2_u10_intrin (__m128d);
+__m128d Sleef_tanhd2_u10_intrin (__m128d);
+__m128d Sleef_asinhd2_u10_intrin (__m128d);
+__m128d Sleef_acoshd2_u10_intrin (__m128d);
+__m128d Sleef_atanhd2_u10_intrin (__m128d);
+__m128d Sleef_exp2d2_u10_intrin (__m128d);
+__m128d Sleef_exp10d2_u10_intrin (__m128d);
+__m128d Sleef_expm1d2_u10_intrin (__m128d);
+__m128d Sleef_log10d2_u10_intrin (__m128d);
+__m128d Sleef_log1pd2_u10_intrin (__m128d);
+Sleef___m128d_2 Sleef_sincospid2_u05_intrin (__m128d);
+Sleef___m128d_2 Sleef_sincospid2_u35_intrin (__m128d);
+__m128d Sleef_sinpid2_u05_intrin (__m128d);
+__m128d Sleef_cospid2_u05_intrin (__m128d);
+__m128d Sleef_ldexpd2_intrin (__m128d, __m128i);
+__m128i Sleef_ilogbd2_intrin (__m128d);
+__m128d Sleef_fmad2_intrin (__m128d, __m128d, __m128d);
+__m128d Sleef_sqrtd2_u05_intrin (__m128d);
+__m128d Sleef_sqrtd2_u35_intrin (__m128d);
+__m128d Sleef_hypotd2_u05_intrin (__m128d, __m128d);
+__m128d Sleef_hypotd2_u35_intrin (__m128d, __m128d);
+__m128d Sleef_fabsd2_intrin (__m128d);
+__m128d Sleef_copysignd2_intrin (__m128d, __m128d);
+__m128d Sleef_fmaxd2_intrin (__m128d, __m128d);
+__m128d Sleef_fmind2_intrin (__m128d, __m128d);
+__m128d Sleef_fdimd2_intrin (__m128d, __m128d);
+__m128d Sleef_truncd2_intrin (__m128d);
+__m128d Sleef_floord2_intrin (__m128d);
+__m128d Sleef_ceild2_intrin (__m128d);
+__m128d Sleef_roundd2_intrin (__m128d);
+__m128d Sleef_rintd2_intrin (__m128d);
+__m128d Sleef_nextafterd2_intrin (__m128d, __m128d);
+__m128d Sleef_frfrexpd2_intrin (__m128d);
+__m128i Sleef_expfrexpd2_intrin (__m128d);
+__m128d Sleef_fmodd2_intrin (__m128d, __m128d);
+Sleef___m128d_2 Sleef_modfd2_intrin (__m128d);
+__m128d Sleef_lgammad2_u10_intrin (__m128d);
+Sleef___m128d_2 Sleef_lgamma_rd2_u10_intrin (__m128d);
+__m128d Sleef_tgammad2_u10_intrin (__m128d);
+__m128d Sleef_erfd2_u10_intrin (__m128d);
+__m128d Sleef_erfcd2_u15_intrin (__m128d);
+
+#ifndef Sleef___m128_2_DEFINED
+typedef struct
+{
+  __m128 x, y;
+} Sleef___m128_2;
+#define Sleef___m128_2_DEFINED
+#endif
+typedef Sleef___m128_2 Sleef_reg128f_2;
+
+__m128 Sleef_sinf4_u35_intrin (__m128);
+__m128 Sleef_cosf4_u35_intrin (__m128);
+Sleef___m128_2 Sleef_sincosf4_u35_intrin (__m128);
+__m128 Sleef_tanf4_u35_intrin (__m128);
+__m128 Sleef_asinf4_u35_intrin (__m128);
+__m128 Sleef_acosf4_u35_intrin (__m128);
+__m128 Sleef_atanf4_u35_intrin (__m128);
+__m128 Sleef_atan2f4_u35_intrin (__m128, __m128);
+__m128 Sleef_logf4_u35_intrin (__m128);
+__m128 Sleef_cbrtf4_u35_intrin (__m128);
+__m128 Sleef_sinf4_u10_intrin (__m128);
+__m128 Sleef_cosf4_u10_intrin (__m128);
+Sleef___m128_2 Sleef_sincosf4_u10_intrin (__m128);
+__m128 Sleef_tanf4_u10_intrin (__m128);
+__m128 Sleef_asinf4_u10_intrin (__m128);
+__m128 Sleef_acosf4_u10_intrin (__m128);
+__m128 Sleef_atanf4_u10_intrin (__m128);
+__m128 Sleef_atan2f4_u10_intrin (__m128, __m128);
+__m128 Sleef_logf4_u10_intrin (__m128);
+__m128 Sleef_cbrtf4_u10_intrin (__m128);
+__m128 Sleef_expf4_u10_intrin (__m128);
+__m128 Sleef_powf4_u10_intrin (__m128, __m128);
+__m128 Sleef_sinhf4_u10_intrin (__m128);
+__m128 Sleef_coshf4_u10_intrin (__m128);
+__m128 Sleef_tanhf4_u10_intrin (__m128);
+__m128 Sleef_asinhf4_u10_intrin (__m128);
+__m128 Sleef_acoshf4_u10_intrin (__m128);
+__m128 Sleef_atanhf4_u10_intrin (__m128);
+__m128 Sleef_exp2f4_u10_intrin (__m128);
+__m128 Sleef_exp10f4_u10_intrin (__m128);
+__m128 Sleef_expm1f4_u10_intrin (__m128);
+__m128 Sleef_log10f4_u10_intrin (__m128);
+__m128 Sleef_log1pf4_u10_intrin (__m128);
+Sleef___m128_2 Sleef_sincospif4_u05_intrin (__m128);
+Sleef___m128_2 Sleef_sincospif4_u35_intrin (__m128);
+__m128 Sleef_sinpif4_u05_intrin (__m128);
+__m128 Sleef_cospif4_u05_intrin (__m128);
+__m128 Sleef_ldexpf4_intrin (__m128, __m128i);
+__m128i Sleef_ilogbf4_intrin (__m128);
+__m128 Sleef_fmaf4_intrin (__m128, __m128, __m128);
+__m128 Sleef_sqrtf4_u05_intrin (__m128);
+__m128 Sleef_sqrtf4_u35_intrin (__m128);
+__m128 Sleef_hypotf4_u05_intrin (__m128, __m128);
+__m128 Sleef_hypotf4_u35_intrin (__m128, __m128);
+__m128 Sleef_fabsf4_intrin (__m128);
+__m128 Sleef_copysignf4_intrin (__m128, __m128);
+__m128 Sleef_fmaxf4_intrin (__m128, __m128);
+__m128 Sleef_fminf4_intrin (__m128, __m128);
+__m128 Sleef_fdimf4_intrin (__m128, __m128);
+__m128 Sleef_truncf4_intrin (__m128);
+__m128 Sleef_floorf4_intrin (__m128);
+__m128 Sleef_ceilf4_intrin (__m128);
+__m128 Sleef_roundf4_intrin (__m128);
+__m128 Sleef_rintf4_intrin (__m128);
+__m128 Sleef_nextafterf4_intrin (__m128, __m128);
+__m128 Sleef_frfrexpf4_intrin (__m128);
+__m128i Sleef_expfrexpf4_intrin (__m128);
+__m128 Sleef_fmodf4_intrin (__m128, __m128);
+Sleef___m128_2 Sleef_modff4_intrin (__m128);
+__m128 Sleef_lgammaf4_u10_intrin (__m128);
+Sleef___m128_2 Sleef_lgamma_rf4_u10_intrin (__m128);
+__m128 Sleef_tgammaf4_u10_intrin (__m128);
+__m128 Sleef_erff4_u10_intrin (__m128);
+__m128 Sleef_erfcf4_u15_intrin (__m128);
+
+__m128d Sleef_pownd2_u10_intrin (__m128d, __m128i);
+__m128 Sleef_pownf4_u10_intrin (__m128, __m128i);
+__m128d Sleef_powrd2_u10_intrin (__m128d, __m128d);
+__m128 Sleef_powrf4_u10_intrin (__m128, __m128);
+
+#endif
+
+#ifdef __ARM_NEON
+
+#define SLEEF_VEC_128_AVAILABLE
+
+typedef float32x4_t reg128f;
+typedef int32x4_t reg128i;
+
+#ifndef Sleef_float32x4_t_2_DEFINED
+typedef struct
+{
+  float32x4_t x, y;
+} Sleef_float32x4_t_2;
+#define Sleef_float32x4_t_2_DEFINED
+#endif
+typedef Sleef_float32x4_t_2 Sleef_reg128f_2;
+
+float32x4_t Sleef_sinf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_cosf4_u35_intrin (float32x4_t);
+Sleef_float32x4_t_2 Sleef_sincosf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_tanf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_asinf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_acosf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_atanf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_atan2f4_u35_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_logf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_cbrtf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_sinf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_cosf4_u10_intrin (float32x4_t);
+Sleef_float32x4_t_2 Sleef_sincosf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_tanf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_asinf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_acosf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_atanf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_atan2f4_u10_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_logf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_cbrtf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_expf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_powf4_u10_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_sinhf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_coshf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_tanhf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_asinhf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_acoshf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_atanhf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_exp2f4_u10_intrin (float32x4_t);
+float32x4_t Sleef_exp10f4_u10_intrin (float32x4_t);
+float32x4_t Sleef_expm1f4_u10_intrin (float32x4_t);
+float32x4_t Sleef_log10f4_u10_intrin (float32x4_t);
+float32x4_t Sleef_log1pf4_u10_intrin (float32x4_t);
+Sleef_float32x4_t_2 Sleef_sincospif4_u05_intrin (float32x4_t);
+Sleef_float32x4_t_2 Sleef_sincospif4_u35_intrin (float32x4_t);
+float32x4_t Sleef_sinpif4_u05_intrin (float32x4_t);
+float32x4_t Sleef_cospif4_u05_intrin (float32x4_t);
+float32x4_t Sleef_ldexpf4_intrin (float32x4_t, int32x4_t);
+int32x4_t Sleef_ilogbf4_intrin (float32x4_t);
+
+float32x4_t Sleef_fmaf4_intrin (float32x4_t, float32x4_t, float32x4_t);
+float32x4_t Sleef_sqrtf4_u05_intrin (float32x4_t);
+float32x4_t Sleef_sqrtf4_u35_intrin (float32x4_t);
+float32x4_t Sleef_hypotf4_u05_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_hypotf4_u35_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_fabsf4_intrin (float32x4_t);
+float32x4_t Sleef_copysignf4_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_fmaxf4_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_fminf4_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_fdimf4_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_truncf4_intrin (float32x4_t);
+float32x4_t Sleef_floorf4_intrin (float32x4_t);
+float32x4_t Sleef_ceilf4_intrin (float32x4_t);
+float32x4_t Sleef_roundf4_intrin (float32x4_t);
+float32x4_t Sleef_rintf4_intrin (float32x4_t);
+float32x4_t Sleef_nextafterf4_intrin (float32x4_t, float32x4_t);
+float32x4_t Sleef_frfrexpf4_intrin (float32x4_t);
+int32x4_t Sleef_expfrexpf4_intrin (float32x4_t);
+
+float32x4_t Sleef_fmodf4_intrin (float32x4_t, float32x4_t);
+Sleef_float32x4_t_2 Sleef_modff4_intrin (float32x4_t);
+float32x4_t Sleef_lgammaf4_u10_intrin (float32x4_t);
+Sleef_float32x4_t_2 Sleef_lgamma_rf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_tgammaf4_u10_intrin (float32x4_t);
+float32x4_t Sleef_erff4_u10_intrin (float32x4_t);
+float32x4_t Sleef_erfcf4_u15_intrin (float32x4_t);
+
+float32x4_t Sleef_pownf4_u10_intrin (float32x4_t, int32x4_t);
+float32x4_t Sleef_powrf4_u10_intrin (float32x4_t, float32x4_t);
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+typedef float64x2_t reg128d;
+
+#ifndef Sleef_float64x2_t_2_DEFINED
+typedef struct
+{
+  float64x2_t x, y;
+} Sleef_float64x2_t_2;
+#define Sleef_float64x2_t_2_DEFINED
+#endif
+typedef Sleef_float64x2_t_2 Sleef_reg128d_2;
+
+float64x2_t Sleef_sind2_u35_intrin (float64x2_t);
+float64x2_t Sleef_cosd2_u35_intrin (float64x2_t);
+Sleef_float64x2_t_2 Sleef_sincosd2_u35_intrin (float64x2_t);
+float64x2_t Sleef_tand2_u35_intrin (float64x2_t);
+float64x2_t Sleef_asind2_u35_intrin (float64x2_t);
+float64x2_t Sleef_acosd2_u35_intrin (float64x2_t);
+float64x2_t Sleef_atand2_u35_intrin (float64x2_t);
+float64x2_t Sleef_atan2d2_u35_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_logd2_u35_intrin (float64x2_t);
+float64x2_t Sleef_cbrtd2_u35_intrin (float64x2_t);
+float64x2_t Sleef_sind2_u10_intrin (float64x2_t);
+float64x2_t Sleef_cosd2_u10_intrin (float64x2_t);
+Sleef_float64x2_t_2 Sleef_sincosd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_tand2_u10_intrin (float64x2_t);
+float64x2_t Sleef_asind2_u10_intrin (float64x2_t);
+float64x2_t Sleef_acosd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_atand2_u10_intrin (float64x2_t);
+float64x2_t Sleef_atan2d2_u10_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_logd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_cbrtd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_expd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_powd2_u10_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_sinhd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_coshd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_tanhd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_asinhd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_acoshd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_atanhd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_exp2d2_u10_intrin (float64x2_t);
+float64x2_t Sleef_exp10d2_u10_intrin (float64x2_t);
+float64x2_t Sleef_expm1d2_u10_intrin (float64x2_t);
+float64x2_t Sleef_log10d2_u10_intrin (float64x2_t);
+float64x2_t Sleef_log1pd2_u10_intrin (float64x2_t);
+Sleef_float64x2_t_2 Sleef_sincospid2_u05_intrin (float64x2_t);
+Sleef_float64x2_t_2 Sleef_sincospid2_u35_intrin (float64x2_t);
+float64x2_t Sleef_sinpid2_u05_intrin (float64x2_t);
+float64x2_t Sleef_cospid2_u05_intrin (float64x2_t);
+float64x2_t Sleef_ldexpd2_intrin (float64x2_t, int32x4_t);
+int32x4_t Sleef_ilogbd2_intrin (float64x2_t);
+
+float64x2_t Sleef_fmad2_intrin (float64x2_t, float64x2_t, float64x2_t);
+float64x2_t Sleef_sqrtd2_u05_intrin (float64x2_t);
+float64x2_t Sleef_sqrtd2_u35_intrin (float64x2_t);
+float64x2_t Sleef_hypotd2_u05_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_hypotd2_u35_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_fabsd2_intrin (float64x2_t);
+float64x2_t Sleef_copysignd2_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_fmaxd2_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_fmind2_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_fdimd2_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_truncd2_intrin (float64x2_t);
+float64x2_t Sleef_floord2_intrin (float64x2_t);
+float64x2_t Sleef_ceild2_intrin (float64x2_t);
+float64x2_t Sleef_roundd2_intrin (float64x2_t);
+float64x2_t Sleef_rintd2_intrin (float64x2_t);
+float64x2_t Sleef_nextafterd2_intrin (float64x2_t, float64x2_t);
+float64x2_t Sleef_frfrexpd2_intrin (float64x2_t);
+int32x4_t Sleef_expfrexpd2_intrin (float64x2_t);
+
+float64x2_t Sleef_fmodd2_intrin (float64x2_t, float64x2_t);
+Sleef_float64x2_t_2 Sleef_modfd2_intrin (float64x2_t);
+float64x2_t Sleef_lgammad2_u10_intrin (float64x2_t);
+Sleef_float64x2_t_2 Sleef_lgamma_rd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_tgammad2_u10_intrin (float64x2_t);
+float64x2_t Sleef_erfd2_u10_intrin (float64x2_t);
+float64x2_t Sleef_erfcd2_u15_intrin (float64x2_t);
+
+float64x2_t Sleef_pownd2_u10_intrin (float64x2_t, int32x4_t);
+float64x2_t Sleef_powrd2_u10_intrin (float64x2_t, float64x2_t);
+
+#endif
+
+#endif
+
+
+#endif // __SLEEF_H__
diff --git a/lib/kernel/sleef/include/sleef_cl.h b/lib/kernel/sleef/include/sleef_cl.h
new file mode 100644
index 0000000..89d2b3a
--- /dev/null
+++ b/lib/kernel/sleef/include/sleef_cl.h
@@ -0,0 +1,691 @@
+/* OpenCL built-in library: SLEEF OpenCL prototypes
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#ifndef __SLEEF_CL_H__
+#define __SLEEF_CL_H__
+
+#ifndef __OPENCL_VERSION__
+
+typedef int int2 __attribute__ ((__ext_vector_type__ (2)));
+typedef int int3 __attribute__ ((__ext_vector_type__ (3)));
+typedef int int4 __attribute__ ((__ext_vector_type__ (4)));
+typedef int int8 __attribute__ ((__ext_vector_type__ (8)));
+typedef int int16 __attribute__ ((__ext_vector_type__ (16)));
+
+typedef unsigned int uint;
+typedef uint uint2 __attribute__ ((__ext_vector_type__ (2)));
+typedef uint uint3 __attribute__ ((__ext_vector_type__ (3)));
+typedef uint uint4 __attribute__ ((__ext_vector_type__ (4)));
+typedef uint uint8 __attribute__ ((__ext_vector_type__ (8)));
+typedef uint uint16 __attribute__ ((__ext_vector_type__ (16)));
+
+typedef long long2 __attribute__ ((__ext_vector_type__ (2)));
+typedef long long3 __attribute__ ((__ext_vector_type__ (3)));
+typedef long long4 __attribute__ ((__ext_vector_type__ (4)));
+typedef long long8 __attribute__ ((__ext_vector_type__ (8)));
+typedef long long16 __attribute__ ((__ext_vector_type__ (16)));
+
+typedef unsigned long ulong;
+
+typedef ulong ulong2 __attribute__ ((__ext_vector_type__ (2)));
+typedef ulong ulong3 __attribute__ ((__ext_vector_type__ (3)));
+typedef ulong ulong4 __attribute__ ((__ext_vector_type__ (4)));
+typedef ulong ulong8 __attribute__ ((__ext_vector_type__ (8)));
+typedef ulong ulong16 __attribute__ ((__ext_vector_type__ (16)));
+
+typedef float float2 __attribute__ ((__ext_vector_type__ (2)));
+typedef float float3 __attribute__ ((__ext_vector_type__ (3)));
+typedef float float4 __attribute__ ((__ext_vector_type__ (4)));
+typedef float float8 __attribute__ ((__ext_vector_type__ (8)));
+typedef float float16 __attribute__ ((__ext_vector_type__ (16)));
+
+typedef double double2 __attribute__ ((__ext_vector_type__ (2)));
+typedef double double3 __attribute__ ((__ext_vector_type__ (3)));
+typedef double double4 __attribute__ ((__ext_vector_type__ (4)));
+typedef double double8 __attribute__ ((__ext_vector_type__ (8)));
+typedef double double16 __attribute__ ((__ext_vector_type__ (16)));
+
+#endif
+
+#ifndef Sleef_double2_DEFINED
+#define Sleef_double2_DEFINED
+typedef struct
+{
+  double x, y;
+} Sleef_double2;
+#endif
+
+#ifndef Sleef_float2_DEFINED
+#define Sleef_float2_DEFINED
+typedef struct
+{
+  float x, y;
+} Sleef_float2;
+#endif
+
+double Sleef_sin_u35 (double);
+double Sleef_cos_u35 (double);
+Sleef_double2 Sleef_sincos_u35 (double);
+double Sleef_tan_u35 (double);
+double Sleef_asin_u35 (double);
+double Sleef_acos_u35 (double);
+double Sleef_atan_u35 (double);
+double Sleef_atan2_u35 (double, double);
+double Sleef_log_u35 (double);
+double Sleef_cbrt_u35 (double);
+double Sleef_sin_u10 (double);
+double Sleef_cos_u10 (double);
+Sleef_double2 Sleef_sincos_u10 (double);
+double Sleef_tan_u10 (double);
+double Sleef_asin_u10 (double);
+double Sleef_acos_u10 (double);
+double Sleef_atan_u10 (double);
+double Sleef_atan2_u10 (double, double);
+double Sleef_log_u10 (double);
+double Sleef_cbrt_u10 (double);
+double Sleef_exp_u10 (double);
+double Sleef_pow_u10 (double, double);
+double Sleef_sinh_u10 (double);
+double Sleef_cosh_u10 (double);
+double Sleef_tanh_u10 (double);
+double Sleef_asinh_u10 (double);
+double Sleef_acosh_u10 (double);
+double Sleef_atanh_u10 (double);
+double Sleef_exp2_u10 (double);
+double Sleef_exp10_u10 (double);
+double Sleef_expm1_u10 (double);
+double Sleef_log10_u10 (double);
+double Sleef_log1p_u10 (double);
+Sleef_double2 Sleef_sincospi_u05 (double);
+Sleef_double2 Sleef_sincospi_u35 (double);
+double Sleef_sinpi_u05 (double);
+double Sleef_cospi_u05 (double);
+double Sleef_ldexp (double, int);
+int Sleef_ilogb (double);
+double Sleef_fma (double, double, double);
+double Sleef_sqrt_u05 (double);
+double Sleef_hypot_u05 (double, double);
+double Sleef_hypot_u35 (double, double);
+double Sleef_fabs (double);
+double Sleef_copysign (double, double);
+double Sleef_fmax (double, double);
+double Sleef_fmin (double, double);
+double Sleef_fdim (double, double);
+double Sleef_trunc (double);
+double Sleef_floor (double);
+double Sleef_ceil (double);
+double Sleef_round (double);
+double Sleef_rint (double);
+double Sleef_nextafter (double, double);
+double Sleef_frfrexp (double);
+int Sleef_expfrexp (double);
+double Sleef_fmod (double, double);
+Sleef_double2 Sleef_modf (double);
+double Sleef_lgamma_u10 (double);
+Sleef_double2 Sleef_lgamma_r_u10 (double);
+double Sleef_tgamma_u10 (double);
+double Sleef_erf_u10 (double);
+double Sleef_erfc_u15 (double);
+
+float Sleef_sinf_u35 (float);
+float Sleef_cosf_u35 (float);
+Sleef_float2 Sleef_sincosf_u35 (float);
+float Sleef_tanf_u35 (float);
+float Sleef_asinf_u35 (float);
+float Sleef_acosf_u35 (float);
+float Sleef_atanf_u35 (float);
+float Sleef_atan2f_u35 (float, float);
+float Sleef_logf_u35 (float);
+float Sleef_cbrtf_u35 (float);
+float Sleef_sinf_u10 (float);
+float Sleef_cosf_u10 (float);
+Sleef_float2 Sleef_sincosf_u10 (float);
+float Sleef_tanf_u10 (float);
+float Sleef_asinf_u10 (float);
+float Sleef_acosf_u10 (float);
+float Sleef_atanf_u10 (float);
+float Sleef_atan2f_u10 (float, float);
+float Sleef_logf_u10 (float);
+float Sleef_cbrtf_u10 (float);
+float Sleef_expf_u10 (float);
+float Sleef_powf_u10 (float, float);
+float Sleef_sinhf_u10 (float);
+float Sleef_coshf_u10 (float);
+float Sleef_tanhf_u10 (float);
+float Sleef_asinhf_u10 (float);
+float Sleef_acoshf_u10 (float);
+float Sleef_atanhf_u10 (float);
+float Sleef_exp2f_u10 (float);
+float Sleef_exp10f_u10 (float);
+float Sleef_expm1f_u10 (float);
+float Sleef_log10f_u10 (float);
+float Sleef_log1pf_u10 (float);
+Sleef_float2 Sleef_sincospif_u05 (float);
+Sleef_float2 Sleef_sincospif_u35 (float);
+float Sleef_sinpif_u05 (float d);
+float Sleef_cospif_u05 (float d);
+float Sleef_ldexpf (float, int);
+int Sleef_ilogbf (float);
+float Sleef_fmaf (float, float, float);
+float Sleef_sqrtf_u05 (float);
+float Sleef_sqrtf_u35 (float);
+float Sleef_hypotf_u05 (float, float);
+float Sleef_hypotf_u35 (float, float);
+float Sleef_fabsf (float);
+float Sleef_copysignf (float, float);
+float Sleef_fmaxf (float, float);
+float Sleef_fminf (float, float);
+float Sleef_fdimf (float, float);
+float Sleef_truncf (float);
+float Sleef_floorf (float);
+float Sleef_ceilf (float);
+float Sleef_roundf (float);
+float Sleef_rintf (float);
+float Sleef_nextafterf (float, float);
+float Sleef_frfrexpf (float);
+int Sleef_expfrexpf (float);
+float Sleef_fmodf (float, float);
+Sleef_float2 Sleef_modff (float);
+float Sleef_lgammaf_u10 (float);
+Sleef_float2 Sleef_lgamma_rf_u10 (float);
+float Sleef_tgammaf_u10 (float);
+float Sleef_erff_u10 (float);
+float Sleef_erfcf_u15 (float);
+
+double Sleef_pown_u10 (double, int);
+float Sleef_pownf_u10 (float, int);
+double Sleef_powr_u10 (double, double);
+float Sleef_powrf_u10 (float, float);
+
+
+// #####################
+
+#ifdef SLEEF_VEC_512_AVAILABLE
+
+#ifndef Sleef_double8_2_DEFINED
+typedef struct
+{
+  double8 x, y;
+} Sleef_double8_2;
+#define Sleef_double8_2_DEFINED
+#endif
+
+double8 Sleef_sind8_u35 (double8);
+double8 Sleef_cosd8_u35 (double8);
+Sleef_double8_2 Sleef_sincosd8_u35 (double8);
+double8 Sleef_tand8_u35 (double8);
+double8 Sleef_asind8_u35 (double8);
+double8 Sleef_acosd8_u35 (double8);
+double8 Sleef_atand8_u35 (double8);
+double8 Sleef_atan2d8_u35 (double8, double8);
+double8 Sleef_logd8_u35 (double8);
+double8 Sleef_cbrtd8_u35 (double8);
+double8 Sleef_sind8_u10 (double8);
+double8 Sleef_cosd8_u10 (double8);
+Sleef_double8_2 Sleef_sincosd8_u10 (double8);
+double8 Sleef_tand8_u10 (double8);
+double8 Sleef_asind8_u10 (double8);
+double8 Sleef_acosd8_u10 (double8);
+double8 Sleef_atand8_u10 (double8);
+double8 Sleef_atan2d8_u10 (double8, double8);
+double8 Sleef_logd8_u10 (double8);
+double8 Sleef_cbrtd8_u10 (double8);
+double8 Sleef_expd8_u10 (double8);
+double8 Sleef_powd8_u10 (double8, double8);
+double8 Sleef_sinhd8_u10 (double8);
+double8 Sleef_coshd8_u10 (double8);
+double8 Sleef_tanhd8_u10 (double8);
+double8 Sleef_asinhd8_u10 (double8);
+double8 Sleef_acoshd8_u10 (double8);
+double8 Sleef_atanhd8_u10 (double8);
+double8 Sleef_exp2d8_u10 (double8);
+double8 Sleef_exp10d8_u10 (double8);
+double8 Sleef_expm1d8_u10 (double8);
+double8 Sleef_log10d8_u10 (double8);
+double8 Sleef_log1pd8_u10 (double8);
+Sleef_double8_2 Sleef_sincospid8_u05 (double8);
+Sleef_double8_2 Sleef_sincospid8_u35 (double8);
+double8 Sleef_sinpid8_u05 (double8);
+double8 Sleef_cospid8_u05 (double8);
+double8 Sleef_ldexpd8 (double8, int8);
+int8 Sleef_ilogbd8 (double8);
+double8 Sleef_fmad8 (double8, double8, double8);
+double8 Sleef_sqrtd8_u05 (double8);
+double8 Sleef_sqrtd8_u35 (double8);
+double8 Sleef_hypotd8_u05 (double8, double8);
+double8 Sleef_hypotd8_u35 (double8, double8);
+double8 Sleef_fabsd8 (double8);
+double8 Sleef_copysignd8 (double8, double8);
+double8 Sleef_fmaxd8 (double8, double8);
+double8 Sleef_fmind8 (double8, double8);
+double8 Sleef_fdimd8 (double8, double8);
+double8 Sleef_truncd8 (double8);
+double8 Sleef_floord8 (double8);
+double8 Sleef_ceild8 (double8);
+double8 Sleef_roundd8 (double8);
+double8 Sleef_rintd8 (double8);
+double8 Sleef_nextafterd8 (double8, double8);
+double8 Sleef_frfrexpd8 (double8);
+int8 Sleef_expfrexpd8 (double8);
+double8 Sleef_fmodd8 (double8, double8);
+Sleef_double8_2 Sleef_modfd8 (double8);
+double8 Sleef_lgammad8_u10 (double8);
+Sleef_double8_2 Sleef_lgamma_rd8_u10 (double8);
+double8 Sleef_tgammad8_u10 (double8);
+double8 Sleef_erfd8_u10 (double8);
+double8 Sleef_erfcd8_u15 (double8);
+
+#ifndef Sleef_float16_2_DEFINED
+typedef struct
+{
+  float16 x, y;
+} Sleef_float16_2;
+#define Sleef_float16_2_DEFINED
+#endif
+
+float16 Sleef_sinf16_u35 (float16);
+float16 Sleef_cosf16_u35 (float16);
+Sleef_float16_2 Sleef_sincosf16_u35 (float16);
+float16 Sleef_tanf16_u35 (float16);
+float16 Sleef_asinf16_u35 (float16);
+float16 Sleef_acosf16_u35 (float16);
+float16 Sleef_atanf16_u35 (float16);
+float16 Sleef_atan2f16_u35 (float16, float16);
+float16 Sleef_logf16_u35 (float16);
+float16 Sleef_cbrtf16_u35 (float16);
+float16 Sleef_sinf16_u10 (float16);
+float16 Sleef_cosf16_u10 (float16);
+Sleef_float16_2 Sleef_sincosf16_u10 (float16);
+float16 Sleef_tanf16_u10 (float16);
+float16 Sleef_asinf16_u10 (float16);
+float16 Sleef_acosf16_u10 (float16);
+float16 Sleef_atanf16_u10 (float16);
+float16 Sleef_atan2f16_u10 (float16, float16);
+float16 Sleef_logf16_u10 (float16);
+float16 Sleef_cbrtf16_u10 (float16);
+float16 Sleef_expf16_u10 (float16);
+float16 Sleef_powf16_u10 (float16, float16);
+float16 Sleef_sinhf16_u10 (float16);
+float16 Sleef_coshf16_u10 (float16);
+float16 Sleef_tanhf16_u10 (float16);
+float16 Sleef_asinhf16_u10 (float16);
+float16 Sleef_acoshf16_u10 (float16);
+float16 Sleef_atanhf16_u10 (float16);
+float16 Sleef_exp2f16_u10 (float16);
+float16 Sleef_exp10f16_u10 (float16);
+float16 Sleef_expm1f16_u10 (float16);
+float16 Sleef_log10f16_u10 (float16);
+float16 Sleef_log1pf16_u10 (float16);
+Sleef_float16_2 Sleef_sincospif16_u05 (float16);
+Sleef_float16_2 Sleef_sincospif16_u35 (float16);
+float16 Sleef_sinpif16_u05 (float16);
+float16 Sleef_cospif16_u05 (float16);
+float16 Sleef_ldexpf16 (float16, int16);
+int16 Sleef_ilogbf16 (float16);
+float16 Sleef_fmaf16 (float16, float16, float16);
+float16 Sleef_sqrtf16_u05 (float16);
+float16 Sleef_sqrtf16_u35 (float16);
+float16 Sleef_hypotf16_u05 (float16, float16);
+float16 Sleef_hypotf16_u35 (float16, float16);
+float16 Sleef_fabsf16 (float16);
+float16 Sleef_copysignf16 (float16, float16);
+float16 Sleef_fmaxf16 (float16, float16);
+float16 Sleef_fminf16 (float16, float16);
+float16 Sleef_fdimf16 (float16, float16);
+float16 Sleef_truncf16 (float16);
+float16 Sleef_floorf16 (float16);
+float16 Sleef_ceilf16 (float16);
+float16 Sleef_roundf16 (float16);
+float16 Sleef_rintf16 (float16);
+float16 Sleef_nextafterf16 (float16, float16);
+float16 Sleef_frfrexpf16 (float16);
+int16 Sleef_expfrexpf16 (float16);
+float16 Sleef_fmodf16 (float16, float16);
+Sleef_float16_2 Sleef_modff16 (float16);
+float16 Sleef_lgammaf16_u10 (float16);
+Sleef_float16_2 Sleef_lgamma_rf16_u10 (float16);
+float16 Sleef_tgammaf16_u10 (float16);
+float16 Sleef_erff16_u10 (float16);
+float16 Sleef_erfcf16_u15 (float16);
+
+
+double8 Sleef_pownd8_u10 (double8, int8);
+float16 Sleef_pownf16_u10 (float16, int16);
+double8 Sleef_powrd8_u10 (double8, double8);
+float16 Sleef_powrf16_u10 (float16, float16);
+
+#endif
+
+// #####################
+
+#ifdef SLEEF_VEC_256_AVAILABLE
+
+#ifndef Sleef_double4_2_DEFINED
+typedef struct
+{
+  double4 x, y;
+} Sleef_double4_2;
+#define Sleef_double4_2_DEFINED
+#endif
+
+double4 Sleef_sind4_u35 (double4);
+double4 Sleef_cosd4_u35 (double4);
+Sleef_double4_2 Sleef_sincosd4_u35 (double4);
+double4 Sleef_tand4_u35 (double4);
+double4 Sleef_asind4_u35 (double4);
+double4 Sleef_acosd4_u35 (double4);
+double4 Sleef_atand4_u35 (double4);
+double4 Sleef_atan2d4_u35 (double4, double4);
+double4 Sleef_logd4_u35 (double4);
+double4 Sleef_cbrtd4_u35 (double4);
+double4 Sleef_sind4_u10 (double4);
+double4 Sleef_cosd4_u10 (double4);
+Sleef_double4_2 Sleef_sincosd4_u10 (double4);
+double4 Sleef_tand4_u10 (double4);
+double4 Sleef_asind4_u10 (double4);
+double4 Sleef_acosd4_u10 (double4);
+double4 Sleef_atand4_u10 (double4);
+double4 Sleef_atan2d4_u10 (double4, double4);
+double4 Sleef_logd4_u10 (double4);
+double4 Sleef_cbrtd4_u10 (double4);
+double4 Sleef_expd4_u10 (double4);
+double4 Sleef_powd4_u10 (double4, double4);
+double4 Sleef_sinhd4_u10 (double4);
+double4 Sleef_coshd4_u10 (double4);
+double4 Sleef_tanhd4_u10 (double4);
+double4 Sleef_asinhd4_u10 (double4);
+double4 Sleef_acoshd4_u10 (double4);
+double4 Sleef_atanhd4_u10 (double4);
+double4 Sleef_exp2d4_u10 (double4);
+double4 Sleef_exp10d4_u10 (double4);
+double4 Sleef_expm1d4_u10 (double4);
+double4 Sleef_log10d4_u10 (double4);
+double4 Sleef_log1pd4_u10 (double4);
+Sleef_double4_2 Sleef_sincospid4_u05 (double4);
+Sleef_double4_2 Sleef_sincospid4_u35 (double4);
+double4 Sleef_sinpid4_u05 (double4);
+double4 Sleef_cospid4_u05 (double4);
+double4 Sleef_ldexpd4 (double4, int4);
+int4 Sleef_ilogbd4 (double4);
+double4 Sleef_fmad4 (double4, double4, double4);
+double4 Sleef_sqrtd4_u05 (double4);
+double4 Sleef_sqrtd4_u35 (double4);
+double4 Sleef_hypotd4_u05 (double4, double4);
+double4 Sleef_hypotd4_u35 (double4, double4);
+double4 Sleef_fabsd4 (double4);
+double4 Sleef_copysignd4 (double4, double4);
+double4 Sleef_fmaxd4 (double4, double4);
+double4 Sleef_fmind4 (double4, double4);
+double4 Sleef_fdimd4 (double4, double4);
+double4 Sleef_truncd4 (double4);
+double4 Sleef_floord4 (double4);
+double4 Sleef_ceild4 (double4);
+double4 Sleef_roundd4 (double4);
+double4 Sleef_rintd4 (double4);
+double4 Sleef_nextafterd4 (double4, double4);
+double4 Sleef_frfrexpd4 (double4);
+int4 Sleef_expfrexpd4 (double4);
+double4 Sleef_fmodd4 (double4, double4);
+Sleef_double4_2 Sleef_modfd4 (double4);
+double4 Sleef_lgammad4_u10 (double4);
+Sleef_double4_2 Sleef_lgamma_rd4_u10 (double4);
+double4 Sleef_tgammad4_u10 (double4);
+double4 Sleef_erfd4_u10 (double4);
+double4 Sleef_erfcd4_u15 (double4);
+
+#ifndef Sleef_float8_2_DEFINED
+typedef struct
+{
+  float8 x, y;
+} Sleef_float8_2;
+#define Sleef_float8_2_DEFINED
+#endif
+
+float8 Sleef_sinf8_u35 (float8);
+float8 Sleef_cosf8_u35 (float8);
+Sleef_float8_2 Sleef_sincosf8_u35 (float8);
+float8 Sleef_tanf8_u35 (float8);
+float8 Sleef_asinf8_u35 (float8);
+float8 Sleef_acosf8_u35 (float8);
+float8 Sleef_atanf8_u35 (float8);
+float8 Sleef_atan2f8_u35 (float8, float8);
+float8 Sleef_logf8_u35 (float8);
+float8 Sleef_cbrtf8_u35 (float8);
+float8 Sleef_sinf8_u10 (float8);
+float8 Sleef_cosf8_u10 (float8);
+Sleef_float8_2 Sleef_sincosf8_u10 (float8);
+float8 Sleef_tanf8_u10 (float8);
+float8 Sleef_asinf8_u10 (float8);
+float8 Sleef_acosf8_u10 (float8);
+float8 Sleef_atanf8_u10 (float8);
+float8 Sleef_atan2f8_u10 (float8, float8);
+float8 Sleef_logf8_u10 (float8);
+float8 Sleef_cbrtf8_u10 (float8);
+float8 Sleef_expf8_u10 (float8);
+float8 Sleef_powf8_u10 (float8, float8);
+float8 Sleef_sinhf8_u10 (float8);
+float8 Sleef_coshf8_u10 (float8);
+float8 Sleef_tanhf8_u10 (float8);
+float8 Sleef_asinhf8_u10 (float8);
+float8 Sleef_acoshf8_u10 (float8);
+float8 Sleef_atanhf8_u10 (float8);
+float8 Sleef_exp2f8_u10 (float8);
+float8 Sleef_exp10f8_u10 (float8);
+float8 Sleef_expm1f8_u10 (float8);
+float8 Sleef_log10f8_u10 (float8);
+float8 Sleef_log1pf8_u10 (float8);
+Sleef_float8_2 Sleef_sincospif8_u05 (float8);
+Sleef_float8_2 Sleef_sincospif8_u35 (float8);
+float8 Sleef_sinpif8_u05 (float8);
+float8 Sleef_cospif8_u05 (float8);
+float8 Sleef_ldexpf8 (float8, int8);
+int8 Sleef_ilogbf8 (float8);
+float8 Sleef_fmaf8 (float8, float8, float8);
+float8 Sleef_sqrtf8_u05 (float8);
+float8 Sleef_sqrtf8_u35 (float8);
+float8 Sleef_hypotf8_u05 (float8, float8);
+float8 Sleef_hypotf8_u35 (float8, float8);
+float8 Sleef_fabsf8 (float8);
+float8 Sleef_copysignf8 (float8, float8);
+float8 Sleef_fmaxf8 (float8, float8);
+float8 Sleef_fminf8 (float8, float8);
+float8 Sleef_fdimf8 (float8, float8);
+float8 Sleef_truncf8 (float8);
+float8 Sleef_floorf8 (float8);
+float8 Sleef_ceilf8 (float8);
+float8 Sleef_roundf8 (float8);
+float8 Sleef_rintf8 (float8);
+float8 Sleef_nextafterf8 (float8, float8);
+float8 Sleef_frfrexpf8 (float8);
+int8 Sleef_expfrexpf8 (float8);
+float8 Sleef_fmodf8 (float8, float8);
+Sleef_float8_2 Sleef_modff8 (float8);
+float8 Sleef_lgammaf8_u10 (float8);
+Sleef_float8_2 Sleef_lgamma_rf8_u10 (float8);
+float8 Sleef_tgammaf8_u10 (float8);
+float8 Sleef_erff8_u10 (float8);
+float8 Sleef_erfcf8_u15 (float8);
+
+double4 Sleef_pownd4_u10 (double4, int4);
+float8 Sleef_pownf8_u10 (float8, int8);
+double4 Sleef_powrd4_u10 (double4, double4);
+float8 Sleef_powrf8_u10 (float8, float8);
+
+#endif
+
+#ifdef SLEEF_VEC_128_AVAILABLE
+
+#ifndef Sleef_double2_2_DEFINED
+typedef struct
+{
+  double2 x, y;
+} Sleef_double2_2;
+#define Sleef_double2_2_DEFINED
+#endif
+
+double2 Sleef_sind2_u35 (double2);
+double2 Sleef_cosd2_u35 (double2);
+Sleef_double2_2 Sleef_sincosd2_u35 (double2);
+double2 Sleef_tand2_u35 (double2);
+double2 Sleef_asind2_u35 (double2);
+double2 Sleef_acosd2_u35 (double2);
+double2 Sleef_atand2_u35 (double2);
+double2 Sleef_atan2d2_u35 (double2, double2);
+double2 Sleef_logd2_u35 (double2);
+double2 Sleef_cbrtd2_u35 (double2);
+double2 Sleef_sind2_u10 (double2);
+double2 Sleef_cosd2_u10 (double2);
+Sleef_double2_2 Sleef_sincosd2_u10 (double2);
+double2 Sleef_tand2_u10 (double2);
+double2 Sleef_asind2_u10 (double2);
+double2 Sleef_acosd2_u10 (double2);
+double2 Sleef_atand2_u10 (double2);
+double2 Sleef_atan2d2_u10 (double2, double2);
+double2 Sleef_logd2_u10 (double2);
+double2 Sleef_cbrtd2_u10 (double2);
+double2 Sleef_expd2_u10 (double2);
+double2 Sleef_powd2_u10 (double2, double2);
+double2 Sleef_sinhd2_u10 (double2);
+double2 Sleef_coshd2_u10 (double2);
+double2 Sleef_tanhd2_u10 (double2);
+double2 Sleef_asinhd2_u10 (double2);
+double2 Sleef_acoshd2_u10 (double2);
+double2 Sleef_atanhd2_u10 (double2);
+double2 Sleef_exp2d2_u10 (double2);
+double2 Sleef_exp10d2_u10 (double2);
+double2 Sleef_expm1d2_u10 (double2);
+double2 Sleef_log10d2_u10 (double2);
+double2 Sleef_log1pd2_u10 (double2);
+Sleef_double2_2 Sleef_sincospid2_u05 (double2);
+Sleef_double2_2 Sleef_sincospid2_u35 (double2);
+double2 Sleef_sinpid2_u05 (double2);
+double2 Sleef_cospid2_u05 (double2);
+double2 Sleef_ldexpd2 (double2, int2);
+int2 Sleef_ilogbd2 (double2);
+double2 Sleef_fmad2 (double2, double2, double2);
+double2 Sleef_sqrtd2_u05 (double2);
+double2 Sleef_sqrtd2_u35 (double2);
+double2 Sleef_hypotd2_u05 (double2, double2);
+double2 Sleef_hypotd2_u35 (double2, double2);
+double2 Sleef_fabsd2 (double2);
+double2 Sleef_copysignd2 (double2, double2);
+double2 Sleef_fmaxd2 (double2, double2);
+double2 Sleef_fmind2 (double2, double2);
+double2 Sleef_fdimd2 (double2, double2);
+double2 Sleef_truncd2 (double2);
+double2 Sleef_floord2 (double2);
+double2 Sleef_ceild2 (double2);
+double2 Sleef_roundd2 (double2);
+double2 Sleef_rintd2 (double2);
+double2 Sleef_nextafterd2 (double2, double2);
+double2 Sleef_frfrexpd2 (double2);
+int2 Sleef_expfrexpd2 (double2);
+double2 Sleef_fmodd2 (double2, double2);
+Sleef_double2_2 Sleef_modfd2 (double2);
+double2 Sleef_lgammad2_u10 (double2);
+Sleef_double2_2 Sleef_lgamma_rd2_u10 (double2);
+double2 Sleef_tgammad2_u10 (double2);
+double2 Sleef_erfd2_u10 (double2);
+double2 Sleef_erfcd2_u15 (double2);
+
+#ifndef Sleef_float4_2_DEFINED
+typedef struct
+{
+  float4 x, y;
+} Sleef_float4_2;
+#define Sleef_float4_2_DEFINED
+#endif
+
+float4 Sleef_sinf4_u35 (float4);
+float4 Sleef_cosf4_u35 (float4);
+Sleef_float4_2 Sleef_sincosf4_u35 (float4);
+float4 Sleef_tanf4_u35 (float4);
+float4 Sleef_asinf4_u35 (float4);
+float4 Sleef_acosf4_u35 (float4);
+float4 Sleef_atanf4_u35 (float4);
+float4 Sleef_atan2f4_u35 (float4, float4);
+float4 Sleef_logf4_u35 (float4);
+float4 Sleef_cbrtf4_u35 (float4);
+float4 Sleef_sinf4_u10 (float4);
+float4 Sleef_cosf4_u10 (float4);
+Sleef_float4_2 Sleef_sincosf4_u10 (float4);
+float4 Sleef_tanf4_u10 (float4);
+float4 Sleef_asinf4_u10 (float4);
+float4 Sleef_acosf4_u10 (float4);
+float4 Sleef_atanf4_u10 (float4);
+float4 Sleef_atan2f4_u10 (float4, float4);
+float4 Sleef_logf4_u10 (float4);
+float4 Sleef_cbrtf4_u10 (float4);
+float4 Sleef_expf4_u10 (float4);
+float4 Sleef_powf4_u10 (float4, float4);
+float4 Sleef_sinhf4_u10 (float4);
+float4 Sleef_coshf4_u10 (float4);
+float4 Sleef_tanhf4_u10 (float4);
+float4 Sleef_asinhf4_u10 (float4);
+float4 Sleef_acoshf4_u10 (float4);
+float4 Sleef_atanhf4_u10 (float4);
+float4 Sleef_exp2f4_u10 (float4);
+float4 Sleef_exp10f4_u10 (float4);
+float4 Sleef_expm1f4_u10 (float4);
+float4 Sleef_log10f4_u10 (float4);
+float4 Sleef_log1pf4_u10 (float4);
+Sleef_float4_2 Sleef_sincospif4_u05 (float4);
+Sleef_float4_2 Sleef_sincospif4_u35 (float4);
+float4 Sleef_sinpif4_u05 (float4);
+float4 Sleef_cospif4_u05 (float4);
+float4 Sleef_ldexpf4 (float4, int4);
+int4 Sleef_ilogbf4 (float4);
+float4 Sleef_fmaf4 (float4, float4, float4);
+float4 Sleef_sqrtf4_u05 (float4);
+float4 Sleef_sqrtf4_u35 (float4);
+float4 Sleef_hypotf4_u05 (float4, float4);
+float4 Sleef_hypotf4_u35 (float4, float4);
+float4 Sleef_fabsf4 (float4);
+float4 Sleef_copysignf4 (float4, float4);
+float4 Sleef_fmaxf4 (float4, float4);
+float4 Sleef_fminf4 (float4, float4);
+float4 Sleef_fdimf4 (float4, float4);
+float4 Sleef_truncf4 (float4);
+float4 Sleef_floorf4 (float4);
+float4 Sleef_ceilf4 (float4);
+float4 Sleef_roundf4 (float4);
+float4 Sleef_rintf4 (float4);
+float4 Sleef_nextafterf4 (float4, float4);
+float4 Sleef_frfrexpf4 (float4);
+int4 Sleef_expfrexpf4 (float4);
+float4 Sleef_fmodf4 (float4, float4);
+Sleef_float4_2 Sleef_modff4 (float4);
+float4 Sleef_lgammaf4_u10 (float4);
+Sleef_float4_2 Sleef_lgamma_rf4_u10 (float4);
+float4 Sleef_tgammaf4_u10 (float4);
+float4 Sleef_erff4_u10 (float4);
+float4 Sleef_erfcf4_u15 (float4);
+
+double2 Sleef_pownd2_u10 (double2, int2);
+float4 Sleef_pownf4_u10 (float4, int4);
+double2 Sleef_powrd2_u10 (double2, double2);
+float4 Sleef_powrf4_u10 (float4, float4);
+
+#endif
+
+#endif
diff --git a/lib/kernel/sleef/libm/dd.h b/lib/kernel/sleef/libm/dd.h
new file mode 100644
index 0000000..068f97b
--- /dev/null
+++ b/lib/kernel/sleef/libm/dd.h
@@ -0,0 +1,395 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+typedef struct {
+  vdouble x, y;
+} vdouble2;
+
+static INLINE CONST vdouble vupper_vd_vd(vdouble d) {
+  return vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vcast_vm_i_i(0xffffffff, 0xf8000000)));
+}
+
+static INLINE CONST vdouble2 vcast_vd2_vd_vd(vdouble h, vdouble l) {
+  vdouble2 ret = {h, l};
+  return ret;
+}
+
+static INLINE CONST vdouble2 vcast_vd2_d_d(double h, double l) {
+  vdouble2 ret = {vcast_vd_d(h), vcast_vd_d(l)};
+  return ret;
+}
+
+static INLINE CONST vdouble2 vsel_vd2_vo_vd2_vd2(vopmask m, vdouble2 x, vdouble2 y) {
+  vdouble2 r;
+  r.x = vsel_vd_vo_vd_vd(m, x.x, y.x);
+  r.y = vsel_vd_vo_vd_vd(m, x.y, y.y);
+  return r;
+}
+
+static INLINE CONST vdouble2 vsel_vd2_vo_d_d_d_d(vopmask o, double x1, double y1, double x0, double y0) {
+  vdouble2 r;
+  r.x = vsel_vd_vo_d_d(o, x1, x0);
+  r.y = vsel_vd_vo_d_d(o, y1, y0);
+  return r;
+}
+
+static INLINE CONST vdouble vadd_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
+  return vadd_vd_vd_vd(vadd_vd_vd_vd(v0, v1), v2);
+}
+
+static INLINE CONST vdouble vadd_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
+  return vadd_vd_3vd(vadd_vd_vd_vd(v0, v1), v2, v3);
+}
+
+static INLINE CONST vdouble vadd_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
+  return vadd_vd_4vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4);
+}
+
+static INLINE CONST vdouble vadd_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
+  return vadd_vd_5vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5);
+}
+
+static INLINE CONST vdouble vadd_vd_7vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5, vdouble v6) {
+  return vadd_vd_6vd(vadd_vd_vd_vd(v0, v1), v2, v3, v4, v5, v6);
+}
+
+static INLINE CONST vdouble vsub_vd_3vd(vdouble v0, vdouble v1, vdouble v2) {
+  return vsub_vd_vd_vd(vsub_vd_vd_vd(v0, v1), v2);
+}
+
+static INLINE CONST vdouble vsub_vd_4vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3) {
+  return vsub_vd_3vd(vsub_vd_vd_vd(v0, v1), v2, v3);
+}
+
+static INLINE CONST vdouble vsub_vd_5vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4) {
+  return vsub_vd_4vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4);
+}
+
+static INLINE CONST vdouble vsub_vd_6vd(vdouble v0, vdouble v1, vdouble v2, vdouble v3, vdouble v4, vdouble v5) {
+  return vsub_vd_5vd(vsub_vd_vd_vd(v0, v1), v2, v3, v4, v5);
+}
+
+//
+
+static INLINE CONST vdouble2 ddneg_vd2_vd2(vdouble2 x) {
+  return vcast_vd2_vd_vd(vneg_vd_vd(x.x), vneg_vd_vd(x.y));
+}
+
+static INLINE CONST vdouble2 ddabs_vd2_vd2(vdouble2 x) {
+  return vcast_vd2_vd_vd(vabs_vd_vd(x.x),
+			 vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x.y), vand_vm_vm_vm(vreinterpret_vm_vd(x.x), vreinterpret_vm_vd(vcast_vd_d(-0.0))))));
+}
+
+static INLINE CONST vdouble2 ddnormalize_vd2_vd2(vdouble2 t) {
+  vdouble2 s;
+
+  s.x = vadd_vd_vd_vd(t.x, t.y);
+  s.y = vadd_vd_vd_vd(vsub_vd_vd_vd(t.x, s.x), t.y);
+
+  return s;
+}
+
+static INLINE CONST vdouble2 ddscale_vd2_vd2_vd(vdouble2 d, vdouble s) {
+  vdouble2 r = {vmul_vd_vd_vd(d.x, s), vmul_vd_vd_vd(d.y, s)};
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd_vd2_vd_vd(vdouble x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vadd_vd_vd_vd(x, y);
+  r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x, r.x), y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd2_vd2_vd_vd(vdouble x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vadd_vd_vd_vd(x, y);
+  vdouble v = vsub_vd_vd_vd(r.x, x);
+  r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y, v));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd_vd2_vd2_vd(vdouble2 x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vadd_vd_vd_vd(x.x, y);
+  r.y = vadd_vd_3vd(vsub_vd_vd_vd(x.x, r.x), y, x.y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddsub_vd2_vd2_vd(vdouble2 x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vsub_vd_vd_vd(x.x, y);
+  r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(vsub_vd_vd_vd(x.x, r.x), y), x.y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd2_vd2_vd2_vd(vdouble2 x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vadd_vd_vd_vd(x.x, y);
+  vdouble v = vsub_vd_vd_vd(r.x, x.x);
+  r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x.x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y, v));
+  r.y = vadd_vd_vd_vd(r.y, x.y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd_vd2_vd_vd2(vdouble x, vdouble2 y) {
+  vdouble2 r;
+
+  r.x = vadd_vd_vd_vd(x, y.x);
+  r.y = vadd_vd_3vd(vsub_vd_vd_vd(x, r.x), y.x, y.y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd2_vd2_vd_vd2(vdouble x, vdouble2 y) {
+  vdouble2 r;
+
+  r.x  = vadd_vd_vd_vd(x, y.x);
+  vdouble v = vsub_vd_vd_vd(r.x, x);
+  r.y = vadd_vd_vd_vd(vadd_vd_vd_vd(vsub_vd_vd_vd(x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y.x, v)), y.y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
+  // |x| >= |y|
+
+  vdouble2 r;
+
+  r.x = vadd_vd_vd_vd(x.x, y.x);
+  r.y = vadd_vd_4vd(vsub_vd_vd_vd(x.x, r.x), y.x, x.y, y.y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddadd2_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
+  vdouble2 r;
+
+  r.x  = vadd_vd_vd_vd(x.x, y.x);
+  vdouble v = vsub_vd_vd_vd(r.x, x.x);
+  r.y = vadd_vd_vd_vd(vsub_vd_vd_vd(x.x, vsub_vd_vd_vd(r.x, v)), vsub_vd_vd_vd(y.x, v));
+  r.y = vadd_vd_vd_vd(r.y, vadd_vd_vd_vd(x.y, y.y));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddsub_vd2_vd_vd(vdouble x, vdouble y) {
+  // |x| >= |y|
+
+  vdouble2 r;
+
+  r.x = vsub_vd_vd_vd(x, y);
+  r.y = vsub_vd_vd_vd(vsub_vd_vd_vd(x, r.x), y);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddsub_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
+  // |x| >= |y|
+
+  vdouble2 r;
+
+  r.x = vsub_vd_vd_vd(x.x, y.x);
+  r.y = vsub_vd_vd_vd(x.x, r.x);
+  r.y = vsub_vd_vd_vd(r.y, y.x);
+  r.y = vadd_vd_vd_vd(r.y, x.y);
+  r.y = vsub_vd_vd_vd(r.y, y.y);
+
+  return r;
+}
+
+#ifdef ENABLE_FMA_DP
+static INLINE CONST vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
+  vdouble2 q;
+  vdouble t = vrec_vd_vd(d.x), u;
+
+  q.x = vmul_vd_vd_vd(n.x, t);
+  u = vfmapn_vd_vd_vd_vd(t, n.x, q.x);
+  q.y = vfmanp_vd_vd_vd_vd(d.y, t, vfmanp_vd_vd_vd_vd(d.x, t, vcast_vd_d(1)));
+  q.y = vfma_vd_vd_vd_vd(q.x, q.y, vfma_vd_vd_vd_vd(n.y, t, u));
+
+  return q;
+}
+
+static INLINE CONST vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x, y);
+  r.y = vfmapn_vd_vd_vd_vd(x, y, r.x);
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x.x, x.x);
+  r.y = vfma_vd_vd_vd_vd(vadd_vd_vd_vd(x.x, x.x), x.y, vfmapn_vd_vd_vd_vd(x.x, x.x, r.x));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x.x, y.x);
+  r.y = vfma_vd_vd_vd_vd(x.x, y.y, vfma_vd_vd_vd_vd(x.y, y.x, vfmapn_vd_vd_vd_vd(x.x, y.x, r.x)));
+
+  return r;
+}
+
+static INLINE CONST vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
+  return vfma_vd_vd_vd_vd(x.x, y.x, vfma_vd_vd_vd_vd(x.y, y.x, vmul_vd_vd_vd(x.x, y.y)));
+}
+
+static INLINE CONST vdouble ddsqu_vd_vd2(vdouble2 x) {
+  return vfma_vd_vd_vd_vd(x.x, x.x, vadd_vd_vd_vd(vmul_vd_vd_vd(x.x, x.y), vmul_vd_vd_vd(x.x, x.y)));
+}
+
+static INLINE CONST vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x.x, y);
+  r.y = vfma_vd_vd_vd_vd(x.y, y, vfmapn_vd_vd_vd_vd(x.x, y, r.x));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddrec_vd2_vd(vdouble d) {
+  vdouble2 q;
+
+  q.x = vrec_vd_vd(d);
+  q.y = vmul_vd_vd_vd(q.x, vfmanp_vd_vd_vd_vd(d, q.x, vcast_vd_d(1)));
+
+  return q;
+}
+
+static INLINE CONST vdouble2 ddrec_vd2_vd2(vdouble2 d) {
+  vdouble2 q;
+
+  q.x = vrec_vd_vd(d.x);
+  q.y = vmul_vd_vd_vd(q.x, vfmanp_vd_vd_vd_vd(d.y, q.x, vfmanp_vd_vd_vd_vd(d.x, q.x, vcast_vd_d(1))));
+
+  return q;
+}
+#else
+static INLINE CONST vdouble2 dddiv_vd2_vd2_vd2(vdouble2 n, vdouble2 d) {
+  vdouble t = vrec_vd_vd(d.x);
+  vdouble dh  = vupper_vd_vd(d.x), dl  = vsub_vd_vd_vd(d.x,  dh);
+  vdouble th  = vupper_vd_vd(t  ), tl  = vsub_vd_vd_vd(t  ,  th);
+  vdouble nhh = vupper_vd_vd(n.x), nhl = vsub_vd_vd_vd(n.x, nhh);
+
+  vdouble2 q;
+
+  q.x = vmul_vd_vd_vd(n.x, t);
+
+  vdouble u = vadd_vd_5vd(vsub_vd_vd_vd(vmul_vd_vd_vd(nhh, th), q.x), vmul_vd_vd_vd(nhh, tl), vmul_vd_vd_vd(nhl, th), vmul_vd_vd_vd(nhl, tl),
+		    vmul_vd_vd_vd(q.x, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl))));
+
+  q.y = vmla_vd_vd_vd_vd(t, vsub_vd_vd_vd(n.y, vmul_vd_vd_vd(q.x, d.y)), u);
+
+  return q;
+}
+
+static INLINE CONST vdouble2 ddmul_vd2_vd_vd(vdouble x, vdouble y) {
+  vdouble xh = vupper_vd_vd(x), xl = vsub_vd_vd_vd(x, xh);
+  vdouble yh = vupper_vd_vd(y), yl = vsub_vd_vd_vd(y, yh);
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x, y);
+  r.y = vadd_vd_5vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddmul_vd2_vd2_vd(vdouble2 x, vdouble y) {
+  vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh);
+  vdouble yh = vupper_vd_vd(y  ), yl = vsub_vd_vd_vd(y  , yh);
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x.x, y);
+  r.y = vadd_vd_6vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(x.y, y));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 ddmul_vd2_vd2_vd2(vdouble2 x, vdouble2 y) {
+  vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh);
+  vdouble yh = vupper_vd_vd(y.x), yl = vsub_vd_vd_vd(y.x, yh);
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x.x, y.x);
+  r.y = vadd_vd_7vd(vmul_vd_vd_vd(xh, yh), vneg_vd_vd(r.x), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(x.x, y.y), vmul_vd_vd_vd(x.y, y.x));
+
+  return r;
+}
+
+static INLINE CONST vdouble ddmul_vd_vd2_vd2(vdouble2 x, vdouble2 y) {
+  vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh);
+  vdouble yh = vupper_vd_vd(y.x), yl = vsub_vd_vd_vd(y.x, yh);
+
+  return vadd_vd_6vd(vmul_vd_vd_vd(x.y, yh), vmul_vd_vd_vd(xh, y.y), vmul_vd_vd_vd(xl, yl), vmul_vd_vd_vd(xh, yl), vmul_vd_vd_vd(xl, yh), vmul_vd_vd_vd(xh, yh));
+}
+
+static INLINE CONST vdouble2 ddsqu_vd2_vd2(vdouble2 x) {
+  vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh);
+  vdouble2 r;
+
+  r.x = vmul_vd_vd_vd(x.x, x.x);
+  r.y = vadd_vd_5vd(vmul_vd_vd_vd(xh, xh), vneg_vd_vd(r.x), vmul_vd_vd_vd(vadd_vd_vd_vd(xh, xh), xl), vmul_vd_vd_vd(xl, xl), vmul_vd_vd_vd(x.x, vadd_vd_vd_vd(x.y, x.y)));
+
+  return r;
+}
+
+static INLINE CONST vdouble ddsqu_vd_vd2(vdouble2 x) {
+  vdouble xh = vupper_vd_vd(x.x), xl = vsub_vd_vd_vd(x.x, xh);
+
+  return vadd_vd_5vd(vmul_vd_vd_vd(xh, x.y), vmul_vd_vd_vd(xh, x.y), vmul_vd_vd_vd(xl, xl), vadd_vd_vd_vd(vmul_vd_vd_vd(xh, xl), vmul_vd_vd_vd(xh, xl)), vmul_vd_vd_vd(xh, xh));
+}
+
+static INLINE CONST vdouble2 ddrec_vd2_vd(vdouble d) {
+  vdouble t = vrec_vd_vd(d);
+  vdouble dh = vupper_vd_vd(d), dl = vsub_vd_vd_vd(d, dh);
+  vdouble th = vupper_vd_vd(t), tl = vsub_vd_vd_vd(t, th);
+  vdouble2 q;
+
+  q.x = t;
+  q.y = vmul_vd_vd_vd(t, vsub_vd_5vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl)));
+
+  return q;
+}
+
+static INLINE CONST vdouble2 ddrec_vd2_vd2(vdouble2 d) {
+  vdouble t = vrec_vd_vd(d.x);
+  vdouble dh = vupper_vd_vd(d.x), dl = vsub_vd_vd_vd(d.x, dh);
+  vdouble th = vupper_vd_vd(t  ), tl = vsub_vd_vd_vd(t  , th);
+  vdouble2 q;
+
+  q.x = t;
+  q.y = vmul_vd_vd_vd(t, vsub_vd_6vd(vcast_vd_d(1), vmul_vd_vd_vd(dh, th), vmul_vd_vd_vd(dh, tl), vmul_vd_vd_vd(dl, th), vmul_vd_vd_vd(dl, tl), vmul_vd_vd_vd(d.y, t)));
+
+  return q;
+}
+#endif
+
+static INLINE CONST vdouble2 ddsqrt_vd2_vd2(vdouble2 d) {
+  vdouble t = vsqrt_vd_vd(vadd_vd_vd_vd(d.x, d.y));
+  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
+}
+
+static INLINE CONST vdouble2 ddsqrt_vd2_vd(vdouble d) {
+  vdouble t = vsqrt_vd_vd(d);
+  return ddscale_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(t, t)), ddrec_vd2_vd(t)), vcast_vd_d(0.5));
+}
diff --git a/lib/kernel/sleef/libm/df.h b/lib/kernel/sleef/libm/df.h
new file mode 100644
index 0000000..14705e6
--- /dev/null
+++ b/lib/kernel/sleef/libm/df.h
@@ -0,0 +1,466 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+typedef struct {
+  vfloat x, y;
+} vfloat2;
+
+static INLINE CONST vfloat vupper_vf_vf(vfloat d) {
+  return vreinterpret_vf_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0xfffff000)));
+}
+
+static INLINE CONST vfloat2 vcast_vf2_vf_vf(vfloat h, vfloat l) {
+  vfloat2 ret = {h, l};
+  return ret;
+}
+
+static INLINE CONST vfloat2 vcast_vf2_f_f(float h, float l) {
+  vfloat2 ret = {vcast_vf_f(h), vcast_vf_f(l)};
+  return ret;
+}
+
+static INLINE CONST vfloat2 vcast_vf2_d(double d) {
+  vfloat2 ret = {vcast_vf_f(d), vcast_vf_f(d - (float)d)};
+  return ret;
+}
+
+static INLINE CONST vfloat2 vsel_vf2_vo_vf2_vf2(vopmask m, vfloat2 x, vfloat2 y) {
+  vfloat2 r;
+  r.x = vsel_vf_vo_vf_vf(m, x.x, y.x);
+  r.y = vsel_vf_vo_vf_vf(m, x.y, y.y);
+  return r;
+}
+
+static INLINE CONST vfloat2 vsel_vf2_vo_f_f_f_f(vopmask o, float x1, float y1, float x0, float y0) {
+  vfloat2 r;
+  r.x = vsel_vf_vo_f_f(o, x1, x0);
+  r.y = vsel_vf_vo_f_f(o, y1, y0);
+  return r;
+}
+
+static INLINE CONST vfloat2 vsel_vf2_vo_vo_d_d_d(vopmask o0, vopmask o1, double d0, double d1, double d2) {
+  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vcast_vf2_d(d2)));
+}
+
+static INLINE CONST vfloat2 vsel_vf2_vo_vo_vo_d_d_d_d(vopmask o0, vopmask o1, vopmask o2, double d0, double d1, double d2, double d3) {
+  return vsel_vf2_vo_vf2_vf2(o0, vcast_vf2_d(d0), vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_d(d1), vsel_vf2_vo_vf2_vf2(o2, vcast_vf2_d(d2), vcast_vf2_d(d3))));
+}
+
+static INLINE CONST vfloat2 vabs_vf2_vf2(vfloat2 x) {
+  return vcast_vf2_vf_vf(vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(x.x)), vreinterpret_vm_vf(x.x))),
+			 vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0)), vreinterpret_vm_vf(x.x)), vreinterpret_vm_vf(x.y))));
+}
+
+static INLINE CONST vfloat vadd_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
+  return vadd_vf_vf_vf(vadd_vf_vf_vf(v0, v1), v2);
+}
+
+static INLINE CONST vfloat vadd_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
+  return vadd_vf_3vf(vadd_vf_vf_vf(v0, v1), v2, v3);
+}
+
+static INLINE CONST vfloat vadd_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
+  return vadd_vf_4vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4);
+}
+
+static INLINE CONST vfloat vadd_vf_6vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5) {
+  return vadd_vf_5vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5);
+}
+
+static INLINE CONST vfloat vadd_vf_7vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4, vfloat v5, vfloat v6) {
+  return vadd_vf_6vf(vadd_vf_vf_vf(v0, v1), v2, v3, v4, v5, v6);
+}
+
+static INLINE CONST vfloat vsub_vf_3vf(vfloat v0, vfloat v1, vfloat v2) {
+  return vsub_vf_vf_vf(vsub_vf_vf_vf(v0, v1), v2);
+}
+
+static INLINE CONST vfloat vsub_vf_4vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3) {
+  return vsub_vf_3vf(vsub_vf_vf_vf(v0, v1), v2, v3);
+}
+
+static INLINE CONST vfloat vsub_vf_5vf(vfloat v0, vfloat v1, vfloat v2, vfloat v3, vfloat v4) {
+  return vsub_vf_4vf(vsub_vf_vf_vf(v0, v1), v2, v3, v4);
+}
+
+//
+
+static INLINE CONST vfloat2 dfneg_vf2_vf2(vfloat2 x) {
+  return vcast_vf2_vf_vf(vneg_vf_vf(x.x), vneg_vf_vf(x.y));
+}
+
+static INLINE CONST vfloat2 dfabs_vf2_vf2(vfloat2 x) {
+  return vcast_vf2_vf_vf(vabs_vf_vf(x.x),
+			 vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x.y), vand_vm_vm_vm(vreinterpret_vm_vf(x.x), vreinterpret_vm_vf(vcast_vf_f(-0.0f))))));
+}
+
+static INLINE CONST vfloat2 dfnormalize_vf2_vf2(vfloat2 t) {
+  vfloat2 s;
+
+  s.x = vadd_vf_vf_vf(t.x, t.y);
+  s.y = vadd_vf_vf_vf(vsub_vf_vf_vf(t.x, s.x), t.y);
+
+  return s;
+}
+
+static INLINE CONST vfloat2 dfscale_vf2_vf2_vf(vfloat2 d, vfloat s) {
+  vfloat2 r = {vmul_vf_vf_vf(d.x, s), vmul_vf_vf_vf(d.y, s)};
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd_vf2_vf_vf(vfloat x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vadd_vf_vf_vf(x, y);
+  r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x, r.x), y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd2_vf2_vf_vf(vfloat x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vadd_vf_vf_vf(x, y);
+  vfloat v = vsub_vf_vf_vf(r.x, x);
+  r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y, v));
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd2_vf2_vf_vf2(vfloat x, vfloat2 y) {
+  vfloat2 r;
+
+  r.x  = vadd_vf_vf_vf(x, y.x);
+  vfloat v = vsub_vf_vf_vf(r.x, x);
+  r.y = vadd_vf_vf_vf(vadd_vf_vf_vf(vsub_vf_vf_vf(x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y.x, v)), y.y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd_vf2_vf2_vf(vfloat2 x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vadd_vf_vf_vf(x.x, y);
+  r.y = vadd_vf_3vf(vsub_vf_vf_vf(x.x, r.x), y, x.y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfsub_vf2_vf2_vf(vfloat2 x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vsub_vf_vf_vf(x.x, y);
+  r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(vsub_vf_vf_vf(x.x, r.x), y), x.y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd2_vf2_vf2_vf(vfloat2 x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vadd_vf_vf_vf(x.x, y);
+  vfloat v = vsub_vf_vf_vf(r.x, x.x);
+  r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x.x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y, v));
+  r.y = vadd_vf_vf_vf(r.y, x.y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd_vf2_vf_vf2(vfloat x, vfloat2 y) {
+  vfloat2 r;
+
+  r.x = vadd_vf_vf_vf(x, y.x);
+  r.y = vadd_vf_3vf(vsub_vf_vf_vf(x, r.x), y.x, y.y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
+  // |x| >= |y|
+
+  vfloat2 r;
+
+  r.x = vadd_vf_vf_vf(x.x, y.x);
+  r.y = vadd_vf_4vf(vsub_vf_vf_vf(x.x, r.x), y.x, x.y, y.y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfadd2_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
+  vfloat2 r;
+
+  r.x  = vadd_vf_vf_vf(x.x, y.x);
+  vfloat v = vsub_vf_vf_vf(r.x, x.x);
+  r.y = vadd_vf_vf_vf(vsub_vf_vf_vf(x.x, vsub_vf_vf_vf(r.x, v)), vsub_vf_vf_vf(y.x, v));
+  r.y = vadd_vf_vf_vf(r.y, vadd_vf_vf_vf(x.y, y.y));
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfsub_vf2_vf_vf(vfloat x, vfloat y) {
+  // |x| >= |y|
+
+  vfloat2 r;
+
+  r.x = vsub_vf_vf_vf(x, y);
+  r.y = vsub_vf_vf_vf(vsub_vf_vf_vf(x, r.x), y);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfsub_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
+  // |x| >= |y|
+
+  vfloat2 r;
+
+  r.x = vsub_vf_vf_vf(x.x, y.x);
+  r.y = vsub_vf_vf_vf(x.x, r.x);
+  r.y = vsub_vf_vf_vf(r.y, y.x);
+  r.y = vadd_vf_vf_vf(r.y, x.y);
+  r.y = vsub_vf_vf_vf(r.y, y.y);
+
+  return r;
+}
+
+#ifdef ENABLE_FMA_SP
+static INLINE CONST vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
+  vfloat2 q;
+  vfloat t = vrec_vf_vf(d.x), u;
+
+  q.x = vmul_vf_vf_vf(n.x, t);
+  u = vfmapn_vf_vf_vf_vf(t, n.x, q.x);
+  q.y = vfmanp_vf_vf_vf_vf(d.y, t, vfmanp_vf_vf_vf_vf(d.x, t, vcast_vf_f(1)));
+  q.y = vfma_vf_vf_vf_vf(q.x, q.y, vfma_vf_vf_vf_vf(n.y, t, u));
+
+  return q;
+}
+
+static INLINE CONST vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x, y);
+  r.y = vfmapn_vf_vf_vf_vf(x, y, r.x);
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x.x, x.x);
+  r.y = vfma_vf_vf_vf_vf(vadd_vf_vf_vf(x.x, x.x), x.y, vfmapn_vf_vf_vf_vf(x.x, x.x, r.x));
+
+  return r;
+}
+
+static INLINE CONST vfloat dfsqu_vf_vf2(vfloat2 x) {
+  return vfma_vf_vf_vf_vf(x.x, x.x, vadd_vf_vf_vf(vmul_vf_vf_vf(x.x, x.y), vmul_vf_vf_vf(x.x, x.y)));
+}
+
+static INLINE CONST vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x.x, y.x);
+  r.y = vfma_vf_vf_vf_vf(x.x, y.y, vfma_vf_vf_vf_vf(x.y, y.x, vfmapn_vf_vf_vf_vf(x.x, y.x, r.x)));
+
+  return r;
+}
+
+static INLINE CONST vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
+  return vfma_vf_vf_vf_vf(x.x, y.x, vfma_vf_vf_vf_vf(x.y, y.x, vmul_vf_vf_vf(x.x, y.y)));
+}
+
+static INLINE CONST vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x.x, y);
+  r.y = vfma_vf_vf_vf_vf(x.y, y, vfmapn_vf_vf_vf_vf(x.x, y, r.x));
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfrec_vf2_vf(vfloat d) {
+  vfloat2 q;
+
+  q.x = vrec_vf_vf(d);
+  q.y = vmul_vf_vf_vf(q.x, vfmanp_vf_vf_vf_vf(d, q.x, vcast_vf_f(1)));
+
+  return q;
+}
+
+static INLINE CONST vfloat2 dfrec_vf2_vf2(vfloat2 d) {
+  vfloat2 q;
+
+  q.x = vrec_vf_vf(d.x);
+  q.y = vmul_vf_vf_vf(q.x, vfmanp_vf_vf_vf_vf(d.y, q.x, vfmanp_vf_vf_vf_vf(d.x, q.x, vcast_vf_f(1))));
+
+  return q;
+}
+#else
+static INLINE CONST vfloat2 dfdiv_vf2_vf2_vf2(vfloat2 n, vfloat2 d) {
+  vfloat t = vrec_vf_vf(d.x);
+  vfloat dh  = vupper_vf_vf(d.x), dl  = vsub_vf_vf_vf(d.x,  dh);
+  vfloat th  = vupper_vf_vf(t  ), tl  = vsub_vf_vf_vf(t  ,  th);
+  vfloat nhh = vupper_vf_vf(n.x), nhl = vsub_vf_vf_vf(n.x, nhh);
+
+  vfloat2 q;
+
+  q.x = vmul_vf_vf_vf(n.x, t);
+
+  vfloat u, w;
+  w = vcast_vf_f(-1);
+  w = vmla_vf_vf_vf_vf(dh, th, w);
+  w = vmla_vf_vf_vf_vf(dh, tl, w);
+  w = vmla_vf_vf_vf_vf(dl, th, w);
+  w = vmla_vf_vf_vf_vf(dl, tl, w);
+  w = vneg_vf_vf(w);
+
+  u = vmla_vf_vf_vf_vf(nhh, th, vneg_vf_vf(q.x));
+  u = vmla_vf_vf_vf_vf(nhh, tl, u);
+  u = vmla_vf_vf_vf_vf(nhl, th, u);
+  u = vmla_vf_vf_vf_vf(nhl, tl, u);
+  u = vmla_vf_vf_vf_vf(q.x, w , u);
+
+  q.y = vmla_vf_vf_vf_vf(t, vsub_vf_vf_vf(n.y, vmul_vf_vf_vf(q.x, d.y)), u);
+
+  return q;
+}
+
+static INLINE CONST vfloat2 dfmul_vf2_vf_vf(vfloat x, vfloat y) {
+  vfloat xh = vupper_vf_vf(x), xl = vsub_vf_vf_vf(x, xh);
+  vfloat yh = vupper_vf_vf(y), yl = vsub_vf_vf_vf(y, yh);
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x, y);
+
+  vfloat t;
+  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x));
+  t = vmla_vf_vf_vf_vf(xl, yh, t);
+  t = vmla_vf_vf_vf_vf(xh, yl, t);
+  t = vmla_vf_vf_vf_vf(xl, yl, t);
+  r.y = t;
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfmul_vf2_vf2_vf(vfloat2 x, vfloat y) {
+  vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh);
+  vfloat yh = vupper_vf_vf(y  ), yl = vsub_vf_vf_vf(y  , yh);
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x.x, y);
+
+  vfloat t;
+  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x));
+  t = vmla_vf_vf_vf_vf(xl, yh, t);
+  t = vmla_vf_vf_vf_vf(xh, yl, t);
+  t = vmla_vf_vf_vf_vf(xl, yl, t);
+  t = vmla_vf_vf_vf_vf(x.y, y, t);
+  r.y = t;
+
+  return r;
+}
+
+static INLINE CONST vfloat2 dfmul_vf2_vf2_vf2(vfloat2 x, vfloat2 y) {
+  vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh);
+  vfloat yh = vupper_vf_vf(y.x), yl = vsub_vf_vf_vf(y.x, yh);
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x.x, y.x);
+
+  vfloat t;
+  t = vmla_vf_vf_vf_vf(xh, yh, vneg_vf_vf(r.x));
+  t = vmla_vf_vf_vf_vf(xl, yh, t);
+  t = vmla_vf_vf_vf_vf(xh, yl, t);
+  t = vmla_vf_vf_vf_vf(xl, yl, t);
+  t = vmla_vf_vf_vf_vf(x.x, y.y, t);
+  t = vmla_vf_vf_vf_vf(x.y, y.x, t);
+  r.y = t;
+
+  return r;
+}
+
+static INLINE CONST vfloat dfmul_vf_vf2_vf2(vfloat2 x, vfloat2 y) {
+  vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh);
+  vfloat yh = vupper_vf_vf(y.x), yl = vsub_vf_vf_vf(y.x, yh);
+
+  return vadd_vf_6vf(vmul_vf_vf_vf(x.y, yh), vmul_vf_vf_vf(xh, y.y), vmul_vf_vf_vf(xl, yl), vmul_vf_vf_vf(xh, yl), vmul_vf_vf_vf(xl, yh), vmul_vf_vf_vf(xh, yh));
+}
+
+static INLINE CONST vfloat2 dfsqu_vf2_vf2(vfloat2 x) {
+  vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh);
+  vfloat2 r;
+
+  r.x = vmul_vf_vf_vf(x.x, x.x);
+
+  vfloat t;
+  t = vmla_vf_vf_vf_vf(xh, xh, vneg_vf_vf(r.x));
+  t = vmla_vf_vf_vf_vf(vadd_vf_vf_vf(xh, xh), xl, t);
+  t = vmla_vf_vf_vf_vf(xl, xl, t);
+  t = vmla_vf_vf_vf_vf(x.x, vadd_vf_vf_vf(x.y, x.y), t);
+  r.y = t;
+
+  return r;
+}
+
+static INLINE CONST vfloat dfsqu_vf_vf2(vfloat2 x) {
+  vfloat xh = vupper_vf_vf(x.x), xl = vsub_vf_vf_vf(x.x, xh);
+
+  return vadd_vf_5vf(vmul_vf_vf_vf(xh, x.y), vmul_vf_vf_vf(xh, x.y), vmul_vf_vf_vf(xl, xl), vadd_vf_vf_vf(vmul_vf_vf_vf(xh, xl), vmul_vf_vf_vf(xh, xl)), vmul_vf_vf_vf(xh, xh));
+}
+
+static INLINE CONST vfloat2 dfrec_vf2_vf(vfloat d) {
+  vfloat t = vrec_vf_vf(d);
+  vfloat dh = vupper_vf_vf(d), dl = vsub_vf_vf_vf(d, dh);
+  vfloat th = vupper_vf_vf(t), tl = vsub_vf_vf_vf(t, th);
+  vfloat2 q;
+
+  q.x = t;
+
+  vfloat u = vcast_vf_f(-1);
+  u = vmla_vf_vf_vf_vf(dh, th, u);
+  u = vmla_vf_vf_vf_vf(dh, tl, u);
+  u = vmla_vf_vf_vf_vf(dl, th, u);
+  u = vmla_vf_vf_vf_vf(dl, tl, u);
+  q.y = vmul_vf_vf_vf(vneg_vf_vf(t), u);
+
+  return q;
+}
+
+static INLINE CONST vfloat2 dfrec_vf2_vf2(vfloat2 d) {
+  vfloat t = vrec_vf_vf(d.x);
+  vfloat dh = vupper_vf_vf(d.x), dl = vsub_vf_vf_vf(d.x, dh);
+  vfloat th = vupper_vf_vf(t  ), tl = vsub_vf_vf_vf(t  , th);
+  vfloat2 q;
+
+  q.x = t;
+
+  vfloat u = vcast_vf_f(-1);
+  u = vmla_vf_vf_vf_vf(dh, th, u);
+  u = vmla_vf_vf_vf_vf(dh, tl, u);
+  u = vmla_vf_vf_vf_vf(dl, th, u);
+  u = vmla_vf_vf_vf_vf(dl, tl, u);
+  u = vmla_vf_vf_vf_vf(d.y, t, u);
+  q.y = vmul_vf_vf_vf(vneg_vf_vf(t), u);
+
+  return q;
+}
+#endif
+
+static INLINE CONST vfloat2 dfsqrt_vf2_vf2(vfloat2 d) {
+#ifdef ENABLE_RECSQRT_SP
+  vfloat x = vrecsqrt_vf_vf(vadd_vf_vf_vf(d.x, d.y));
+  vfloat2 r = dfmul_vf2_vf2_vf(d, x);
+  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(r, dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(r, x), vcast_vf_f(-3.0))), vcast_vf_f(-0.5));
+#else
+  vfloat t = vsqrt_vf_vf(vadd_vf_vf_vf(d.x, d.y));
+  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5));
+#endif
+}
+
+static INLINE CONST vfloat2 dfsqrt_vf2_vf(vfloat d) {
+  vfloat t = vsqrt_vf_vf(d);
+  return dfscale_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(t, t)), dfrec_vf2_vf(t)), vcast_vf_f(0.5f));
+}
diff --git a/lib/kernel/sleef/libm/rename.h b/lib/kernel/sleef/libm/rename.h
new file mode 100644
index 0000000..5042670
--- /dev/null
+++ b/lib/kernel/sleef/libm/rename.h
@@ -0,0 +1,143 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+#define xsin Sleef_sin_u35
+#define xcos Sleef_cos_u35
+#define xsincos Sleef_sincos_u35
+#define xtan Sleef_tan_u35
+#define xasin Sleef_asin_u35
+#define xacos Sleef_acos_u35
+#define xatan Sleef_atan_u35
+#define xatan2 Sleef_atan2_u35
+#define xlog Sleef_log_u35
+#define xcbrt Sleef_cbrt_u35
+#define xsin_u1 Sleef_sin_u10
+#define xcos_u1 Sleef_cos_u10
+#define xsincos_u1 Sleef_sincos_u10
+#define xtan_u1 Sleef_tan_u10
+#define xasin_u1 Sleef_asin_u10
+#define xacos_u1 Sleef_acos_u10
+#define xatan_u1 Sleef_atan_u10
+#define xatan2_u1 Sleef_atan2_u10
+#define xlog_u1 Sleef_log_u10
+#define xcbrt_u1 Sleef_cbrt_u10
+#define xexp Sleef_exp_u10
+#define xpow Sleef_pow_u10
+#define xsinh Sleef_sinh_u10
+#define xcosh Sleef_cosh_u10
+#define xtanh Sleef_tanh_u10
+#define xasinh Sleef_asinh_u10
+#define xacosh Sleef_acosh_u10
+#define xatanh Sleef_atanh_u10
+#define xexp2 Sleef_exp2_u10
+#define xexp10 Sleef_exp10_u10
+#define xexpm1 Sleef_expm1_u10
+#define xlog10 Sleef_log10_u10
+#define xlog1p Sleef_log1p_u10
+#define xsincospi_u05 Sleef_sincospi_u05
+#define xsincospi_u35 Sleef_sincospi_u35
+#define xsinpi_u05 Sleef_sinpi_u05
+#define xcospi_u05 Sleef_cospi_u05
+#define xldexp Sleef_ldexp
+#define xilogb Sleef_ilogb
+#define xfma Sleef_fma
+#define xsqrt_u05 Sleef_sqrt_u05
+#define xhypot_u05 Sleef_hypot_u05
+#define xhypot_u35 Sleef_hypot_u35
+#define xfabs Sleef_fabs
+#define xcopysign Sleef_copysign
+#define xfmax Sleef_fmax
+#define xfmin Sleef_fmin
+#define xfdim Sleef_fdim
+#define xtrunc Sleef_trunc
+#define xfloor Sleef_floor
+#define xceil Sleef_ceil
+#define xround Sleef_round
+#define xrint Sleef_rint
+#define xnextafter Sleef_nextafter
+#define xfrfrexp Sleef_frfrexp
+#define xexpfrexp Sleef_expfrexp
+#define xfmod Sleef_fmod
+#define xmodf Sleef_modf
+#define xlgamma_u1 Sleef_lgamma_u10
+#define xlgamma_r_u1 Sleef_lgamma_r_u10
+#define xtgamma_u1 Sleef_tgamma_u10
+#define xerf_u1 Sleef_erf_u10
+#define xerfc_u15 Sleef_erfc_u15
+
+//
+
+#define xsinf Sleef_sinf_u35
+#define xcosf Sleef_cosf_u35
+#define xsincosf Sleef_sincosf_u35
+#define xtanf Sleef_tanf_u35
+#define xasinf Sleef_asinf_u35
+#define xacosf Sleef_acosf_u35
+#define xatanf Sleef_atanf_u35
+#define xatan2f Sleef_atan2f_u35
+#define xlogf Sleef_logf_u35
+#define xcbrtf Sleef_cbrtf_u35
+#define xsinf_u1 Sleef_sinf_u10
+#define xcosf_u1 Sleef_cosf_u10
+#define xsincosf_u1 Sleef_sincosf_u10
+#define xtanf_u1 Sleef_tanf_u10
+#define xasinf_u1 Sleef_asinf_u10
+#define xacosf_u1 Sleef_acosf_u10
+#define xatanf_u1 Sleef_atanf_u10
+#define xatan2f_u1 Sleef_atan2f_u10
+#define xlogf_u1 Sleef_logf_u10
+#define xcbrtf_u1 Sleef_cbrtf_u10
+#define xexpf Sleef_expf_u10
+#define xpowf Sleef_powf_u10
+#define xsinhf Sleef_sinhf_u10
+#define xcoshf Sleef_coshf_u10
+#define xtanhf Sleef_tanhf_u10
+#define xasinhf Sleef_asinhf_u10
+#define xacoshf Sleef_acoshf_u10
+#define xatanhf Sleef_atanhf_u10
+#define xexp2f Sleef_exp2f_u10
+#define xexp10f Sleef_exp10f_u10
+#define xexpm1f Sleef_expm1f_u10
+#define xlog10f Sleef_log10f_u10
+#define xlog1pf Sleef_log1pf_u10
+#define xsincospif_u05 Sleef_sincospif_u05
+#define xsincospif_u35 Sleef_sincospif_u35
+#define xsinpif_u05 Sleef_sinpif_u05
+#define xcospif_u05 Sleef_cospif_u05
+#define xldexpf Sleef_ldexpf
+#define xilogbf Sleef_ilogbf
+#define xfmaf Sleef_fmaf
+#define xsqrtf_u05 Sleef_sqrtf_u05
+#define xsqrtf_u35 Sleef_sqrtf_u35
+#define xhypotf_u05 Sleef_hypotf_u05
+#define xhypotf_u35 Sleef_hypotf_u35
+#define xfabsf Sleef_fabsf
+#define xcopysignf Sleef_copysignf
+#define xfmaxf Sleef_fmaxf
+#define xfminf Sleef_fminf
+#define xfdimf Sleef_fdimf
+#define xtruncf Sleef_truncf
+#define xfloorf Sleef_floorf
+#define xceilf Sleef_ceilf
+#define xroundf Sleef_roundf
+#define xrintf Sleef_rintf
+#define xnextafterf Sleef_nextafterf
+#define xfrfrexpf Sleef_frfrexpf
+#define xexpfrexpf Sleef_expfrexpf
+#define xfmodf Sleef_fmodf
+#define xmodff Sleef_modff
+#define xlgammaf_u1 Sleef_lgammaf_u10
+#define xlgamma_rf_u1 Sleef_lgamma_rf_u10
+#define xtgammaf_u1 Sleef_tgammaf_u10
+#define xerff_u1 Sleef_erff_u10
+#define xerfcf_u15 Sleef_erfcf_u15
+/***********************/
+#define xlog1p_fast Sleef_log1p_fast_u10
+#define xlog1pf_fast Sleef_log1pf_fast_u10
+
+#define xpown Sleef_pown_u10
+#define xpownf Sleef_pownf_u10
+#define xpowr Sleef_powr_u10
+#define xpowrf Sleef_powrf_u10
diff --git a/lib/kernel/sleef/libm/rename_vec128.h b/lib/kernel/sleef/libm/rename_vec128.h
new file mode 100644
index 0000000..09b9e4d
--- /dev/null
+++ b/lib/kernel/sleef/libm/rename_vec128.h
@@ -0,0 +1,137 @@
+#define xsin Sleef_sind2_u35_intrin
+#define xcos Sleef_cosd2_u35_intrin
+#define xsincos Sleef_sincosd2_u35_intrin
+#define xtan Sleef_tand2_u35_intrin
+#define xasin Sleef_asind2_u35_intrin
+#define xacos Sleef_acosd2_u35_intrin
+#define xatan Sleef_atand2_u35_intrin
+#define xatan2 Sleef_atan2d2_u35_intrin
+#define xlog Sleef_logd2_u35_intrin
+#define xcbrt Sleef_cbrtd2_u35_intrin
+#define xsin_u1 Sleef_sind2_u10_intrin
+#define xcos_u1 Sleef_cosd2_u10_intrin
+#define xsincos_u1 Sleef_sincosd2_u10_intrin
+#define xtan_u1 Sleef_tand2_u10_intrin
+#define xasin_u1 Sleef_asind2_u10_intrin
+#define xacos_u1 Sleef_acosd2_u10_intrin
+#define xatan_u1 Sleef_atand2_u10_intrin
+#define xatan2_u1 Sleef_atan2d2_u10_intrin
+#define xlog_u1 Sleef_logd2_u10_intrin
+#define xcbrt_u1 Sleef_cbrtd2_u10_intrin
+#define xexp Sleef_expd2_u10_intrin
+#define xpow Sleef_powd2_u10_intrin
+#define xsinh Sleef_sinhd2_u10_intrin
+#define xcosh Sleef_coshd2_u10_intrin
+#define xtanh Sleef_tanhd2_u10_intrin
+#define xasinh Sleef_asinhd2_u10_intrin
+#define xacosh Sleef_acoshd2_u10_intrin
+#define xatanh Sleef_atanhd2_u10_intrin
+#define xexp2 Sleef_exp2d2_u10_intrin
+#define xexp10 Sleef_exp10d2_u10_intrin
+#define xexpm1 Sleef_expm1d2_u10_intrin
+#define xlog10 Sleef_log10d2_u10_intrin
+#define xlog1p Sleef_log1pd2_u10_intrin
+#define xsincospi_u05 Sleef_sincospid2_u05_intrin
+#define xsincospi_u35 Sleef_sincospid2_u35_intrin
+#define xsinpi_u05 Sleef_sinpid2_u05_intrin
+#define xcospi_u05 Sleef_cospid2_u05_intrin
+#define xldexp Sleef_ldexpd2_intrin
+#define xilogb Sleef_ilogbd2_intrin
+#define xfma Sleef_fmad2_intrin
+#define xsqrt_u05 Sleef_sqrtd2_u05_intrin
+#define xsqrt_u35 Sleef_sqrtd2_u35_intrin
+#define xhypot_u05 Sleef_hypotd2_u05_intrin
+#define xhypot_u35 Sleef_hypotd2_u35_intrin
+#define xfabs Sleef_fabsd2_intrin
+#define xcopysign Sleef_copysignd2_intrin
+#define xfmax Sleef_fmaxd2_intrin
+#define xfmin Sleef_fmind2_intrin
+#define xfdim Sleef_fdimd2_intrin
+#define xtrunc Sleef_truncd2_intrin
+#define xfloor Sleef_floord2_intrin
+#define xceil Sleef_ceild2_intrin
+#define xround Sleef_roundd2_intrin
+#define xrint Sleef_rintd2_intrin
+#define xnextafter Sleef_nextafterd2_intrin
+#define xfrfrexp Sleef_frfrexpd2_intrin
+#define xexpfrexp Sleef_expfrexpd2_intrin
+#define xfmod Sleef_fmodd2_intrin
+#define xmodf Sleef_modfd2_intrin
+#define xlgamma_u1 Sleef_lgammad2_u10_intrin
+#define xlgamma_r_u1 Sleef_lgamma_rd2_u10_intrin
+#define xtgamma_u1 Sleef_tgammad2_u10_intrin
+#define xerf_u1 Sleef_erfd2_u10_intrin
+#define xerfc_u15 Sleef_erfcd2_u15_intrin
+
+#define xsinf Sleef_sinf4_u35_intrin
+#define xcosf Sleef_cosf4_u35_intrin
+#define xsincosf Sleef_sincosf4_u35_intrin
+#define xtanf Sleef_tanf4_u35_intrin
+#define xasinf Sleef_asinf4_u35_intrin
+#define xacosf Sleef_acosf4_u35_intrin
+#define xatanf Sleef_atanf4_u35_intrin
+#define xatan2f Sleef_atan2f4_u35_intrin
+#define xlogf Sleef_logf4_u35_intrin
+#define xcbrtf Sleef_cbrtf4_u35_intrin
+#define xsinf_u1 Sleef_sinf4_u10_intrin
+#define xcosf_u1 Sleef_cosf4_u10_intrin
+#define xsincosf_u1 Sleef_sincosf4_u10_intrin
+#define xtanf_u1 Sleef_tanf4_u10_intrin
+#define xasinf_u1 Sleef_asinf4_u10_intrin
+#define xacosf_u1 Sleef_acosf4_u10_intrin
+#define xatanf_u1 Sleef_atanf4_u10_intrin
+#define xatan2f_u1 Sleef_atan2f4_u10_intrin
+#define xlogf_u1 Sleef_logf4_u10_intrin
+#define xcbrtf_u1 Sleef_cbrtf4_u10_intrin
+#define xexpf Sleef_expf4_u10_intrin
+#define xpowf Sleef_powf4_u10_intrin
+#define xsinhf Sleef_sinhf4_u10_intrin
+#define xcoshf Sleef_coshf4_u10_intrin
+#define xtanhf Sleef_tanhf4_u10_intrin
+#define xasinhf Sleef_asinhf4_u10_intrin
+#define xacoshf Sleef_acoshf4_u10_intrin
+#define xatanhf Sleef_atanhf4_u10_intrin
+#define xexp2f Sleef_exp2f4_u10_intrin
+#define xexp10f Sleef_exp10f4_u10_intrin
+#define xexpm1f Sleef_expm1f4_u10_intrin
+#define xlog10f Sleef_log10f4_u10_intrin
+#define xlog1pf Sleef_log1pf4_u10_intrin
+#define xsincospif_u05 Sleef_sincospif4_u05_intrin
+#define xsincospif_u35 Sleef_sincospif4_u35_intrin
+#define xsinpif_u05 Sleef_sinpif4_u05_intrin
+#define xcospif_u05 Sleef_cospif4_u05_intrin
+#define xldexpf Sleef_ldexpf4_intrin
+#define xilogbf Sleef_ilogbf4_intrin
+#define xfmaf Sleef_fmaf4_intrin
+#define xsqrtf_u05 Sleef_sqrtf4_u05_intrin
+#define xsqrtf_u35 Sleef_sqrtf4_u35_intrin
+#define xhypotf_u05 Sleef_hypotf4_u05_intrin
+#define xhypotf_u35 Sleef_hypotf4_u35_intrin
+#define xfabsf Sleef_fabsf4_intrin
+#define xcopysignf Sleef_copysignf4_intrin
+#define xfmaxf Sleef_fmaxf4_intrin
+#define xfminf Sleef_fminf4_intrin
+#define xfdimf Sleef_fdimf4_intrin
+#define xtruncf Sleef_truncf4_intrin
+#define xfloorf Sleef_floorf4_intrin
+#define xceilf Sleef_ceilf4_intrin
+#define xroundf Sleef_roundf4_intrin
+#define xrintf Sleef_rintf4_intrin
+#define xnextafterf Sleef_nextafterf4_intrin
+#define xfrfrexpf Sleef_frfrexpf4_intrin
+#define xexpfrexpf Sleef_expfrexpf4_intrin
+#define xfmodf Sleef_fmodf4_intrin
+#define xmodff Sleef_modff4_intrin
+#define xlgammaf_u1 Sleef_lgammaf4_u10_intrin
+#define xlgamma_rf_u1 Sleef_lgamma_rf4_u10_intrin
+#define xtgammaf_u1 Sleef_tgammaf4_u10_intrin
+#define xerff_u1 Sleef_erff4_u10_intrin
+#define xerfcf_u15 Sleef_erfcf4_u15_intrin
+/***********************/
+#define xlog1p_fast Sleef_log1p_fast_d2_u10_intrin
+#define xlog1pf_fast Sleef_log1p_fast_f4_u10_intrin
+
+#define xpown Sleef_pownd2_u10_intrin
+#define xpownf Sleef_pownf4_u10_intrin
+#define xpowr Sleef_powrd2_u10_intrin
+#define xpowrf Sleef_powrf4_u10_intrin
diff --git a/lib/kernel/sleef/libm/rename_vec256.h b/lib/kernel/sleef/libm/rename_vec256.h
new file mode 100644
index 0000000..32e143b
--- /dev/null
+++ b/lib/kernel/sleef/libm/rename_vec256.h
@@ -0,0 +1,137 @@
+#define xsin Sleef_sind4_u35_intrin
+#define xcos Sleef_cosd4_u35_intrin
+#define xsincos Sleef_sincosd4_u35_intrin
+#define xtan Sleef_tand4_u35_intrin
+#define xasin Sleef_asind4_u35_intrin
+#define xacos Sleef_acosd4_u35_intrin
+#define xatan Sleef_atand4_u35_intrin
+#define xatan2 Sleef_atan2d4_u35_intrin
+#define xlog Sleef_logd4_u35_intrin
+#define xcbrt Sleef_cbrtd4_u35_intrin
+#define xsin_u1 Sleef_sind4_u10_intrin
+#define xcos_u1 Sleef_cosd4_u10_intrin
+#define xsincos_u1 Sleef_sincosd4_u10_intrin
+#define xtan_u1 Sleef_tand4_u10_intrin
+#define xasin_u1 Sleef_asind4_u10_intrin
+#define xacos_u1 Sleef_acosd4_u10_intrin
+#define xatan_u1 Sleef_atand4_u10_intrin
+#define xatan2_u1 Sleef_atan2d4_u10_intrin
+#define xlog_u1 Sleef_logd4_u10_intrin
+#define xcbrt_u1 Sleef_cbrtd4_u10_intrin
+#define xexp Sleef_expd4_u10_intrin
+#define xpow Sleef_powd4_u10_intrin
+#define xsinh Sleef_sinhd4_u10_intrin
+#define xcosh Sleef_coshd4_u10_intrin
+#define xtanh Sleef_tanhd4_u10_intrin
+#define xasinh Sleef_asinhd4_u10_intrin
+#define xacosh Sleef_acoshd4_u10_intrin
+#define xatanh Sleef_atanhd4_u10_intrin
+#define xexp2 Sleef_exp2d4_u10_intrin
+#define xexp10 Sleef_exp10d4_u10_intrin
+#define xexpm1 Sleef_expm1d4_u10_intrin
+#define xlog10 Sleef_log10d4_u10_intrin
+#define xlog1p Sleef_log1pd4_u10_intrin
+#define xsincospi_u05 Sleef_sincospid4_u05_intrin
+#define xsincospi_u35 Sleef_sincospid4_u35_intrin
+#define xsinpi_u05 Sleef_sinpid4_u05_intrin
+#define xcospi_u05 Sleef_cospid4_u05_intrin
+#define xldexp Sleef_ldexpd4_intrin
+#define xilogb Sleef_ilogbd4_intrin
+#define xfma Sleef_fmad4_intrin
+#define xsqrt_u05 Sleef_sqrtd4_u05_intrin
+#define xsqrt_u35 Sleef_sqrtd4_u35_intrin
+#define xhypot_u05 Sleef_hypotd4_u05_intrin
+#define xhypot_u35 Sleef_hypotd4_u35_intrin
+#define xfabs Sleef_fabsd4_intrin
+#define xcopysign Sleef_copysignd4_intrin
+#define xfmax Sleef_fmaxd4_intrin
+#define xfmin Sleef_fmind4_intrin
+#define xfdim Sleef_fdimd4_intrin
+#define xtrunc Sleef_truncd4_intrin
+#define xfloor Sleef_floord4_intrin
+#define xceil Sleef_ceild4_intrin
+#define xround Sleef_roundd4_intrin
+#define xrint Sleef_rintd4_intrin
+#define xnextafter Sleef_nextafterd4_intrin
+#define xfrfrexp Sleef_frfrexpd4_intrin
+#define xexpfrexp Sleef_expfrexpd4_intrin
+#define xfmod Sleef_fmodd4_intrin
+#define xmodf Sleef_modfd4_intrin
+#define xlgamma_u1 Sleef_lgammad4_u10_intrin
+#define xlgamma_r_u1 Sleef_lgamma_rd4_u10_intrin
+#define xtgamma_u1 Sleef_tgammad4_u10_intrin
+#define xerf_u1 Sleef_erfd4_u10_intrin
+#define xerfc_u15 Sleef_erfcd4_u15_intrin
+
+#define xsinf Sleef_sinf8_u35_intrin
+#define xcosf Sleef_cosf8_u35_intrin
+#define xsincosf Sleef_sincosf8_u35_intrin
+#define xtanf Sleef_tanf8_u35_intrin
+#define xasinf Sleef_asinf8_u35_intrin
+#define xacosf Sleef_acosf8_u35_intrin
+#define xatanf Sleef_atanf8_u35_intrin
+#define xatan2f Sleef_atan2f8_u35_intrin
+#define xlogf Sleef_logf8_u35_intrin
+#define xcbrtf Sleef_cbrtf8_u35_intrin
+#define xsinf_u1 Sleef_sinf8_u10_intrin
+#define xcosf_u1 Sleef_cosf8_u10_intrin
+#define xsincosf_u1 Sleef_sincosf8_u10_intrin
+#define xtanf_u1 Sleef_tanf8_u10_intrin
+#define xasinf_u1 Sleef_asinf8_u10_intrin
+#define xacosf_u1 Sleef_acosf8_u10_intrin
+#define xatanf_u1 Sleef_atanf8_u10_intrin
+#define xatan2f_u1 Sleef_atan2f8_u10_intrin
+#define xlogf_u1 Sleef_logf8_u10_intrin
+#define xcbrtf_u1 Sleef_cbrtf8_u10_intrin
+#define xexpf Sleef_expf8_u10_intrin
+#define xpowf Sleef_powf8_u10_intrin
+#define xsinhf Sleef_sinhf8_u10_intrin
+#define xcoshf Sleef_coshf8_u10_intrin
+#define xtanhf Sleef_tanhf8_u10_intrin
+#define xasinhf Sleef_asinhf8_u10_intrin
+#define xacoshf Sleef_acoshf8_u10_intrin
+#define xatanhf Sleef_atanhf8_u10_intrin
+#define xexp2f Sleef_exp2f8_u10_intrin
+#define xexp10f Sleef_exp10f8_u10_intrin
+#define xexpm1f Sleef_expm1f8_u10_intrin
+#define xlog10f Sleef_log10f8_u10_intrin
+#define xlog1pf Sleef_log1pf8_u10_intrin
+#define xsincospif_u05 Sleef_sincospif8_u05_intrin
+#define xsincospif_u35 Sleef_sincospif8_u35_intrin
+#define xsinpif_u05 Sleef_sinpif8_u05_intrin
+#define xcospif_u05 Sleef_cospif8_u05_intrin
+#define xldexpf Sleef_ldexpf8_intrin
+#define xilogbf Sleef_ilogbf8_intrin
+#define xfmaf Sleef_fmaf8_intrin
+#define xsqrtf_u05 Sleef_sqrtf8_u05_intrin
+#define xsqrtf_u35 Sleef_sqrtf8_u35_intrin
+#define xhypotf_u05 Sleef_hypotf8_u05_intrin
+#define xhypotf_u35 Sleef_hypotf8_u35_intrin
+#define xfabsf Sleef_fabsf8_intrin
+#define xcopysignf Sleef_copysignf8_intrin
+#define xfmaxf Sleef_fmaxf8_intrin
+#define xfminf Sleef_fminf8_intrin
+#define xfdimf Sleef_fdimf8_intrin
+#define xtruncf Sleef_truncf8_intrin
+#define xfloorf Sleef_floorf8_intrin
+#define xceilf Sleef_ceilf8_intrin
+#define xroundf Sleef_roundf8_intrin
+#define xrintf Sleef_rintf8_intrin
+#define xnextafterf Sleef_nextafterf8_intrin
+#define xfrfrexpf Sleef_frfrexpf8_intrin
+#define xexpfrexpf Sleef_expfrexpf8_intrin
+#define xfmodf Sleef_fmodf8_intrin
+#define xmodff Sleef_modff8_intrin
+#define xlgammaf_u1 Sleef_lgammaf8_u10_intrin
+#define xlgamma_rf_u1 Sleef_lgamma_rf8_u10_intrin
+#define xtgammaf_u1 Sleef_tgammaf8_u10_intrin
+#define xerff_u1 Sleef_erff8_u10_intrin
+#define xerfcf_u15 Sleef_erfcf8_u15_intrin
+/***********************/
+#define xlog1p_fast Sleef_log1p_fast_d4_u10_intrin
+#define xlog1pf_fast Sleef_log1p_fast_f8_u10_intrin
+
+#define xpown Sleef_pownd4_u10_intrin
+#define xpownf Sleef_pownf8_u10_intrin
+#define xpowr Sleef_powrd4_u10_intrin
+#define xpowrf Sleef_powrf8_u10_intrin
diff --git a/lib/kernel/sleef/libm/rename_vec512.h b/lib/kernel/sleef/libm/rename_vec512.h
new file mode 100644
index 0000000..0b12ffd
--- /dev/null
+++ b/lib/kernel/sleef/libm/rename_vec512.h
@@ -0,0 +1,137 @@
+#define xsin Sleef_sind8_u35_intrin
+#define xcos Sleef_cosd8_u35_intrin
+#define xsincos Sleef_sincosd8_u35_intrin
+#define xtan Sleef_tand8_u35_intrin
+#define xasin Sleef_asind8_u35_intrin
+#define xacos Sleef_acosd8_u35_intrin
+#define xatan Sleef_atand8_u35_intrin
+#define xatan2 Sleef_atan2d8_u35_intrin
+#define xlog Sleef_logd8_u35_intrin
+#define xcbrt Sleef_cbrtd8_u35_intrin
+#define xsin_u1 Sleef_sind8_u10_intrin
+#define xcos_u1 Sleef_cosd8_u10_intrin
+#define xsincos_u1 Sleef_sincosd8_u10_intrin
+#define xtan_u1 Sleef_tand8_u10_intrin
+#define xasin_u1 Sleef_asind8_u10_intrin
+#define xacos_u1 Sleef_acosd8_u10_intrin
+#define xatan_u1 Sleef_atand8_u10_intrin
+#define xatan2_u1 Sleef_atan2d8_u10_intrin
+#define xlog_u1 Sleef_logd8_u10_intrin
+#define xcbrt_u1 Sleef_cbrtd8_u10_intrin
+#define xexp Sleef_expd8_u10_intrin
+#define xpow Sleef_powd8_u10_intrin
+#define xsinh Sleef_sinhd8_u10_intrin
+#define xcosh Sleef_coshd8_u10_intrin
+#define xtanh Sleef_tanhd8_u10_intrin
+#define xasinh Sleef_asinhd8_u10_intrin
+#define xacosh Sleef_acoshd8_u10_intrin
+#define xatanh Sleef_atanhd8_u10_intrin
+#define xexp2 Sleef_exp2d8_u10_intrin
+#define xexp10 Sleef_exp10d8_u10_intrin
+#define xexpm1 Sleef_expm1d8_u10_intrin
+#define xlog10 Sleef_log10d8_u10_intrin
+#define xlog1p Sleef_log1pd8_u10_intrin
+#define xsincospi_u05 Sleef_sincospid8_u05_intrin
+#define xsincospi_u35 Sleef_sincospid8_u35_intrin
+#define xsinpi_u05 Sleef_sinpid8_u05_intrin
+#define xcospi_u05 Sleef_cospid8_u05_intrin
+#define xldexp Sleef_ldexpd8_intrin
+#define xilogb Sleef_ilogbd8_intrin
+#define xfma Sleef_fmad8_intrin
+#define xsqrt_u05 Sleef_sqrtd8_u05_intrin
+#define xsqrt_u35 Sleef_sqrtd8_u35_intrin
+#define xhypot_u05 Sleef_hypotd8_u05_intrin
+#define xhypot_u35 Sleef_hypotd8_u35_intrin
+#define xfabs Sleef_fabsd8_intrin
+#define xcopysign Sleef_copysignd8_intrin
+#define xfmax Sleef_fmaxd8_intrin
+#define xfmin Sleef_fmind8_intrin
+#define xfdim Sleef_fdimd8_intrin
+#define xtrunc Sleef_truncd8_intrin
+#define xfloor Sleef_floord8_intrin
+#define xceil Sleef_ceild8_intrin
+#define xround Sleef_roundd8_intrin
+#define xrint Sleef_rintd8_intrin
+#define xnextafter Sleef_nextafterd8_intrin
+#define xfrfrexp Sleef_frfrexpd8_intrin
+#define xexpfrexp Sleef_expfrexpd8_intrin
+#define xfmod Sleef_fmodd8_intrin
+#define xmodf Sleef_modfd8_intrin
+#define xlgamma_u1 Sleef_lgammad8_u10_intrin
+#define xlgamma_r_u1 Sleef_lgamma_rd8_u10_intrin
+#define xtgamma_u1 Sleef_tgammad8_u10_intrin
+#define xerf_u1 Sleef_erfd8_u10_intrin
+#define xerfc_u15 Sleef_erfcd8_u15_intrin
+
+#define xsinf Sleef_sinf16_u35_intrin
+#define xcosf Sleef_cosf16_u35_intrin
+#define xsincosf Sleef_sincosf16_u35_intrin
+#define xtanf Sleef_tanf16_u35_intrin
+#define xasinf Sleef_asinf16_u35_intrin
+#define xacosf Sleef_acosf16_u35_intrin
+#define xatanf Sleef_atanf16_u35_intrin
+#define xatan2f Sleef_atan2f16_u35_intrin
+#define xlogf Sleef_logf16_u35_intrin
+#define xcbrtf Sleef_cbrtf16_u35_intrin
+#define xsinf_u1 Sleef_sinf16_u10_intrin
+#define xcosf_u1 Sleef_cosf16_u10_intrin
+#define xsincosf_u1 Sleef_sincosf16_u10_intrin
+#define xtanf_u1 Sleef_tanf16_u10_intrin
+#define xasinf_u1 Sleef_asinf16_u10_intrin
+#define xacosf_u1 Sleef_acosf16_u10_intrin
+#define xatanf_u1 Sleef_atanf16_u10_intrin
+#define xatan2f_u1 Sleef_atan2f16_u10_intrin
+#define xlogf_u1 Sleef_logf16_u10_intrin
+#define xcbrtf_u1 Sleef_cbrtf16_u10_intrin
+#define xexpf Sleef_expf16_u10_intrin
+#define xpowf Sleef_powf16_u10_intrin
+#define xsinhf Sleef_sinhf16_u10_intrin
+#define xcoshf Sleef_coshf16_u10_intrin
+#define xtanhf Sleef_tanhf16_u10_intrin
+#define xasinhf Sleef_asinhf16_u10_intrin
+#define xacoshf Sleef_acoshf16_u10_intrin
+#define xatanhf Sleef_atanhf16_u10_intrin
+#define xexp2f Sleef_exp2f16_u10_intrin
+#define xexp10f Sleef_exp10f16_u10_intrin
+#define xexpm1f Sleef_expm1f16_u10_intrin
+#define xlog10f Sleef_log10f16_u10_intrin
+#define xlog1pf Sleef_log1pf16_u10_intrin
+#define xsincospif_u05 Sleef_sincospif16_u05_intrin
+#define xsincospif_u35 Sleef_sincospif16_u35_intrin
+#define xsinpif_u05 Sleef_sinpif16_u05_intrin
+#define xcospif_u05 Sleef_cospif16_u05_intrin
+#define xldexpf Sleef_ldexpf16_intrin
+#define xilogbf Sleef_ilogbf16_intrin
+#define xfmaf Sleef_fmaf16_intrin
+#define xsqrtf_u05 Sleef_sqrtf16_u05_intrin
+#define xsqrtf_u35 Sleef_sqrtf16_u35_intrin
+#define xhypotf_u05 Sleef_hypotf16_u05_intrin
+#define xhypotf_u35 Sleef_hypotf16_u35_intrin
+#define xfabsf Sleef_fabsf16_intrin
+#define xcopysignf Sleef_copysignf16_intrin
+#define xfmaxf Sleef_fmaxf16_intrin
+#define xfminf Sleef_fminf16_intrin
+#define xfdimf Sleef_fdimf16_intrin
+#define xtruncf Sleef_truncf16_intrin
+#define xfloorf Sleef_floorf16_intrin
+#define xceilf Sleef_ceilf16_intrin
+#define xroundf Sleef_roundf16_intrin
+#define xrintf Sleef_rintf16_intrin
+#define xnextafterf Sleef_nextafterf16_intrin
+#define xfrfrexpf Sleef_frfrexpf16_intrin
+#define xexpfrexpf Sleef_expfrexpf16_intrin
+#define xfmodf Sleef_fmodf16_intrin
+#define xmodff Sleef_modff16_intrin
+#define xlgammaf_u1 Sleef_lgammaf16_u10_intrin
+#define xlgamma_rf_u1 Sleef_lgamma_rf16_u10_intrin
+#define xtgammaf_u1 Sleef_tgammaf16_u10_intrin
+#define xerff_u1 Sleef_erff16_u10_intrin
+#define xerfcf_u15 Sleef_erfcf16_u15_intrin
+/***********************/
+#define xlog1p_fast Sleef_log1p_fast_d8_u10_intrin
+#define xlog1pf_fast Sleef_log1p_fast_f16_u10_intrin
+
+#define xpown Sleef_pownd8_u10_intrin
+#define xpownf Sleef_pownf16_u10_intrin
+#define xpowr Sleef_powrd8_u10_intrin
+#define xpowrf Sleef_powrf16_u10_intrin
diff --git a/lib/kernel/sleef/libm/sleef_builtin.c b/lib/kernel/sleef/libm/sleef_builtin.c
new file mode 100644
index 0000000..3d5bb0b
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleef_builtin.c
@@ -0,0 +1,938 @@
+/* OpenCL built-in library: SLEEF C fallback using libm / compiler builtins
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#define _GNU_SOURCE
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <stdint.h>
+
+#include "sleef.h"
+#include "sleef_cl.h"
+#include "rename.h"
+
+//##################################################################
+
+static int64_t
+doubleToRawLongBits (double d)
+{
+  union
+  {
+    double f;
+    int64_t i;
+  } tmp;
+  tmp.f = d;
+  return tmp.i;
+}
+
+static double
+longBitsToDouble (int64_t i)
+{
+  union
+  {
+    double f;
+    int64_t i;
+  } tmp;
+  tmp.i = i;
+  return tmp.f;
+}
+
+static double
+fabsk (double x)
+{
+  return longBitsToDouble (0x7fffffffffffffffLL & doubleToRawLongBits (x));
+}
+
+static double
+mulsign (double x, double y)
+{
+  return longBitsToDouble (doubleToRawLongBits (x)
+                           ^ (doubleToRawLongBits (y) & (1LL << 63)));
+}
+
+//##################################################################
+
+#define INFINITYf ((float)INFINITY)
+#define NANf ((float)NAN)
+
+static int32_t
+floatToRawIntBits (float d)
+{
+  union
+  {
+    float f;
+    int32_t i;
+  } tmp;
+  tmp.f = d;
+  return tmp.i;
+}
+
+static float
+intBitsToFloat (int32_t i)
+{
+  union
+  {
+    float f;
+    int32_t i;
+  } tmp;
+  tmp.i = i;
+  return tmp.f;
+}
+
+static float
+fabsfk (float x)
+{
+  return intBitsToFloat (0x7fffffffL & floatToRawIntBits (x));
+}
+
+static float
+mulsignf (float x, float y)
+{
+  return intBitsToFloat (floatToRawIntBits (x)
+                         ^ (floatToRawIntBits (y) & (1 << 31)));
+}
+
+//##################################################################
+
+
+double
+xsin (double x)
+{
+  return __builtin_sin (x);
+}
+double
+xcos (double x)
+{
+  return __builtin_cos (x);
+}
+double
+xtan (double x)
+{
+  return __builtin_tan (x);
+}
+double
+xasin (double x)
+{
+  return __builtin_asin (x);
+}
+double
+xacos (double x)
+{
+  return __builtin_acos (x);
+}
+double
+xatan (double x)
+{
+  return __builtin_atan (x);
+}
+double
+xatan2 (double x, double y)
+{
+  return __builtin_atan2 (x, y);
+}
+double
+xlog (double x)
+{
+  return __builtin_log (x);
+}
+double
+xcbrt (double x)
+{
+  return __builtin_cbrt (x);
+}
+
+double
+xsin_u1 (double x)
+{
+  return __builtin_sin (x);
+}
+double
+xcos_u1 (double x)
+{
+  return __builtin_cos (x);
+}
+double
+xtan_u1 (double x)
+{
+  return __builtin_tan (x);
+}
+double
+xasin_u1 (double x)
+{
+  return __builtin_asin (x);
+}
+double
+xacos_u1 (double x)
+{
+  return __builtin_acos (x);
+}
+double
+xatan_u1 (double x)
+{
+  return __builtin_atan (x);
+}
+double
+xatan2_u1 (double x, double y)
+{
+  return __builtin_atan2 (x, y);
+}
+double
+xlog_u1 (double x)
+{
+  return __builtin_log (x);
+}
+double
+xcbrt_u1 (double x)
+{
+  return __builtin_cbrt (x);
+}
+
+double
+xexp (double x)
+{
+  return __builtin_exp (x);
+}
+
+double
+xpow (double x, double y)
+{
+  return __builtin_pow (x, y);
+}
+
+double
+xpown (double x, int y)
+{
+  return __builtin_pow (x, (double)y);
+}
+
+double
+xpowr (double x, double y)
+{
+  if (x < 0.0)
+    return NAN;
+  if (isnan(y))
+    return y;
+  double res = __builtin_pow (x, y);
+  return res;
+}
+
+double
+xsinh (double x)
+{
+  return __builtin_sinh (x);
+}
+
+double
+xcosh (double x)
+{
+  return __builtin_cosh (x);
+}
+double
+xtanh (double x)
+{
+  return __builtin_tanh (x);
+}
+
+double
+xasinh (double x)
+{
+  return __builtin_asinh (x);
+}
+double
+xacosh (double x)
+{
+  return __builtin_acosh (x);
+}
+double
+xatanh (double x)
+{
+  return __builtin_atanh (x);
+}
+
+double
+xexp2 (double x)
+{
+  return __builtin_exp2 (x);
+}
+double
+xexp10 (double x)
+{
+  return exp10 (x);
+}
+
+double
+xexpm1 (double x)
+{
+  return __builtin_expm1 (x);
+}
+
+double
+xlog10 (double x)
+{
+  return __builtin_log10 (x);
+}
+double
+xlog1p (double x)
+{
+  return __builtin_log1p (x);
+}
+
+double
+xsinpi_u05 (double x)
+{
+  return __builtin_sin (x * (double)M_PI);
+}
+double
+xcospi_u05 (double x)
+{
+  return __builtin_cos (x * (double)M_PI);
+}
+
+Sleef_double2
+xsincos (double x)
+{
+  Sleef_double2 tmp;
+  sincos (x, &tmp.x, &tmp.y);
+  return tmp;
+}
+
+Sleef_double2
+xsincos_u1 (double x)
+{
+  return xsincos (x);
+}
+
+double
+xldexp (double x, int k)
+{
+  return __builtin_ldexp (x, k);
+}
+
+int
+xilogb (double x)
+{
+  return __builtin_ilogb (x);
+}
+
+double
+xfma (double x, double y, double z)
+{
+  return __builtin_fma (x, y, z);
+}
+
+double
+xsqrt_u05 (double x)
+{
+  return __builtin_sqrt (x);
+}
+
+double
+xsqrt_u35 (double x)
+{
+  return __builtin_sqrt (x);
+}
+
+double
+xhypot_u05 (double x, double y)
+{
+  return __builtin_hypot (x, y);
+}
+
+double
+xhypot_u35 (double x, double y)
+{
+  return __builtin_hypot (x, y);
+}
+
+double
+xfabs (double x)
+{
+  return __builtin_fabs (x);
+}
+
+double
+xcopysign (double x, double y)
+{
+  return __builtin_copysign (x, y);
+}
+double
+xfmax (double x, double y)
+{
+  return __builtin_fmax (x, y);
+}
+double
+xfmin (double x, double y)
+{
+  return __builtin_fmin (x, y);
+}
+
+double
+xfdim (double x, double y)
+{
+  return __builtin_fdim (x, y);
+}
+double
+xtrunc (double x)
+{
+  return __builtin_trunc (x);
+}
+double
+xfloor (double x)
+{
+  return __builtin_floor (x);
+}
+
+double
+xceil (double x)
+{
+  return __builtin_ceil (x);
+}
+double
+xround (double x)
+{
+  return __builtin_round (x);
+}
+double
+xrint (double x)
+{
+  return __builtin_rint (x);
+}
+
+double
+xnextafter (double x, double y)
+{
+  return __builtin_nextafter (x, y);
+}
+
+double
+xfrfrexp (double x)
+{
+  union
+  {
+    double f;
+    uint64_t u;
+  } cx;
+
+  if (__builtin_isnan (x))
+    return x;
+
+  if (fabsk (x) < DBL_MIN)
+    x *= (1ULL << 63);
+
+  cx.f = x;
+  cx.u &= ~0x7ff0000000000000ULL;
+  cx.u |= 0x3fe0000000000000ULL;
+
+  if (__builtin_isinf (x))
+    cx.f = mulsign (INFINITY, x);
+  if (x == 0)
+    cx.f = x;
+
+  return cx.f;
+}
+
+int
+xexpfrexp (double x)
+{
+  union
+  {
+    double f;
+    uint64_t u;
+  } cx;
+
+  int ret = 0;
+
+  if (fabsk (x) < DBL_MIN)
+    {
+      x *= (1ULL << 63);
+      ret = -63;
+    }
+
+  cx.f = x;
+  ret += (int32_t) (((cx.u >> 52) & 0x7ff)) - 0x3fe;
+
+  if (x == 0 || __builtin_isnan (x) || __builtin_isinf (x))
+    ret = 0;
+
+  return ret;
+}
+
+double
+xfmod (double x, double y)
+{
+  return __builtin_fmod (x, y);
+}
+
+Sleef_double2
+xmodf (double x)
+{
+  Sleef_double2 res;
+  double tmp;
+  res.x = __builtin_modf (x, &tmp);
+  res.y = tmp;
+  return res;
+}
+
+double
+xlgamma_u1 (double x)
+{
+  return __builtin_lgamma (x);
+}
+Sleef_double2
+xlgamma_r_u1 (double x)
+{
+  Sleef_double2 ret;
+  int sign;
+  ret.x = lgamma_r (x, &sign);
+  ret.y = (sign > 0 ? 1.0 : -1.0);
+  return ret;
+}
+double
+xtgamma_u1 (double x)
+{
+  return __builtin_tgamma (x);
+}
+double
+xerf_u1 (double x)
+{
+  return __builtin_erf (x);
+}
+double
+xerfc_u15 (double x)
+{
+  return __builtin_erfc (x);
+}
+
+// *********************************************************************
+// *********************************************************************
+// *********************************************************************
+// *********************************************************************
+
+float
+xsinf (float x)
+{
+  return __builtin_sinf (x);
+}
+float
+xcosf (float x)
+{
+  return __builtin_cosf (x);
+}
+float
+xtanf (float x)
+{
+  return __builtin_tanf (x);
+}
+float
+xasinf (float x)
+{
+  return __builtin_asinf (x);
+}
+float
+xacosf (float x)
+{
+  return __builtin_acosf (x);
+}
+float
+xatanf (float x)
+{
+  return __builtin_atanf (x);
+}
+float
+xatan2f (float x, float y)
+{
+  return __builtin_atan2f (x, y);
+}
+float
+xlogf (float x)
+{
+  return __builtin_logf (x);
+}
+float
+xcbrtf (float x)
+{
+  return __builtin_cbrtf (x);
+}
+
+float
+xsinf_u1 (float x)
+{
+  return __builtin_sinf (x);
+}
+float
+xcosf_u1 (float x)
+{
+  return __builtin_cosf (x);
+}
+float
+xtanf_u1 (float x)
+{
+  return __builtin_tanf (x);
+}
+float
+xasinf_u1 (float x)
+{
+  return __builtin_asinf (x);
+}
+float
+xacosf_u1 (float x)
+{
+  return __builtin_acosf (x);
+}
+float
+xatanf_u1 (float x)
+{
+  return __builtin_atanf (x);
+}
+float
+xatan2f_u1 (float x, float y)
+{
+  return __builtin_atan2f (x, y);
+}
+float
+xlogf_u1 (float x)
+{
+  return __builtin_logf (x);
+}
+float
+xcbrtf_u1 (float x)
+{
+  return __builtin_cbrtf (x);
+}
+
+float
+xexpf (float x)
+{
+  return __builtin_expf (x);
+}
+
+float
+xpowf (float x, float y)
+{
+
+  return (float) __builtin_pow ((double)x, (double)y);
+}
+
+float
+xpownf (float x, int y)
+{
+  return (float) __builtin_pow ((double)x, (double)y);
+}
+
+float
+xpowrf (float x, float y)
+{
+  if (x < 0.0f)
+    return NAN;
+  float res = (float) __builtin_pow ((double)x, (double)y);
+  return res;
+}
+
+float
+xsinhf (float x)
+{
+  return __builtin_sinhf (x);
+}
+float
+xcoshf (float x)
+{
+  return __builtin_coshf (x);
+}
+float
+xtanhf (float x)
+{
+  return __builtin_tanhf (x);
+}
+
+float
+xasinhf (float x)
+{
+  return __builtin_asinhf (x);
+}
+float
+xacoshf (float x)
+{
+  return __builtin_acoshf (x);
+}
+float
+xatanhf (float x)
+{
+  return __builtin_atanhf (x);
+}
+
+float
+xexp2f (float x)
+{
+  return __builtin_exp2f (x);
+}
+
+float
+xexp10f (float x)
+{
+  return exp10f (x);
+}
+
+float
+xexpm1f (float x)
+{
+  return __builtin_expm1f (x);
+}
+
+float
+xlog10f (float x)
+{
+  return __builtin_log10f (x);
+}
+float
+xlog1pf (float x)
+{
+  return __builtin_log1pf (x);
+}
+
+float
+xsinpif_u05 (float x)
+{
+  return __builtin_sinf (x * (float)M_PI);
+}
+float
+xcospif_u05 (float x)
+{
+  return __builtin_cosf (x * (float)M_PI);
+}
+
+Sleef_float2
+xsincosf (float x)
+{
+  Sleef_float2 tmp;
+  sincosf (x, &tmp.x, &tmp.y);
+  return tmp;
+}
+
+Sleef_float2
+xsincosf_u1 (float x)
+{
+  return xsincosf (x);
+}
+
+float
+xsqrtf_u05 (float x)
+{
+  return __builtin_sqrtf (x);
+}
+
+float
+xsqrtf_u35 (float x)
+{
+  return __builtin_sqrtf (x);
+}
+
+float
+xhypotf_u05 (float x, float y)
+{
+  return __builtin_hypotf (x, y);
+}
+
+float
+xhypotf_u35 (float x, float y)
+{
+  return __builtin_hypotf (x, y);
+}
+
+float
+xldexpf (float x, int k)
+{
+  return __builtin_ldexpf (x, k);
+}
+
+int
+xilogbf (float x)
+{
+  return __builtin_ilogbf (x);
+}
+
+float
+xfmaf (float x, float y, float z)
+{
+  return __builtin_fmaf (x, y, z);
+}
+
+float
+xfabsf (float x)
+{
+  return __builtin_fabsf (x);
+}
+
+float
+xcopysignf (float x, float y)
+{
+  return __builtin_copysignf (x, y);
+}
+float
+xfmaxf (float x, float y)
+{
+  return __builtin_fmaxf (x, y);
+}
+float
+xfminf (float x, float y)
+{
+  return __builtin_fminf (x, y);
+}
+
+float
+xfdimf (float x, float y)
+{
+  return __builtin_fdimf (x, y);
+}
+float
+xtruncf (float x)
+{
+  return __builtin_truncf (x);
+}
+float
+xfloorf (float x)
+{
+  return __builtin_floorf (x);
+}
+
+float
+xceilf (float x)
+{
+  return __builtin_ceilf (x);
+}
+float
+xroundf (float x)
+{
+  return __builtin_roundf (x);
+}
+float
+xrintf (float x)
+{
+  return __builtin_rintf (x);
+}
+
+float
+xnextafterf (float x, float y)
+{
+  return __builtin_nextafterf (x, y);
+}
+
+
+float
+xfrfrexpf (float x)
+{
+  union
+  {
+    float f;
+    int32_t u;
+  } cx;
+
+  if (__builtin_isnan (x))
+    return x;
+
+  if (fabsfk (x) < FLT_MIN)
+    x *= (1 << 30);
+
+  cx.f = x;
+  cx.u &= ~0x7f800000U;
+  cx.u |= 0x3f000000U;
+
+  if (__builtin_isinf (x))
+    cx.f = mulsignf (INFINITYf, x);
+  if (x == 0)
+    cx.f = x;
+
+  return cx.f;
+}
+
+int
+xexpfrexpf (float x)
+{
+  union
+  {
+    float f;
+    uint32_t u;
+  } cx;
+
+  int ret = 0;
+
+  if (fabsfk (x) < FLT_MIN)
+    {
+      x *= (1 << 30);
+      ret = -30;
+    }
+
+  cx.f = x;
+  ret += (int32_t) (((cx.u >> 23) & 0xff)) - 0x7e;
+
+  if (x == 0 || __builtin_isnan (x) || __builtin_isinf (x))
+    ret = 0;
+
+  return ret;
+}
+
+float
+xfmodf (float x, float y)
+{
+  return __builtin_fmodf (x, y);
+}
+
+Sleef_float2
+xmodff (float x)
+{
+  Sleef_float2 res;
+  float tmp;
+  res.x = __builtin_modff (x, &tmp);
+  res.y = tmp;
+  return res;
+}
+
+float
+xlgammaf_u1 (float x)
+{
+  return __builtin_lgammaf (x);
+}
+Sleef_float2
+xlgamma_rf_u1 (float x)
+{
+  Sleef_float2 ret;
+  int sign;
+  ret.x = lgammaf_r (x, &sign);
+  ret.y = (sign > 0 ? 1.0f : -1.0f);
+  return ret;
+}
+
+float
+xtgammaf_u1 (float x)
+{
+  return __builtin_tgammaf (x);
+}
+float
+xerff_u1 (float x)
+{
+  return __builtin_erff (x);
+}
+float
+xerfcf_u15 (float x)
+{
+  return __builtin_erfcf (x);
+}
diff --git a/lib/kernel/sleef/libm/sleef_glue.cl b/lib/kernel/sleef/libm/sleef_glue.cl
new file mode 100644
index 0000000..ad98b43
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleef_glue.cl
@@ -0,0 +1,78 @@
+/* OpenCL built-in library: a few hand-written SLEEF indirect function calls
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include "sleef_cl.h"
+
+#ifdef SLEEF_VEC_128_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_ldexpd2_long (double2 x, long2 k);
+_CL_ALWAYSINLINE double2 Sleef_pownd2_u10_long (double2 x, long2 k);
+_CL_ALWAYSINLINE long2 Sleef_ilogbd2_long (double2 x);
+
+_CL_ALWAYSINLINE double2
+Sleef_ldexpd2 (double2 x, int2 k)
+{
+  int4 tmp = (int4) (k, k);
+  return Sleef_ldexpd2_long (x, as_long2 (tmp));
+}
+
+_CL_ALWAYSINLINE double2
+Sleef_pownd2_u10 (double2 x, int2 k)
+{
+  int4 tmp = (int4) (k, k);
+  return Sleef_pownd2_u10_long (x, as_long2 (tmp));
+}
+
+
+_CL_ALWAYSINLINE int2
+Sleef_ilogbd2 (double2 x)
+{
+  int4 r = as_int4 (Sleef_ilogbd2_long (x));
+  return r.xy;
+}
+
+_CL_ALWAYSINLINE long2 Sleef_expfrexpd2_long (double2 x);
+
+_CL_ALWAYSINLINE int2
+Sleef_expfrexpd2 (double2 x)
+{
+  return convert_int2 (Sleef_expfrexpd2_long (x));
+}
+
+_CL_ALWAYSINLINE long4 Sleef_expfrexpd4_long (double4 x);
+
+_CL_ALWAYSINLINE int4
+Sleef_expfrexpd4 (double4 x)
+{
+  return convert_int4 (Sleef_expfrexpd4_long (x));
+}
+
+_CL_ALWAYSINLINE long8 Sleef_expfrexpd8_long (double8 x);
+
+_CL_ALWAYSINLINE int8
+Sleef_expfrexpd8 (double8 x)
+{
+  return convert_int8 (Sleef_expfrexpd8_long (x));
+}
+
+#endif
diff --git a/lib/kernel/sleef/libm/sleef_glue_auto.c b/lib/kernel/sleef/libm/sleef_glue_auto.c
new file mode 100644
index 0000000..b4f660a
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleef_glue_auto.c
@@ -0,0 +1,4353 @@
+#include "sleef.h"
+
+#include "sleef_cl.h"
+
+
+#ifdef SLEEF_VEC_128_AVAILABLE
+
+_CL_ALWAYSINLINE float4 Sleef_sinf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_sinf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_sinf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_sinf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_sind2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_sind2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_sind2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_sind2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_cosf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_cosf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_cosf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_cosf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_cosd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_cosd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_cosd2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_cosd2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_tanf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_tanf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_tanf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_tanf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_tand2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_tand2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_tand2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_tand2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_asinf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_asinf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_asinf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_asinf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_asind2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_asind2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_asind2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_asind2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_acosf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_acosf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_acosf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_acosf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_acosd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_acosd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_acosd2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_acosd2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_atanf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_atanf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_atanf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_atanf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_atand2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_atand2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_atand2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_atand2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_atan2f4_u10(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_atan2f4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_atan2f4_u35(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_atan2f4_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_atan2d2_u10(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_atan2d2_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_atan2d2_u35(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_atan2d2_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_cbrtf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_cbrtf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_cbrtf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_cbrtf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_cbrtd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_cbrtd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_cbrtd2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_cbrtd2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_logf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_logf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_logf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_logf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_logd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_logd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_logd2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_logd2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_expf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_expf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_expd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_expd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_powf4_u10(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_powf4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_powd2_u10(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_powd2_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_pownf4_u10(float4 x, int4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_pownf4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_pownd2_u10_long(double2 x, long2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { long2 t; reg128i r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_pownd2_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_powrf4_u10(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_powrf4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_powrd2_u10(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_powrd2_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_sinhf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_sinhf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_sinhd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_sinhd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_coshf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_coshf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_coshd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_coshd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_tanhf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_tanhf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_tanhd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_tanhd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_asinhf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_asinhf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_asinhd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_asinhd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_acoshf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_acoshf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_acoshd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_acoshd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_atanhf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_atanhf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_atanhd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_atanhd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_exp2f4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_exp2f4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_exp2d2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_exp2d2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_exp10f4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_exp10f4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_exp10d2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_exp10d2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_expm1f4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_expm1f4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_expm1d2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_expm1d2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_log10f4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_log10f4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_log10d2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_log10d2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_log1pf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_log1pf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_log1pd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_log1pd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_sinpif4_u05(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_sinpif4_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_sinpid2_u05(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_sinpid2_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_cospif4_u05(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_cospif4_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_cospid2_u05(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_cospid2_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_fmaf4(float4 x, float4 y, float4 z)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } z_in;
+  z_in.t = z;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_fmaf4_intrin(x_in.r, y_in.r, z_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_fmad2(double2 x, double2 y, double2 z)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } z_in;
+  z_in.t = z;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_fmad2_intrin(x_in.r, y_in.r, z_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_sqrtf4_u05(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_sqrtf4_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_sqrtd2_u05(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_sqrtd2_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_hypotf4_u05(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_hypotf4_u05_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float4 Sleef_hypotf4_u35(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_hypotf4_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_hypotd2_u05(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_hypotd2_u05_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double2 Sleef_hypotd2_u35(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_hypotd2_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_fabsf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_fabsf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_fabsd2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_fabsd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_copysignf4(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_copysignf4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_copysignd2(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_copysignd2_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_fmaxf4(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_fmaxf4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_fmaxd2(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_fmaxd2_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_fminf4(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_fminf4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_fmind2(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_fmind2_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_fdimf4(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_fdimf4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_fdimd2(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_fdimd2_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_truncf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_truncf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_truncd2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_truncd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_floorf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_floorf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_floord2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_floord2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_ceilf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_ceilf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_ceild2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_ceild2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_roundf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_roundf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_roundd2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_roundd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_rintf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_rintf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_rintd2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_rintd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_nextafterf4(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_nextafterf4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_nextafterd2(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_nextafterd2_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_fmodf4(float4 x, float4 y)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } y_in;
+  y_in.t = y;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_fmodf4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_fmodd2(double2 x, double2 y)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } y_in;
+  y_in.t = y;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_fmodd2_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_lgammaf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_lgammaf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_lgammad2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_lgammad2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float4_2 Sleef_lgamma_rf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float4_2 t; Sleef_reg128f_2 r; } ret;
+  ret.r = Sleef_lgamma_rf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double2_2 Sleef_lgamma_rd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double2_2 t; Sleef_reg128d_2 r; } ret;
+  ret.r = Sleef_lgamma_rd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_tgammaf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_tgammaf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_tgammad2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_tgammad2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_erff4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_erff4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_erfd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_erfd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_erfcf4_u15(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_erfcf4_u15_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_erfcd2_u15(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_erfcd2_u15_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_frfrexpf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_frfrexpf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_frfrexpd2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_frfrexpd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float4_2 Sleef_sincosf4_u10(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float4_2 t; Sleef_reg128f_2 r; } ret;
+  ret.r = Sleef_sincosf4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_float4_2 Sleef_sincosf4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float4_2 t; Sleef_reg128f_2 r; } ret;
+  ret.r = Sleef_sincosf4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double2_2 Sleef_sincosd2_u10(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double2_2 t; Sleef_reg128d_2 r; } ret;
+  ret.r = Sleef_sincosd2_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_double2_2 Sleef_sincosd2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double2_2 t; Sleef_reg128d_2 r; } ret;
+  ret.r = Sleef_sincosd2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float4_2 Sleef_sincospif4_u05(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float4_2 t; Sleef_reg128f_2 r; } ret;
+  ret.r = Sleef_sincospif4_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_float4_2 Sleef_sincospif4_u35(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float4_2 t; Sleef_reg128f_2 r; } ret;
+  ret.r = Sleef_sincospif4_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double2_2 Sleef_sincospid2_u05(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double2_2 t; Sleef_reg128d_2 r; } ret;
+  ret.r = Sleef_sincospid2_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_double2_2 Sleef_sincospid2_u35(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double2_2 t; Sleef_reg128d_2 r; } ret;
+  ret.r = Sleef_sincospid2_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float4_2 Sleef_modff4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float4_2 t; Sleef_reg128f_2 r; } ret;
+  ret.r = Sleef_modff4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double2_2 Sleef_modfd2(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double2_2 t; Sleef_reg128d_2 r; } ret;
+  ret.r = Sleef_modfd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float4 Sleef_ldexpf4(float4 x, int4 k)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } k_in;
+  k_in.t = k;
+  union { float4 t; reg128f r; } ret;
+  ret.r = Sleef_ldexpf4_intrin(x_in.r, k_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double2 Sleef_ldexpd2_long(double2 x, long2 k)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { long2 t; reg128i r; } k_in;
+  k_in.t = k;
+  union { double2 t; reg128d r; } ret;
+  ret.r = Sleef_ldexpd2_intrin(x_in.r, k_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE int4 Sleef_expfrexpf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } ret;
+  ret.r = Sleef_expfrexpf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE long2 Sleef_expfrexpd2_long(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { long2 t; reg128i r; } ret;
+  ret.r = Sleef_expfrexpd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE int4 Sleef_ilogbf4(float4 x)
+{
+  union { float4 t; reg128f r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } ret;
+  ret.r = Sleef_ilogbf4_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE long2 Sleef_ilogbd2_long(double2 x)
+{
+  union { double2 t; reg128d r; } x_in;
+  x_in.t = x;
+  union { long2 t; reg128i r; } ret;
+  ret.r = Sleef_ilogbd2_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+#endif
+
+
+#ifdef SLEEF_VEC_256_AVAILABLE
+
+_CL_ALWAYSINLINE float8 Sleef_sinf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_sinf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_sinf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_sinf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_sind4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_sind4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_sind4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_sind4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_cosf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_cosf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_cosf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_cosf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_cosd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_cosd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_cosd4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_cosd4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_tanf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_tanf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_tanf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_tanf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_tand4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_tand4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_tand4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_tand4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_asinf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_asinf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_asinf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_asinf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_asind4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_asind4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_asind4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_asind4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_acosf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_acosf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_acosf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_acosf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_acosd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_acosd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_acosd4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_acosd4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_atanf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_atanf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_atanf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_atanf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_atand4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_atand4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_atand4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_atand4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_atan2f8_u10(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_atan2f8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_atan2f8_u35(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_atan2f8_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_atan2d4_u10(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_atan2d4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_atan2d4_u35(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_atan2d4_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_cbrtf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_cbrtf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_cbrtf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_cbrtf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_cbrtd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_cbrtd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_cbrtd4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_cbrtd4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_logf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_logf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_logf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_logf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_logd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_logd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_logd4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_logd4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_expf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_expf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_expd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_expd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_powf8_u10(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_powf8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_powd4_u10(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_powd4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_pownf8_u10(float8 x, int8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_pownf8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_pownd4_u10(double4 x, int4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_pownd4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_powrf8_u10(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_powrf8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_powrd4_u10(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_powrd4_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_sinhf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_sinhf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_sinhd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_sinhd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_coshf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_coshf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_coshd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_coshd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_tanhf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_tanhf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_tanhd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_tanhd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_asinhf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_asinhf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_asinhd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_asinhd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_acoshf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_acoshf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_acoshd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_acoshd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_atanhf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_atanhf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_atanhd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_atanhd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_exp2f8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_exp2f8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_exp2d4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_exp2d4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_exp10f8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_exp10f8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_exp10d4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_exp10d4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_expm1f8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_expm1f8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_expm1d4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_expm1d4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_log10f8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_log10f8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_log10d4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_log10d4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_log1pf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_log1pf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_log1pd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_log1pd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_sinpif8_u05(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_sinpif8_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_sinpid4_u05(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_sinpid4_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_cospif8_u05(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_cospif8_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_cospid4_u05(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_cospid4_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_fmaf8(float8 x, float8 y, float8 z)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } z_in;
+  z_in.t = z;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_fmaf8_intrin(x_in.r, y_in.r, z_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_fmad4(double4 x, double4 y, double4 z)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } z_in;
+  z_in.t = z;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_fmad4_intrin(x_in.r, y_in.r, z_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_sqrtf8_u05(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_sqrtf8_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_sqrtd4_u05(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_sqrtd4_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_hypotf8_u05(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_hypotf8_u05_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float8 Sleef_hypotf8_u35(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_hypotf8_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_hypotd4_u05(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_hypotd4_u05_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double4 Sleef_hypotd4_u35(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_hypotd4_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_fabsf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_fabsf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_fabsd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_fabsd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_copysignf8(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_copysignf8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_copysignd4(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_copysignd4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_fmaxf8(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_fmaxf8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_fmaxd4(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_fmaxd4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_fminf8(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_fminf8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_fmind4(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_fmind4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_fdimf8(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_fdimf8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_fdimd4(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_fdimd4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_truncf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_truncf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_truncd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_truncd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_floorf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_floorf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_floord4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_floord4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_ceilf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_ceilf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_ceild4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_ceild4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_roundf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_roundf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_roundd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_roundd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_rintf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_rintf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_rintd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_rintd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_nextafterf8(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_nextafterf8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_nextafterd4(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_nextafterd4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_fmodf8(float8 x, float8 y)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } y_in;
+  y_in.t = y;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_fmodf8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_fmodd4(double4 x, double4 y)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } y_in;
+  y_in.t = y;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_fmodd4_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_lgammaf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_lgammaf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_lgammad4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_lgammad4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float8_2 Sleef_lgamma_rf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float8_2 t; Sleef_reg256f_2 r; } ret;
+  ret.r = Sleef_lgamma_rf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double4_2 Sleef_lgamma_rd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double4_2 t; Sleef_reg256d_2 r; } ret;
+  ret.r = Sleef_lgamma_rd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_tgammaf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_tgammaf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_tgammad4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_tgammad4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_erff8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_erff8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_erfd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_erfd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_erfcf8_u15(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_erfcf8_u15_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_erfcd4_u15(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_erfcd4_u15_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_frfrexpf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_frfrexpf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_frfrexpd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_frfrexpd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float8_2 Sleef_sincosf8_u10(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float8_2 t; Sleef_reg256f_2 r; } ret;
+  ret.r = Sleef_sincosf8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_float8_2 Sleef_sincosf8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float8_2 t; Sleef_reg256f_2 r; } ret;
+  ret.r = Sleef_sincosf8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double4_2 Sleef_sincosd4_u10(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double4_2 t; Sleef_reg256d_2 r; } ret;
+  ret.r = Sleef_sincosd4_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_double4_2 Sleef_sincosd4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double4_2 t; Sleef_reg256d_2 r; } ret;
+  ret.r = Sleef_sincosd4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float8_2 Sleef_sincospif8_u05(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float8_2 t; Sleef_reg256f_2 r; } ret;
+  ret.r = Sleef_sincospif8_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_float8_2 Sleef_sincospif8_u35(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float8_2 t; Sleef_reg256f_2 r; } ret;
+  ret.r = Sleef_sincospif8_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double4_2 Sleef_sincospid4_u05(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double4_2 t; Sleef_reg256d_2 r; } ret;
+  ret.r = Sleef_sincospid4_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_double4_2 Sleef_sincospid4_u35(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double4_2 t; Sleef_reg256d_2 r; } ret;
+  ret.r = Sleef_sincospid4_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float8_2 Sleef_modff8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float8_2 t; Sleef_reg256f_2 r; } ret;
+  ret.r = Sleef_modff8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double4_2 Sleef_modfd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double4_2 t; Sleef_reg256d_2 r; } ret;
+  ret.r = Sleef_modfd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float8 Sleef_ldexpf8(float8 x, int8 k)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } k_in;
+  k_in.t = k;
+  union { float8 t; reg256f r; } ret;
+  ret.r = Sleef_ldexpf8_intrin(x_in.r, k_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double4 Sleef_ldexpd4(double4 x, int4 k)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } k_in;
+  k_in.t = k;
+  union { double4 t; reg256d r; } ret;
+  ret.r = Sleef_ldexpd4_intrin(x_in.r, k_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE int8 Sleef_expfrexpf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } ret;
+  ret.r = Sleef_expfrexpf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE long4 Sleef_expfrexpd4_long(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { long4 t; reg256i r; } ret;
+  ret.r = Sleef_expfrexpd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE int8 Sleef_ilogbf8(float8 x)
+{
+  union { float8 t; reg256f r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } ret;
+  ret.r = Sleef_ilogbf8_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE int4 Sleef_ilogbd4(double4 x)
+{
+  union { double4 t; reg256d r; } x_in;
+  x_in.t = x;
+  union { int4 t; reg128i r; } ret;
+  ret.r = Sleef_ilogbd4_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+#endif
+
+
+#ifdef SLEEF_VEC_512_AVAILABLE
+
+_CL_ALWAYSINLINE float16 Sleef_sinf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_sinf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_sinf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_sinf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_sind8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_sind8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_sind8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_sind8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_cosf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_cosf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_cosf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_cosf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_cosd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_cosd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_cosd8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_cosd8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_tanf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_tanf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_tanf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_tanf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_tand8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_tand8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_tand8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_tand8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_asinf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_asinf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_asinf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_asinf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_asind8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_asind8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_asind8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_asind8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_acosf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_acosf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_acosf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_acosf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_acosd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_acosd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_acosd8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_acosd8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_atanf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_atanf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_atanf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_atanf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_atand8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_atand8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_atand8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_atand8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_atan2f16_u10(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_atan2f16_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_atan2f16_u35(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_atan2f16_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_atan2d8_u10(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_atan2d8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_atan2d8_u35(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_atan2d8_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_cbrtf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_cbrtf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_cbrtf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_cbrtf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_cbrtd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_cbrtd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_cbrtd8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_cbrtd8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_logf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_logf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_logf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_logf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_logd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_logd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_logd8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_logd8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_expf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_expf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_expd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_expd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_powf16_u10(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_powf16_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_powd8_u10(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_powd8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_pownf16_u10(float16 x, int16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { int16 t; reg512i r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_pownf16_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_pownd8_u10(double8 x, int8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_pownd8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_powrf16_u10(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_powrf16_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_powrd8_u10(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_powrd8_u10_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_sinhf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_sinhf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_sinhd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_sinhd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_coshf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_coshf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_coshd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_coshd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_tanhf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_tanhf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_tanhd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_tanhd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_asinhf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_asinhf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_asinhd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_asinhd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_acoshf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_acoshf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_acoshd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_acoshd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_atanhf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_atanhf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_atanhd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_atanhd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_exp2f16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_exp2f16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_exp2d8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_exp2d8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_exp10f16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_exp10f16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_exp10d8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_exp10d8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_expm1f16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_expm1f16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_expm1d8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_expm1d8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_log10f16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_log10f16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_log10d8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_log10d8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_log1pf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_log1pf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_log1pd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_log1pd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_sinpif16_u05(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_sinpif16_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_sinpid8_u05(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_sinpid8_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_cospif16_u05(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_cospif16_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_cospid8_u05(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_cospid8_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_fmaf16(float16 x, float16 y, float16 z)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } z_in;
+  z_in.t = z;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_fmaf16_intrin(x_in.r, y_in.r, z_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_fmad8(double8 x, double8 y, double8 z)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } z_in;
+  z_in.t = z;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_fmad8_intrin(x_in.r, y_in.r, z_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_sqrtf16_u05(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_sqrtf16_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_sqrtd8_u05(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_sqrtd8_u05_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_hypotf16_u05(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_hypotf16_u05_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE float16 Sleef_hypotf16_u35(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_hypotf16_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_hypotd8_u05(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_hypotd8_u05_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE double8 Sleef_hypotd8_u35(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_hypotd8_u35_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_fabsf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_fabsf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_fabsd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_fabsd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_copysignf16(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_copysignf16_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_copysignd8(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_copysignd8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_fmaxf16(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_fmaxf16_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_fmaxd8(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_fmaxd8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_fminf16(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_fminf16_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_fmind8(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_fmind8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_fdimf16(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_fdimf16_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_fdimd8(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_fdimd8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_truncf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_truncf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_truncd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_truncd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_floorf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_floorf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_floord8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_floord8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_ceilf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_ceilf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_ceild8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_ceild8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_roundf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_roundf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_roundd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_roundd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_rintf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_rintf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_rintd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_rintd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_nextafterf16(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_nextafterf16_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_nextafterd8(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_nextafterd8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_fmodf16(float16 x, float16 y)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } y_in;
+  y_in.t = y;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_fmodf16_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_fmodd8(double8 x, double8 y)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } y_in;
+  y_in.t = y;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_fmodd8_intrin(x_in.r, y_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_lgammaf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_lgammaf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_lgammad8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_lgammad8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float16_2 Sleef_lgamma_rf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float16_2 t; Sleef_reg512f_2 r; } ret;
+  ret.r = Sleef_lgamma_rf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double8_2 Sleef_lgamma_rd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double8_2 t; Sleef_reg512d_2 r; } ret;
+  ret.r = Sleef_lgamma_rd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_tgammaf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_tgammaf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_tgammad8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_tgammad8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_erff16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_erff16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_erfd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_erfd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_erfcf16_u15(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_erfcf16_u15_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_erfcd8_u15(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_erfcd8_u15_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_frfrexpf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_frfrexpf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_frfrexpd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_frfrexpd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float16_2 Sleef_sincosf16_u10(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float16_2 t; Sleef_reg512f_2 r; } ret;
+  ret.r = Sleef_sincosf16_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_float16_2 Sleef_sincosf16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float16_2 t; Sleef_reg512f_2 r; } ret;
+  ret.r = Sleef_sincosf16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double8_2 Sleef_sincosd8_u10(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double8_2 t; Sleef_reg512d_2 r; } ret;
+  ret.r = Sleef_sincosd8_u10_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_double8_2 Sleef_sincosd8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double8_2 t; Sleef_reg512d_2 r; } ret;
+  ret.r = Sleef_sincosd8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float16_2 Sleef_sincospif16_u05(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float16_2 t; Sleef_reg512f_2 r; } ret;
+  ret.r = Sleef_sincospif16_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_float16_2 Sleef_sincospif16_u35(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float16_2 t; Sleef_reg512f_2 r; } ret;
+  ret.r = Sleef_sincospif16_u35_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double8_2 Sleef_sincospid8_u05(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double8_2 t; Sleef_reg512d_2 r; } ret;
+  ret.r = Sleef_sincospid8_u05_intrin(x_in.r);
+  return ret.t;
+}
+
+_CL_ALWAYSINLINE Sleef_double8_2 Sleef_sincospid8_u35(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double8_2 t; Sleef_reg512d_2 r; } ret;
+  ret.r = Sleef_sincospid8_u35_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE Sleef_float16_2 Sleef_modff16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { Sleef_float16_2 t; Sleef_reg512f_2 r; } ret;
+  ret.r = Sleef_modff16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE Sleef_double8_2 Sleef_modfd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { Sleef_double8_2 t; Sleef_reg512d_2 r; } ret;
+  ret.r = Sleef_modfd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE float16 Sleef_ldexpf16(float16 x, int16 k)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { int16 t; reg512i r; } k_in;
+  k_in.t = k;
+  union { float16 t; reg512f r; } ret;
+  ret.r = Sleef_ldexpf16_intrin(x_in.r, k_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE double8 Sleef_ldexpd8(double8 x, int8 k)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } k_in;
+  k_in.t = k;
+  union { double8 t; reg512d r; } ret;
+  ret.r = Sleef_ldexpd8_intrin(x_in.r, k_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE int16 Sleef_expfrexpf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { int16 t; reg512i r; } ret;
+  ret.r = Sleef_expfrexpf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE long8 Sleef_expfrexpd8_long(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { long8 t; reg512i r; } ret;
+  ret.r = Sleef_expfrexpd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+
+_CL_ALWAYSINLINE int16 Sleef_ilogbf16(float16 x)
+{
+  union { float16 t; reg512f r; } x_in;
+  x_in.t = x;
+  union { int16 t; reg512i r; } ret;
+  ret.r = Sleef_ilogbf16_intrin(x_in.r);
+  return ret.t;
+}
+
+#ifdef SLEEF_DOUBLE_VEC_AVAILABLE
+
+_CL_ALWAYSINLINE int8 Sleef_ilogbd8(double8 x)
+{
+  union { double8 t; reg512d r; } x_in;
+  x_in.t = x;
+  union { int8 t; reg256i r; } ret;
+  ret.r = Sleef_ilogbd8_intrin(x_in.r);
+  return ret.t;
+}
+#endif
+
+#endif
diff --git a/lib/kernel/sleef/libm/sleefdp.c b/lib/kernel/sleef/libm/sleefdp.c
new file mode 100644
index 0000000..d8107ba
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleefdp.c
@@ -0,0 +1,2323 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+// Always use -ffp-contract=off option to compile SLEEF.
+
+#include <stdint.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+
+#include "misc.h"
+
+// debug prints using fprintf
+#define NDEBUG
+
+#if (defined(_MSC_VER))
+#pragma fp_contract (off)
+#endif
+
+#include "helpers.h"
+
+static INLINE CONST int64_t doubleToRawLongBits(double d) {
+  union {
+    double f;
+    int64_t i;
+  } tmp;
+  tmp.f = d;
+  return tmp.i;
+}
+
+static INLINE CONST double longBitsToDouble(int64_t i) {
+  union {
+    double f;
+    int64_t i;
+  } tmp;
+  tmp.i = i;
+  return tmp.f;
+}
+
+static INLINE CONST double fabsk(double x) {
+  return longBitsToDouble(0x7fffffffffffffffLL & doubleToRawLongBits(x));
+}
+
+static INLINE CONST double mulsign(double x, double y) {
+  return longBitsToDouble(doubleToRawLongBits(x) ^ (doubleToRawLongBits(y) & (1LL << 63)));
+}
+
+static INLINE CONST double copysignk(double x, double y) {
+  return longBitsToDouble((doubleToRawLongBits(x) & ~(1LL << 63)) ^ (doubleToRawLongBits(y) & (1LL << 63)));
+}
+
+static INLINE CONST double sign(double d) { return mulsign(1, d); }
+static INLINE CONST double mla(double x, double y, double z) { return x * y + z; }
+static INLINE CONST double rintk(double x) { return x < 0 ? (int)(x - 0.5) : (int)(x + 0.5); }
+static INLINE CONST int ceilk(double x) { return (int)x + (x < 0 ? 0 : 1); }
+static INLINE CONST double trunck(double x) { return (double)(int)x; }
+static INLINE CONST double fmink(double x, double y) { return x < y ? x : y; }
+static INLINE CONST double fmaxk(double x, double y) { return x > y ? x : y; }
+
+static INLINE CONST int xisnan(double x) { return x != x; }
+static INLINE CONST int xisinf(double x) { return x == INFINITY || x == -INFINITY; }
+static INLINE CONST int xisminf(double x) { return x == -INFINITY; }
+static INLINE CONST int xispinf(double x) { return x == INFINITY; }
+static INLINE CONST int xisnegzero(double x) { return doubleToRawLongBits(x) == doubleToRawLongBits(-0.0); }
+static INLINE CONST int xisnumber(double x) { return !xisinf(x) && !xisnan(x); }
+
+static INLINE CONST int xisint(double d) {
+  double x = d - (double)(1LL << 31) * (int)(d * (1.0 / (1LL << 31)));
+  return (x == (int)x) || (fabsk(d) >= (double)(1LL << 53));
+}
+
+static INLINE CONST int xisodd(double d) {
+  double x = d - (double)(1LL << 31) * (int)(d * (1.0 / (1LL << 31)));
+  return (1 & (int)x) != 0 && fabsk(d) < (double)(1LL << 53);
+}
+
+static INLINE CONST double pow2i(int q) {
+  return longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
+}
+
+static INLINE CONST double ldexpk(double x, int q) {
+  double u;
+  int m;
+  m = q >> 31;
+  m = (((m + q) >> 9) - m) << 7;
+  q = q - (m << 2);
+  m += 0x3ff;
+  m = m < 0     ? 0     : m;
+  m = m > 0x7ff ? 0x7ff : m;
+  u = longBitsToDouble(((int64_t)m) << 52);
+  x = x * u * u * u * u;
+  u = longBitsToDouble(((int64_t)(q + 0x3ff)) << 52);
+  return x * u;
+}
+
+static INLINE CONST double ldexp2k(double d, int e) { // faster than ldexpk, short reach
+  return d * pow2i(e >> 1) * pow2i(e - (e >> 1));
+}
+
+static INLINE CONST double ldexp3k(double d, int e) { // very fast, no denormal
+  return longBitsToDouble(doubleToRawLongBits(d) + (((int64_t)e) << 52));
+}
+
+EXPORT CONST double xldexp(double x, int exp) {
+  if (exp >  2100) exp =  2100;
+  if (exp < -2100) exp = -2100;
+
+  int e0 = exp >> 2;
+  if (exp < 0) e0++;
+  if (-100 < exp && exp < 100) e0 = 0;
+  int e1 = exp - (e0 << 2);
+
+  double p = pow2i(e0);
+  double ret = x * pow2i(e1) * p * p * p * p;
+
+  return ret;
+}
+
+static INLINE CONST int ilogbk(double d) {
+  int m = d < 4.9090934652977266E-91;
+  d = m ? 2.037035976334486E90 * d : d;
+  int q = (doubleToRawLongBits(d) >> 52) & 0x7ff;
+  q = m ? q - (300 + 0x03ff) : q - 0x03ff;
+  return q;
+}
+
+// ilogb2k is similar to ilogbk, but the argument has to be a
+// normalized FP value.
+static INLINE CONST int ilogb2k(double d) {
+  return ((doubleToRawLongBits(d) >> 52) & 0x7ff) - 0x3ff;
+}
+
+EXPORT CONST int xilogb(double d) {
+  int e = ilogbk(fabsk(d));
+  e = d == 0.0  ? FP_ILOGB0 : e;
+  e = xisnan(d) ? FP_ILOGBNAN : e;
+  e = xisinf(d) ? INT_MAX : e;
+  return e;
+}
+
+//
+
+#ifndef NDEBUG
+static int checkfp(double x) {
+  if (xisinf(x) || xisnan(x)) return 1;
+  return 0;
+}
+#endif
+
+static INLINE CONST double upper(double d) {
+  return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffff8000000LL);
+}
+
+static INLINE CONST Sleef_double2 dd(double h, double l) {
+  Sleef_double2 ret;
+  ret.x = h; ret.y = l;
+  return ret;
+}
+
+static INLINE CONST Sleef_double2 ddnormalize_d2_d2(Sleef_double2 t) {
+  Sleef_double2 s;
+
+  s.x = t.x + t.y;
+  s.y = t.x - s.x + t.y;
+
+  return s;
+}
+
+static INLINE CONST Sleef_double2 ddscale_d2_d2_d(Sleef_double2 d, double s) {
+  Sleef_double2 r;
+
+  r.x = d.x * s;
+  r.y = d.y * s;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddneg_d2_d2(Sleef_double2 d) {
+  Sleef_double2 r;
+
+  r.x = -d.x;
+  r.y = -d.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddabs_d2_d2(Sleef_double2 x) {
+  return dd(x.x < 0 ? -x.x : x.x, x.x < 0 ? -x.y : x.y);
+}
+
+/*
+ * ddadd and ddadd2 are functions for double-double addition.  ddadd
+ * is simpler and faster than ddadd2, but it requires the absolute
+ * value of first argument to be larger than the second argument. The
+ * exact condition that should be met is checked if NDEBUG macro is
+ * not defined.
+ *
+ * Please note that if the results won't be used, it is no problem to
+ * feed arguments that do not meet this condition. You will see
+ * warning messages if you turn off NDEBUG macro and run tester2, but
+ * this is normal.
+ *
+ * Please see :
+ * Jonathan Richard Shewchuk, Adaptive Precision Floating-Point
+ * Arithmetic and Fast Robust Geometric Predicates, Discrete &
+ * Computational Geometry 18:305-363, 1997.
+ */
+
+static INLINE CONST Sleef_double2 ddadd_d2_d_d(double x, double y) {
+  // |x| >= |y|
+
+  Sleef_double2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x) || checkfp(y) || fabsk(x) >= fabsk(y) || (fabs(x+y) <= fabs(x) && fabs(x+y) <= fabs(y)))) {
+    fprintf(stderr, "[ddadd_d2_d_d : %g, %g]\n", x, y);
+    fflush(stderr);
+  }
+#endif
+
+  r.x = x + y;
+  r.y = x - r.x + y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddadd2_d2_d_d(double x, double y) {
+  Sleef_double2 r;
+
+  r.x = x + y;
+  double v = r.x - x;
+  r.y = (x - (r.x - v)) + (y - v);
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddadd_d2_d2_d(Sleef_double2 x, double y) {
+  // |x| >= |y|
+
+  Sleef_double2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x.x) || checkfp(y) || fabsk(x.x) >= fabsk(y) || (fabs(x.x+y) <= fabs(x.x) && fabs(x.x+y) <= fabs(y)))) {
+    fprintf(stderr, "[ddadd_d2_d2_d : %g %g]\n", x.x, y);
+    fflush(stderr);
+  }
+#endif
+
+  r.x = x.x + y;
+  r.y = x.x - r.x + y + x.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddadd2_d2_d2_d(Sleef_double2 x, double y) {
+  Sleef_double2 r;
+
+  r.x  = x.x + y;
+  double v = r.x - x.x;
+  r.y = (x.x - (r.x - v)) + (y - v);
+  r.y += x.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddadd_d2_d_d2(double x, Sleef_double2 y) {
+  // |x| >= |y|
+
+  Sleef_double2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x) || checkfp(y.x) || fabsk(x) >= fabsk(y.x) || (fabs(x+y.x) <= fabs(x) && fabs(x+y.x) <= fabs(y.x)))) {
+    fprintf(stderr, "[ddadd_d2_d_d2 : %g %g]\n", x, y.x);
+    fflush(stderr);
+  }
+#endif
+
+  r.x = x + y.x;
+  r.y = x - r.x + y.x + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddadd2_d2_d_d2(double x, Sleef_double2 y) {
+  Sleef_double2 r;
+
+  r.x  = x + y.x;
+  double v = r.x - x;
+  r.y = (x - (r.x - v)) + (y.x - v) + y.y;
+
+  return r;
+}
+
+static INLINE CONST double ddadd2_d_d_d2(double x, Sleef_double2 y) { return y.y + y.x + x; }
+
+static INLINE CONST Sleef_double2 ddadd_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
+  // |x| >= |y|
+
+  Sleef_double2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) || (fabs(x.x+y.x) <= fabs(x.x) && fabs(x.x+y.x) <= fabs(y.x)))) {
+    fprintf(stderr, "[ddadd_d2_d2_d2 : %g %g]\n", x.x, y.x);
+    fflush(stderr);
+  }
+#endif
+
+  r.x = x.x + y.x;
+  r.y = x.x - r.x + y.x + x.y + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddadd2_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
+  Sleef_double2 r;
+
+  r.x  = x.x + y.x;
+  double v = r.x - x.x;
+  r.y = (x.x - (r.x - v)) + (y.x - v);
+  r.y += x.y + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddsub_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
+  // |x| >= |y|
+
+  Sleef_double2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x.x) || checkfp(y.x) || fabsk(x.x) >= fabsk(y.x) || (fabs(x.x-y.x) <= fabs(x.x) && fabs(x.x-y.x) <= fabs(y.x)))) {
+    fprintf(stderr, "[ddsub_d2_d2_d2 : %g %g]\n", x.x, y.x);
+    fflush(stderr);
+  }
+#endif
+
+  r.x = x.x - y.x;
+  r.y = x.x - r.x - y.x + x.y - y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 dddiv_d2_d2_d2(Sleef_double2 n, Sleef_double2 d) {
+  double t = 1.0 / d.x;
+  double dh  = upper(d.x), dl  = d.x - dh;
+  double th  = upper(t  ), tl  = t   - th;
+  double nhh = upper(n.x), nhl = n.x - nhh;
+
+  Sleef_double2 q;
+
+  q.x = n.x * t;
+
+  double u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
+    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
+
+  q.y = t * (n.y - q.x * d.y) + u;
+
+  return q;
+}
+
+static INLINE CONST Sleef_double2 ddmul_d2_d_d(double x, double y) {
+  double xh = upper(x), xl = x - xh;
+  double yh = upper(y), yl = y - yh;
+  Sleef_double2 r;
+
+  r.x = x * y;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddmul_d2_d2_d(Sleef_double2 x, double y) {
+  double xh = upper(x.x), xl = x.x - xh;
+  double yh = upper(y  ), yl = y   - yh;
+  Sleef_double2 r;
+
+  r.x = x.x * y;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 ddmul_d2_d2_d2(Sleef_double2 x, Sleef_double2 y) {
+  double xh = upper(x.x), xl = x.x - xh;
+  double yh = upper(y.x), yl = y.x - yh;
+  Sleef_double2 r;
+
+  r.x = x.x * y.x;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
+
+  return r;
+}
+
+static INLINE CONST double ddmul_d_d2_d2(Sleef_double2 x, Sleef_double2 y) {
+  double xh = upper(x.x), xl = x.x - xh;
+  double yh = upper(y.x), yl = y.x - yh;
+
+  return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh;
+}
+
+static INLINE CONST Sleef_double2 ddsqu_d2_d2(Sleef_double2 x) {
+  double xh = upper(x.x), xl = x.x - xh;
+  Sleef_double2 r;
+
+  r.x = x.x * x.x;
+  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
+
+  return r;
+}
+
+static INLINE CONST double ddsqu_d_d2(Sleef_double2 x) {
+  double xh = upper(x.x), xl = x.x - xh;
+
+  return xh * x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh;
+}
+
+static INLINE CONST Sleef_double2 ddrec_d2_d(double d) {
+  double t = 1.0 / d;
+  double dh = upper(d), dl = d - dh;
+  double th = upper(t), tl = t - th;
+  Sleef_double2 q;
+
+  q.x = t;
+  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
+
+  return q;
+}
+
+static INLINE CONST Sleef_double2 ddrec_d2_d2(Sleef_double2 d) {
+  double t = 1.0 / d.x;
+  double dh = upper(d.x), dl = d.x - dh;
+  double th = upper(t  ), tl = t   - th;
+  Sleef_double2 q;
+
+  q.x = t;
+  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t);
+
+  return q;
+}
+
+static INLINE CONST Sleef_double2 ddsqrt_d2_d2(Sleef_double2 d) {
+  double t = sqrt(d.x + d.y);
+  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d2_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5);
+}
+
+static INLINE CONST Sleef_double2 ddsqrt_d2_d(double d) {
+  double t = sqrt(d);
+  return ddscale_d2_d2_d(ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(t, t)), ddrec_d2_d(t)), 0.5);
+}
+
+//
+
+static INLINE CONST double atan2k(double y, double x) {
+  double s, t, u;
+  int q = 0;
+
+  if (x < 0) { x = -x; q = -2; }
+  if (y > x) { t = x; x = y; y = -t; q += 1; }
+
+  s = y / x;
+  t = s * s;
+
+  u = -1.88796008463073496563746e-05;
+  u = mla(u, t, 0.000209850076645816976906797);
+  u = mla(u, t, -0.00110611831486672482563471);
+  u = mla(u, t, 0.00370026744188713119232403);
+  u = mla(u, t, -0.00889896195887655491740809);
+  u = mla(u, t, 0.016599329773529201970117);
+  u = mla(u, t, -0.0254517624932312641616861);
+  u = mla(u, t, 0.0337852580001353069993897);
+  u = mla(u, t, -0.0407629191276836500001934);
+  u = mla(u, t, 0.0466667150077840625632675);
+  u = mla(u, t, -0.0523674852303482457616113);
+  u = mla(u, t, 0.0587666392926673580854313);
+  u = mla(u, t, -0.0666573579361080525984562);
+  u = mla(u, t, 0.0769219538311769618355029);
+  u = mla(u, t, -0.090908995008245008229153);
+  u = mla(u, t, 0.111111105648261418443745);
+  u = mla(u, t, -0.14285714266771329383765);
+  u = mla(u, t, 0.199999999996591265594148);
+  u = mla(u, t, -0.333333333333311110369124);
+
+  t = u * t * s + s;
+  t = q * (M_PI/2) + t;
+
+  return t;
+}
+
+EXPORT CONST double xatan2(double y, double x) {
+  double r = atan2k(fabsk(y), x);
+
+  r = mulsign(r, x);
+  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI  /2)) : 0);
+  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);
+  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);
+
+  return xisnan(x) || xisnan(y) ? NAN : mulsign(r, y);
+}
+
+EXPORT CONST double xasin(double d) {
+  int o = fabsk(d) < 0.5;
+  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), x = o ? fabsk(d) : sqrt(x2), u;
+
+  u = +0.3161587650653934628e-1;
+  u = mla(u, x2, -0.1581918243329996643e-1);
+  u = mla(u, x2, +0.1929045477267910674e-1);
+  u = mla(u, x2, +0.6606077476277170610e-2);
+  u = mla(u, x2, +0.1215360525577377331e-1);
+  u = mla(u, x2, +0.1388715184501609218e-1);
+  u = mla(u, x2, +0.1735956991223614604e-1);
+  u = mla(u, x2, +0.2237176181932048341e-1);
+  u = mla(u, x2, +0.3038195928038132237e-1);
+  u = mla(u, x2, +0.4464285681377102438e-1);
+  u = mla(u, x2, +0.7500000000378581611e-1);
+  u = mla(u, x2, +0.1666666666666497543e+0);
+  u = mla(u, x * x2, x);
+
+  double r = o ? u : (M_PI/2 - 2*u);
+  r = mulsign(r, d);
+
+  return r;
+}
+
+EXPORT CONST double xacos(double d) {
+  int o = fabsk(d) < 0.5;
+  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;
+  double x = o ? fabsk(d) : sqrt(x2);
+  x = fabsk(d) == 1.0 ? 0 : x;
+
+  u = +0.3161587650653934628e-1;
+  u = mla(u, x2, -0.1581918243329996643e-1);
+  u = mla(u, x2, +0.1929045477267910674e-1);
+  u = mla(u, x2, +0.6606077476277170610e-2);
+  u = mla(u, x2, +0.1215360525577377331e-1);
+  u = mla(u, x2, +0.1388715184501609218e-1);
+  u = mla(u, x2, +0.1735956991223614604e-1);
+  u = mla(u, x2, +0.2237176181932048341e-1);
+  u = mla(u, x2, +0.3038195928038132237e-1);
+  u = mla(u, x2, +0.4464285681377102438e-1);
+  u = mla(u, x2, +0.7500000000378581611e-1);
+  u = mla(u, x2, +0.1666666666666497543e+0);
+
+  u *= x * x2;
+
+  double y = 3.1415926535897932/2 - (mulsign(x, d) + mulsign(u, d));
+  x += u;
+  double r = o ? y : (x*2);
+  if (!o && d < 0) r = ddadd_d2_d2_d(dd(3.141592653589793116, 1.2246467991473532072e-16), -r).x;
+
+  return r;
+}
+
+
+EXPORT CONST double xatan(double s) {
+  double t, u;
+  int q = 0;
+
+  if (sign(s) == -1) { s = -s; q = 2; }
+  if (s > 1) { s = 1.0 / s; q |= 1; }
+
+  t = s * s;
+
+  u = -1.88796008463073496563746e-05;
+  u = mla(u, t, 0.000209850076645816976906797);
+  u = mla(u, t, -0.00110611831486672482563471);
+  u = mla(u, t, 0.00370026744188713119232403);
+  u = mla(u, t, -0.00889896195887655491740809);
+  u = mla(u, t, 0.016599329773529201970117);
+  u = mla(u, t, -0.0254517624932312641616861);
+  u = mla(u, t, 0.0337852580001353069993897);
+  u = mla(u, t, -0.0407629191276836500001934);
+  u = mla(u, t, 0.0466667150077840625632675);
+  u = mla(u, t, -0.0523674852303482457616113);
+  u = mla(u, t, 0.0587666392926673580854313);
+  u = mla(u, t, -0.0666573579361080525984562);
+  u = mla(u, t, 0.0769219538311769618355029);
+  u = mla(u, t, -0.090908995008245008229153);
+  u = mla(u, t, 0.111111105648261418443745);
+  u = mla(u, t, -0.14285714266771329383765);
+  u = mla(u, t, 0.199999999996591265594148);
+  u = mla(u, t, -0.333333333333311110369124);
+
+  t = s + s * (t * u);
+
+  if ((q & 1) != 0) t = 1.570796326794896557998982 - t;
+  if ((q & 2) != 0) t = -t;
+
+  return t;
+}
+
+static Sleef_double2 atan2k_u1(Sleef_double2 y, Sleef_double2 x) {
+  double u;
+  Sleef_double2 s, t;
+  int q = 0;
+
+  if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; }
+  if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; }
+
+  s = dddiv_d2_d2_d2(y, x);
+  t = ddsqu_d2_d2(s);
+  t = ddnormalize_d2_d2(t);
+
+  u = 1.06298484191448746607415e-05;
+  u = mla(u, t.x, -0.000125620649967286867384336);
+  u = mla(u, t.x, 0.00070557664296393412389774);
+  u = mla(u, t.x, -0.00251865614498713360352999);
+  u = mla(u, t.x, 0.00646262899036991172313504);
+  u = mla(u, t.x, -0.0128281333663399031014274);
+  u = mla(u, t.x, 0.0208024799924145797902497);
+  u = mla(u, t.x, -0.0289002344784740315686289);
+  u = mla(u, t.x, 0.0359785005035104590853656);
+  u = mla(u, t.x, -0.041848579703592507506027);
+  u = mla(u, t.x, 0.0470843011653283988193763);
+  u = mla(u, t.x, -0.0524914210588448421068719);
+  u = mla(u, t.x, 0.0587946590969581003860434);
+  u = mla(u, t.x, -0.0666620884778795497194182);
+  u = mla(u, t.x, 0.0769225330296203768654095);
+  u = mla(u, t.x, -0.0909090442773387574781907);
+  u = mla(u, t.x, 0.111111108376896236538123);
+  u = mla(u, t.x, -0.142857142756268568062339);
+  u = mla(u, t.x, 0.199999999997977351284817);
+  u = mla(u, t.x, -0.333333333333317605173818);
+
+  t = ddmul_d2_d2_d(t, u);
+  t = ddmul_d2_d2_d2(s, ddadd_d2_d_d2(1, t));
+  if (fabsk(s.x) < 1e-200) t = s;
+  t = ddadd2_d2_d2_d2(ddmul_d2_d2_d(dd(1.570796326794896557998982, 6.12323399573676603586882e-17), q), t);
+
+  return t;
+}
+
+EXPORT CONST double xatan2_u1(double y, double x) {
+  if (fabsk(x) < 5.5626846462680083984e-309) { y *= (1ULL << 53); x *= (1ULL << 53); } // nexttoward((1.0 / DBL_MAX), 1)
+  Sleef_double2 d = atan2k_u1(dd(fabsk(y), 0), dd(x, 0));
+  double r = d.x + d.y;
+
+  r = mulsign(r, x);
+  if (xisinf(x) || x == 0) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI  /2)) : 0);
+  if (xisinf(y)          ) r = M_PI/2 - (xisinf(x) ? (sign(x) * (M_PI*1/4)) : 0);
+  if (             y == 0) r = (sign(x) == -1 ? M_PI : 0);
+
+  return xisnan(x) || xisnan(y) ? NAN : mulsign(r, y);
+}
+
+EXPORT CONST double xasin_u1(double d) {
+  int o = fabsk(d) < 0.5;
+  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;
+  Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2);
+  x = fabsk(d) == 1.0 ? dd(0, 0) : x;
+
+  u = +0.3161587650653934628e-1;
+  u = mla(u, x2, -0.1581918243329996643e-1);
+  u = mla(u, x2, +0.1929045477267910674e-1);
+  u = mla(u, x2, +0.6606077476277170610e-2);
+  u = mla(u, x2, +0.1215360525577377331e-1);
+  u = mla(u, x2, +0.1388715184501609218e-1);
+  u = mla(u, x2, +0.1735956991223614604e-1);
+  u = mla(u, x2, +0.2237176181932048341e-1);
+  u = mla(u, x2, +0.3038195928038132237e-1);
+  u = mla(u, x2, +0.4464285681377102438e-1);
+  u = mla(u, x2, +0.7500000000378581611e-1);
+  u = mla(u, x2, +0.1666666666666497543e+0);
+  u *= x2 * x.x;
+
+  Sleef_double2 y = ddadd_d2_d2_d(ddsub_d2_d2_d2(dd(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), -u);
+  double r = o ? (u + x.x) : ((y.x + y.y)*2);
+  r = mulsign(r, d);
+
+  return r;
+}
+
+EXPORT CONST double xacos_u1(double d) {
+  int o = fabsk(d) < 0.5;
+  double x2 = o ? (d*d) : ((1-fabsk(d))*0.5), u;
+  Sleef_double2 x = o ? dd(fabsk(d), 0) : ddsqrt_d2_d(x2), w;
+  x = fabsk(d) == 1.0 ? dd(0, 0) : x;
+
+  u = +0.3161587650653934628e-1;
+  u = mla(u, x2, -0.1581918243329996643e-1);
+  u = mla(u, x2, +0.1929045477267910674e-1);
+  u = mla(u, x2, +0.6606077476277170610e-2);
+  u = mla(u, x2, +0.1215360525577377331e-1);
+  u = mla(u, x2, +0.1388715184501609218e-1);
+  u = mla(u, x2, +0.1735956991223614604e-1);
+  u = mla(u, x2, +0.2237176181932048341e-1);
+  u = mla(u, x2, +0.3038195928038132237e-1);
+  u = mla(u, x2, +0.4464285681377102438e-1);
+  u = mla(u, x2, +0.7500000000378581611e-1);
+  u = mla(u, x2, +0.1666666666666497543e+0);
+
+  u *= x.x * x2;
+
+  Sleef_double2 y = ddsub_d2_d2_d2(dd(3.141592653589793116/2, 1.2246467991473532072e-16/2),
+                                   ddadd_d2_d_d(mulsign(x.x, d), mulsign(u, d)));
+  x = ddadd_d2_d2_d(x, u);
+  y = o ? y : ddscale_d2_d2_d(x, 2);
+  if (!o && d < 0) y = ddsub_d2_d2_d2(dd(3.141592653589793116, 1.2246467991473532072e-16), y);
+
+  return y.x + y.y;
+}
+
+EXPORT CONST double xatan_u1(double d) {
+  Sleef_double2 d2 = atan2k_u1(dd(fabsk(d), 0), dd(1, 0));
+  double r = d2.x + d2.y;
+  if (xisinf(d)) r = 1.570796326794896557998982;
+  return mulsign(r, d);
+}
+
+EXPORT CONST double xsin(double d) {
+  double u, s, t = d;
+
+  double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24);
+  int ql = rintk(mla(d, M_1_PI, -dqh));
+
+  d = mla(dqh, -PI_A, d);
+  d = mla( ql, -PI_A, d);
+  d = mla(dqh, -PI_B, d);
+  d = mla( ql, -PI_B, d);
+  d = mla(dqh, -PI_C, d);
+  d = mla( ql, -PI_C, d);
+  d = mla(dqh + ql, -PI_D, d);
+
+  s = d * d;
+
+  if ((ql & 1) != 0) d = -d;
+
+  u = -7.97255955009037868891952e-18;
+  u = mla(u, s, 2.81009972710863200091251e-15);
+  u = mla(u, s, -7.64712219118158833288484e-13);
+  u = mla(u, s, 1.60590430605664501629054e-10);
+  u = mla(u, s, -2.50521083763502045810755e-08);
+  u = mla(u, s, 2.75573192239198747630416e-06);
+  u = mla(u, s, -0.000198412698412696162806809);
+  u = mla(u, s, 0.00833333333333332974823815);
+  u = mla(u, s, -0.166666666666666657414808);
+
+  u = mla(s, u * d, d);
+
+  if (!xisinf(t) && (xisnegzero(t) || fabsk(t) > TRIGRANGEMAX)) u = -0.0;
+
+  return u;
+}
+
+EXPORT CONST double xsin_u1(double d) {
+  double u;
+  Sleef_double2 s, t, x;
+  int ql;
+
+  if (fabsk(d) < TRIGRANGEMAX2) {
+    ql = rintk(d * M_1_PI);
+    u = mla(ql, -PI_A2, d);
+    s = ddadd_d2_d_d (u,  ql * -PI_B2);
+  } else {
+    const double dqh = trunck(d * (M_1_PI / (1 << 24))) * (double)(1 << 24);
+    ql = rintk(mla(d, M_1_PI, -dqh));
+
+    u = mla(dqh, -PI_A, d);
+    s = ddadd_d2_d_d  (u,  ql * -PI_A);
+    s = ddadd2_d2_d2_d(s, dqh * -PI_B);
+    s = ddadd2_d2_d2_d(s,  ql * -PI_B);
+    s = ddadd2_d2_d2_d(s, dqh * -PI_C);
+    s = ddadd2_d2_d2_d(s,  ql * -PI_C);
+    s = ddadd_d2_d2_d (s, (dqh + ql) * -PI_D);
+  }
+
+  t = s;
+  s = ddsqu_d2_d2(s);
+
+  u = 2.72052416138529567917983e-15;
+  u = mla(u, s.x, -7.6429259411395447190023e-13);
+  u = mla(u, s.x, 1.60589370117277896211623e-10);
+  u = mla(u, s.x, -2.5052106814843123359368e-08);
+  u = mla(u, s.x, 2.75573192104428224777379e-06);
+  u = mla(u, s.x, -0.000198412698412046454654947);
+  u = mla(u, s.x, 0.00833333333333318056201922);
+
+  x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s));
+
+  u = ddmul_d_d2_d2(t, x);
+
+  if ((ql & 1) != 0) u = -u;
+  if (!xisinf(d) && (xisnegzero(d) || fabsk(d) > TRIGRANGEMAX)) u = -0.0;
+
+  return u;
+}
+
+EXPORT CONST double xcos(double d) {
+  double u, s, t = d;
+
+  double dqh = trunck(d * (M_1_PI / (1LL << 23)) - 0.5 * (M_1_PI / (1LL << 23)));
+  int ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(1LL << 23))+1;
+  dqh *= 1 << 24;
+
+  d = mla(dqh, -PI_A*0.5, d);
+  d = mla( ql, -PI_A*0.5, d);
+  d = mla(dqh, -PI_B*0.5, d);
+  d = mla( ql, -PI_B*0.5, d);
+  d = mla(dqh, -PI_C*0.5, d);
+  d = mla( ql, -PI_C*0.5, d);
+  d = mla(dqh + ql , -PI_D*0.5, d);
+
+  s = d * d;
+
+  if ((ql & 2) == 0) d = -d;
+
+  u = -7.97255955009037868891952e-18;
+  u = mla(u, s, 2.81009972710863200091251e-15);
+  u = mla(u, s, -7.64712219118158833288484e-13);
+  u = mla(u, s, 1.60590430605664501629054e-10);
+  u = mla(u, s, -2.50521083763502045810755e-08);
+  u = mla(u, s, 2.75573192239198747630416e-06);
+  u = mla(u, s, -0.000198412698412696162806809);
+  u = mla(u, s, 0.00833333333333332974823815);
+  u = mla(u, s, -0.166666666666666657414808);
+
+  u = mla(s, u * d, d);
+
+  if (!xisinf(t) && fabsk(t) > TRIGRANGEMAX) u = 1.0;
+
+  return u;
+}
+
+EXPORT CONST double xcos_u1(double d) {
+  double u;
+  Sleef_double2 s, t, x;
+  int ql;
+
+  d = fabsk(d);
+
+  if (d < TRIGRANGEMAX2) {
+    ql = mla(2, rintk(d * M_1_PI - 0.5), 1);
+    s = ddadd2_d2_d_d(d, ql * (-PI_A2*0.5));
+    s = ddadd_d2_d2_d(s, ql * (-PI_B2*0.5));
+  } else {
+    double dqh = trunck(d * (M_1_PI / (1LL << 23)) - 0.5 * (M_1_PI / (1LL << 23)));
+    ql = 2*rintk(d * M_1_PI - 0.5 - dqh * (double)(1LL << 23))+1;
+    dqh *= 1 << 24;
+
+    u = mla(dqh, -PI_A*0.5, d);
+    s = ddadd2_d2_d_d (u,  ql * (-PI_A*0.5));
+    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));
+    s = ddadd2_d2_d2_d(s,  ql * (-PI_B*0.5));
+    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));
+    s = ddadd2_d2_d2_d(s,  ql * (-PI_C*0.5));
+    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));
+  }
+
+  t = s;
+  s = ddsqu_d2_d2(s);
+
+  u = 2.72052416138529567917983e-15;
+  u = mla(u, s.x, -7.6429259411395447190023e-13);
+  u = mla(u, s.x, 1.60589370117277896211623e-10);
+  u = mla(u, s.x, -2.5052106814843123359368e-08);
+  u = mla(u, s.x, 2.75573192104428224777379e-06);
+  u = mla(u, s.x, -0.000198412698412046454654947);
+  u = mla(u, s.x, 0.00833333333333318056201922);
+
+  x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(-0.166666666666666657414808, u * s.x), s));
+
+  u = ddmul_d_d2_d2(t, x);
+
+  if ((((int)ql) & 2) == 0) u = -u;
+  if (!xisinf(d) && d > TRIGRANGEMAX) u = 1.0;
+
+  return u;
+}
+
+EXPORT CONST Sleef_double2 xsincos(double d) {
+  double u, s, t;
+  Sleef_double2 r;
+
+  s = d;
+
+  double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);
+  int ql = rintk(d * (2 * M_1_PI) - dqh);
+
+  s = mla(dqh, -PI_A * 0.5, s);
+  s = mla( ql, -PI_A * 0.5, s);
+  s = mla(dqh, -PI_B * 0.5, s);
+  s = mla( ql, -PI_B * 0.5, s);
+  s = mla(dqh, -PI_C * 0.5, s);
+  s = mla( ql, -PI_C * 0.5, s);
+  s = mla(dqh + ql, -PI_D * 0.5, s);
+
+  t = s;
+
+  s = s * s;
+
+  u = 1.58938307283228937328511e-10;
+  u = mla(u, s, -2.50506943502539773349318e-08);
+  u = mla(u, s, 2.75573131776846360512547e-06);
+  u = mla(u, s, -0.000198412698278911770864914);
+  u = mla(u, s, 0.0083333333333191845961746);
+  u = mla(u, s, -0.166666666666666130709393);
+  u = u * s * t;
+
+  r.x = t + u;
+
+  if (xisnegzero(d)) r.x = -0.0;
+
+  u = -1.13615350239097429531523e-11;
+  u = mla(u, s, 2.08757471207040055479366e-09);
+  u = mla(u, s, -2.75573144028847567498567e-07);
+  u = mla(u, s, 2.48015872890001867311915e-05);
+  u = mla(u, s, -0.00138888888888714019282329);
+  u = mla(u, s, 0.0416666666666665519592062);
+  u = mla(u, s, -0.5);
+
+  r.y = u * s + 1;
+
+  if ((ql & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((ql & 2) != 0) { r.x = -r.x; }
+  if (((ql+1) & 2) != 0) { r.y = -r.y; }
+
+  if (fabsk(d) > TRIGRANGEMAX) { r.x = 0; r.y = 1; }
+  if (xisinf(d)) { r.x = r.y = NAN; }
+
+  return r;
+}
+
+EXPORT CONST Sleef_double2 xsincos_u1(double d) {
+  double u;
+  Sleef_double2 r, s, t, x;
+  int ql;
+
+  if (fabsk(d) < TRIGRANGEMAX2) {
+    ql = rintk(d * (2 * M_1_PI));
+    u = mla(ql, -PI_A2*0.5, d);
+    s = ddadd_d2_d_d (u,  ql * (-PI_B2*0.5));
+  } else {
+    const double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);
+    ql = rintk(d * (2 * M_1_PI) - dqh);
+
+    u = mla(dqh, -PI_A*0.5, d);
+    s = ddadd_d2_d_d(u, ql * (-PI_A*0.5));
+    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));
+    s = ddadd2_d2_d2_d(s, ql * (-PI_B*0.5));
+    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));
+    s = ddadd2_d2_d2_d(s, ql * (-PI_C*0.5));
+    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));
+  }
+
+  t = s;
+
+  s.x = ddsqu_d_d2(s);
+
+  u = 1.58938307283228937328511e-10;
+  u = mla(u, s.x, -2.50506943502539773349318e-08);
+  u = mla(u, s.x, 2.75573131776846360512547e-06);
+  u = mla(u, s.x, -0.000198412698278911770864914);
+  u = mla(u, s.x, 0.0083333333333191845961746);
+  u = mla(u, s.x, -0.166666666666666130709393);
+
+  u *= s.x * t.x;
+
+  x = ddadd_d2_d2_d(t, u);
+  r.x = x.x + x.y;
+
+  if (xisnegzero(d)) r.x = -0.0;
+
+  u = -1.13615350239097429531523e-11;
+  u = mla(u, s.x, 2.08757471207040055479366e-09);
+  u = mla(u, s.x, -2.75573144028847567498567e-07);
+  u = mla(u, s.x, 2.48015872890001867311915e-05);
+  u = mla(u, s.x, -0.00138888888888714019282329);
+  u = mla(u, s.x, 0.0416666666666665519592062);
+  u = mla(u, s.x, -0.5);
+
+  x = ddadd_d2_d_d2(1, ddmul_d2_d_d(s.x, u));
+  r.y = x.x + x.y;
+
+  if ((ql & 1) != 0) { u = r.y; r.y = r.x; r.x = u; }
+  if ((ql & 2) != 0) { r.x = -r.x; }
+  if (((ql+1) & 2) != 0) { r.y = -r.y; }
+
+  if (fabsk(d) > TRIGRANGEMAX) { r.x = 0; r.y = 1; }
+  if (xisinf(d)) { r.x = r.y = NAN; }
+
+  return r;
+}
+
+EXPORT CONST Sleef_double2 xsincospi_u05(double d) {
+  double u, s, t;
+  Sleef_double2 r, x, s2;
+
+  u = d * 4;
+  int q = ceilk(u) & ~(int)1;
+
+  s = u - (double)q;
+  t = s;
+  s = s * s;
+  s2 = ddmul_d2_d_d(t, t);
+
+  //
+
+  u = -2.02461120785182399295868e-14;
+  u = mla(u, s, 6.94821830580179461327784e-12);
+  u = mla(u, s, -1.75724749952853179952664e-09);
+  u = mla(u, s, 3.13361688966868392878422e-07);
+  u = mla(u, s, -3.6576204182161551920361e-05);
+  u = mla(u, s, 0.00249039457019271850274356);
+  x = ddadd2_d2_d_d2(u * s, dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
+  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(0.785398163397448278999491, 3.06287113727155002607105e-17));
+
+  x = ddmul_d2_d2_d(x, t);
+  r.x = x.x + x.y;
+
+  if (xisnegzero(d)) r.x = -0.0;
+
+  //
+
+  u = 9.94480387626843774090208e-16;
+  u = mla(u, s, -3.89796226062932799164047e-13);
+  u = mla(u, s, 1.15011582539996035266901e-10);
+  u = mla(u, s, -2.4611369501044697495359e-08);
+  u = mla(u, s, 3.59086044859052754005062e-06);
+  u = mla(u, s, -0.000325991886927389905997954);
+  x = ddadd2_d2_d_d2(u * s, dd(0.0158543442438155018914259, -1.04693272280631521908845e-18));
+  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), dd(-0.308425137534042437259529, -1.95698492133633550338345e-17));
+
+  x = ddadd2_d2_d2_d(ddmul_d2_d2_d2(x, s2), 1);
+  r.y = x.x + x.y;
+
+  //
+
+  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((q & 4) != 0) { r.x = -r.x; }
+  if (((q+2) & 4) != 0) { r.y = -r.y; }
+
+  if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; }
+  if (xisinf(d)) { r.x = r.y = NAN; }
+
+  return r;
+}
+
+EXPORT CONST Sleef_double2 xsincospi_u35(double d) {
+  double u, s, t;
+  Sleef_double2 r;
+
+  u = d * 4;
+  int q = ceilk(u) & ~(int)1;
+
+  s = u - (double)q;
+  t = s;
+  s = s * s;
+
+  //
+
+  u = +0.6880638894766060136e-11;
+  u = mla(u, s, -0.1757159564542310199e-8);
+  u = mla(u, s, +0.3133616327257867311e-6);
+  u = mla(u, s, -0.3657620416388486452e-4);
+  u = mla(u, s, +0.2490394570189932103e-2);
+  u = mla(u, s, -0.8074551218828056320e-1);
+  u = mla(u, s, +0.7853981633974482790e+0);
+
+  r.x = u * t;
+
+  //
+
+  u = -0.3860141213683794352e-12;
+  u = mla(u, s, +0.1150057888029681415e-9);
+  u = mla(u, s, -0.2461136493006663553e-7);
+  u = mla(u, s, +0.3590860446623516713e-5);
+  u = mla(u, s, -0.3259918869269435942e-3);
+  u = mla(u, s, +0.1585434424381541169e-1);
+  u = mla(u, s, -0.3084251375340424373e+0);
+  u = mla(u, s, 1);
+
+  r.y = u;
+
+  //
+
+  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((q & 4) != 0) { r.x = -r.x; }
+  if (((q+2) & 4) != 0) { r.y = -r.y; }
+
+  if (fabsk(d) > TRIGRANGEMAX3/4) { r.x = 0; r.y = 1; }
+  if (xisinf(d)) { r.x = r.y = NAN; }
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 sinpik(double d) {
+  double u, s, t;
+  Sleef_double2 x, s2;
+
+  u = d * 4;
+  int q = ceilk(u) & ~1;
+  int o = (q & 2) != 0;
+
+  s = u - (double)q;
+  t = s;
+  s = s * s;
+  s2 = ddmul_d2_d_d(t, t);
+
+  //
+
+  u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14;
+  u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12);
+  u = mla(u, s, o ? 1.15011582539996035266901e-10 : -1.75724749952853179952664e-09);
+  u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07);
+  u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05);
+  u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356);
+  x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) :
+         dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
+  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) :
+          dd(0.785398163397448278999491, 3.06287113727155002607105e-17));
+
+  x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0));
+  x = o ? ddadd2_d2_d2_d(x, 1) : x;
+
+  //
+
+  if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; }
+
+  return x;
+}
+
+EXPORT CONST double xsinpi_u05(double d) {
+  Sleef_double2 x = sinpik(d);
+  double r = x.x + x.y;
+
+  if (xisnegzero(d)) r = -0.0;
+  if (fabsk(d) > TRIGRANGEMAX3/4) r = 0;
+  if (xisinf(d)) r = NAN;
+
+  return r;
+}
+
+static INLINE CONST Sleef_double2 cospik(double d) {
+  double u, s, t;
+  Sleef_double2 x, s2;
+
+  u = d * 4;
+  int q = ceilk(u) & ~1;
+  int o = (q & 2) == 0;
+
+  s = u - (double)q;
+  t = s;
+  s = s * s;
+  s2 = ddmul_d2_d_d(t, t);
+
+  //
+
+  u = o ? 9.94480387626843774090208e-16 : -2.02461120785182399295868e-14;
+  u = mla(u, s, o ? -3.89796226062932799164047e-13 : 6.94821830580179461327784e-12);
+  u = mla(u, s, o ? 1.15011582539996035266901e-10 : -1.75724749952853179952664e-09);
+  u = mla(u, s, o ? -2.4611369501044697495359e-08 : 3.13361688966868392878422e-07);
+  u = mla(u, s, o ? 3.59086044859052754005062e-06 : -3.6576204182161551920361e-05);
+  u = mla(u, s, o ? -0.000325991886927389905997954 : 0.00249039457019271850274356);
+  x = ddadd2_d2_d_d2(u * s, o ? dd(0.0158543442438155018914259, -1.04693272280631521908845e-18) :
+         dd(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
+  x = ddadd2_d2_d2_d2(ddmul_d2_d2_d2(s2, x), o ? dd(-0.308425137534042437259529, -1.95698492133633550338345e-17) :
+          dd(0.785398163397448278999491, 3.06287113727155002607105e-17));
+
+  x = ddmul_d2_d2_d2(x, o ? s2 : dd(t, 0));
+  x = o ? ddadd2_d2_d2_d(x, 1) : x;
+
+  //
+
+  if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; }
+
+  return x;
+}
+
+EXPORT CONST double xcospi_u05(double d) {
+  Sleef_double2 x = cospik(d);
+  double r = x.x + x.y;
+
+  if (fabsk(d) > TRIGRANGEMAX3/4) r = 1;
+  if (xisinf(d)) r = NAN;
+
+  return r;
+}
+
+EXPORT CONST double xtan(double d) {
+  double u, s, x;
+
+  double dqh = trunck(d * ((2 * M_1_PI) / (1 << 24))) * (double)(1 << 24);
+  int ql = rintk(d * (2 * M_1_PI) - dqh);
+
+  x = mla(dqh, -PI_A * 0.5, d);
+  x = mla( ql, -PI_A * 0.5, x);
+  x = mla(dqh, -PI_B * 0.5, x);
+  x = mla( ql, -PI_B * 0.5, x);
+  x = mla(dqh, -PI_C * 0.5, x);
+  x = mla( ql, -PI_C * 0.5, x);
+  x = mla(dqh + ql, -PI_D * 0.5, x);
+
+  s = x * x;
+
+  if ((ql & 1) != 0) x = -x;
+
+  u = 9.99583485362149960784268e-06;
+  u = mla(u, s, -4.31184585467324750724175e-05);
+  u = mla(u, s, 0.000103573238391744000389851);
+  u = mla(u, s, -0.000137892809714281708733524);
+  u = mla(u, s, 0.000157624358465342784274554);
+  u = mla(u, s, -6.07500301486087879295969e-05);
+  u = mla(u, s, 0.000148898734751616411290179);
+  u = mla(u, s, 0.000219040550724571513561967);
+  u = mla(u, s, 0.000595799595197098359744547);
+  u = mla(u, s, 0.00145461240472358871965441);
+  u = mla(u, s, 0.0035923150771440177410343);
+  u = mla(u, s, 0.00886321546662684547901456);
+  u = mla(u, s, 0.0218694899718446938985394);
+  u = mla(u, s, 0.0539682539049961967903002);
+  u = mla(u, s, 0.133333333334818976423364);
+  u = mla(u, s, 0.333333333333320047664472);
+
+  u = mla(s, u * x, x);
+
+  if ((ql & 1) != 0) u = 1.0 / u;
+
+  if (xisinf(d)) u = NAN;
+
+  return u;
+}
+
+EXPORT CONST double xtan_u1(double d) {
+  double u;
+  Sleef_double2 s, t, x;
+  int ql;
+
+  if (fabsk(d) < TRIGRANGEMAX2) {
+    ql = rintk(d * (2 * M_1_PI));
+    u = mla(ql, -PI_A2*0.5, d);
+    s = ddadd_d2_d_d(u,  ql * (-PI_B2*0.5));
+  } else {
+    const double dqh = trunck(d * (M_2_PI / (1 << 24))) * (double)(1 << 24);
+    s = ddadd2_d2_d2_d(ddmul_d2_d2_d(dd(M_2_PI_H, M_2_PI_L), d), (d < 0 ? -0.5 : 0.5) - dqh);
+    ql = s.x + s.y;
+
+    u = mla(dqh, -PI_A*0.5, d);
+    s = ddadd_d2_d_d  (u,  ql * (-PI_A*0.5));
+    s = ddadd2_d2_d2_d(s, dqh * (-PI_B*0.5));
+    s = ddadd2_d2_d2_d(s,  ql * (-PI_B*0.5));
+    s = ddadd2_d2_d2_d(s, dqh * (-PI_C*0.5));
+    s = ddadd2_d2_d2_d(s,  ql * (-PI_C*0.5));
+    s = ddadd_d2_d2_d(s, (dqh + ql) * (-PI_D*0.5));
+  }
+
+  if ((ql & 1) != 0) s = ddneg_d2_d2(s);
+
+  t = s;
+  s = ddsqu_d2_d2(s);
+
+  u = 1.01419718511083373224408e-05;
+  u = mla(u, s.x, -2.59519791585924697698614e-05);
+  u = mla(u, s.x, 5.23388081915899855325186e-05);
+  u = mla(u, s.x, -3.05033014433946488225616e-05);
+  u = mla(u, s.x, 7.14707504084242744267497e-05);
+  u = mla(u, s.x, 8.09674518280159187045078e-05);
+  u = mla(u, s.x, 0.000244884931879331847054404);
+  u = mla(u, s.x, 0.000588505168743587154904506);
+  u = mla(u, s.x, 0.00145612788922812427978848);
+  u = mla(u, s.x, 0.00359208743836906619142924);
+  u = mla(u, s.x, 0.00886323944362401618113356);
+  u = mla(u, s.x, 0.0218694882853846389592078);
+  u = mla(u, s.x, 0.0539682539781298417636002);
+  u = mla(u, s.x, 0.133333333333125941821962);
+
+  x = ddadd_d2_d_d2(1, ddmul_d2_d2_d2(ddadd_d2_d_d(0.333333333333334980164153, u * s.x), s));
+  x = ddmul_d2_d2_d2(t, x);
+
+  if ((ql & 1) != 0) x = ddrec_d2_d2(x);
+
+  u = x.x + x.y;
+
+  if (!xisinf(d) && (xisnegzero(d) || fabsk(d) > TRIGRANGEMAX)) u = -0.0;
+
+  return u;
+}
+
+EXPORT CONST double xlog(double d) {
+  double x, x2, t, m;
+  int e;
+
+  int o = d < DBL_MIN;
+  if (o) d *= (double)(1LL << 32) * (double)(1LL << 32);
+
+  e = ilogb2k(d * (1.0/0.75));
+  m = ldexp3k(d, -e);
+
+  if (o) e -= 64;
+
+  x = (m-1) / (m+1);
+  x2 = x * x;
+
+  t = 0.153487338491425068243146;
+  t = mla(t, x2, 0.152519917006351951593857);
+  t = mla(t, x2, 0.181863266251982985677316);
+  t = mla(t, x2, 0.222221366518767365905163);
+  t = mla(t, x2, 0.285714294746548025383248);
+  t = mla(t, x2, 0.399999999950799600689777);
+  t = mla(t, x2, 0.6666666666667778740063);
+  t = mla(t, x2, 2);
+
+  x = x * t + 0.693147180559945286226764 * e;
+
+  if (xisinf(d)) x = INFINITY;
+  if (d < 0 || xisnan(d)) x = NAN;
+  if (d == 0) x = -INFINITY;
+
+  return x;
+}
+
+EXPORT CONST double xexp(double d) {
+  int q = (int)rintk(d * R_LN2);
+  double s, u;
+
+  s = mla(q, -L2U, d);
+  s = mla(q, -L2L, s);
+
+  u = 2.08860621107283687536341e-09;
+  u = mla(u, s, 2.51112930892876518610661e-08);
+  u = mla(u, s, 2.75573911234900471893338e-07);
+  u = mla(u, s, 2.75572362911928827629423e-06);
+  u = mla(u, s, 2.4801587159235472998791e-05);
+  u = mla(u, s, 0.000198412698960509205564975);
+  u = mla(u, s, 0.00138888888889774492207962);
+  u = mla(u, s, 0.00833333333331652721664984);
+  u = mla(u, s, 0.0416666666666665047591422);
+  u = mla(u, s, 0.166666666666666851703837);
+  u = mla(u, s, 0.5);
+
+  u = s * s * u + s + 1;
+  u = ldexp2k(u, q);
+
+  if (d > 709.78271114955742909217217426) u = INFINITY;
+  if (d < -1000) u = 0;
+
+  return u;
+}
+
+static INLINE CONST Sleef_double2 logk(double d) {
+  Sleef_double2 x, x2, s;
+  double m, t;
+  int e;
+
+  int o = d < DBL_MIN;
+  if (o) d *= (double)(1LL << 32) * (double)(1LL << 32);
+
+  e = ilogb2k(d * (1.0/0.75));
+  m = ldexp3k(d, -e);
+
+  if (o) e -= 64;
+
+  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
+  x2 = ddsqu_d2_d2(x);
+
+  t = 0.116255524079935043668677;
+  t = mla(t, x2.x, 0.103239680901072952701192);
+  t = mla(t, x2.x, 0.117754809412463995466069);
+  t = mla(t, x2.x, 0.13332981086846273921509);
+  t = mla(t, x2.x, 0.153846227114512262845736);
+  t = mla(t, x2.x, 0.181818180850050775676507);
+  t = mla(t, x2.x, 0.222222222230083560345903);
+  t = mla(t, x2.x, 0.285714285714249172087875);
+  t = mla(t, x2.x, 0.400000000000000077715612);
+  Sleef_double2 c = dd(0.666666666666666629659233, 3.80554962542412056336616e-17);
+
+  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
+  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
+  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(ddmul_d2_d2_d2(x2, x),
+                                      ddadd2_d2_d2_d2(ddmul_d2_d2_d(x2, t), c)));
+  return s;
+}
+
+EXPORT CONST double xlog_u1(double d) {
+  Sleef_double2 x, s;
+  double m, t, x2;
+  int e;
+
+  int o = d < DBL_MIN;
+  if (o) d *= (double)(1LL << 32) * (double)(1LL << 32);
+
+  e = ilogb2k(d * (1.0/0.75));
+  m = ldexp3k(d, -e);
+
+  if (o) e -= 64;
+
+  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
+  x2 = x.x * x.x;
+
+  t = 0.1532076988502701353e+0;
+  t = mla(t, x2, 0.1525629051003428716e+0);
+  t = mla(t, x2, 0.1818605932937785996e+0);
+  t = mla(t, x2, 0.2222214519839380009e+0);
+  t = mla(t, x2, 0.2857142932794299317e+0);
+  t = mla(t, x2, 0.3999999999635251990e+0);
+  t = mla(t, x2, 0.6666666666667333541e+0);
+
+  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);
+  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
+  s = ddadd_d2_d2_d(s, x2 * x.x * t);
+
+  double r = s.x + s.y;
+
+  if (xisinf(d)) r = INFINITY;
+  if (d < 0 || xisnan(d)) r = NAN;
+  if (d == 0) r = -INFINITY;
+
+  return r;
+}
+
+static INLINE CONST double expk(Sleef_double2 d) {
+  int q = (int)rintk((d.x + d.y) * R_LN2);
+  Sleef_double2 s, t;
+  double u;
+
+  s = ddadd2_d2_d2_d(d, q * -L2U);
+  s = ddadd2_d2_d2_d(s, q * -L2L);
+
+  s = ddnormalize_d2_d2(s);
+
+  u = 2.51069683420950419527139e-08;
+  u = mla(u, s.x, 2.76286166770270649116855e-07);
+  u = mla(u, s.x, 2.75572496725023574143864e-06);
+  u = mla(u, s.x, 2.48014973989819794114153e-05);
+  u = mla(u, s.x, 0.000198412698809069797676111);
+  u = mla(u, s.x, 0.0013888888939977128960529);
+  u = mla(u, s.x, 0.00833333333332371417601081);
+  u = mla(u, s.x, 0.0416666666665409524128449);
+  u = mla(u, s.x, 0.166666666666666740681535);
+  u = mla(u, s.x, 0.500000000000000999200722);
+
+  t = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddsqu_d2_d2(s), u));
+
+  t = ddadd_d2_d_d2(1, t);
+
+  u = ldexpk(t.x + t.y, q);
+
+  if (d.x < -1000) u = 0;
+
+  return u;
+}
+
+EXPORT CONST double xpow(double x, double y) {
+  int yisint = xisint(y);
+  int yisodd = yisint && xisodd(y);
+
+  Sleef_double2 d = ddmul_d2_d2_d(logk(fabsk(x)), y);
+  double result = expk(d);
+  if (d.x > 709.78271114955742909217217426) result = INFINITY;
+
+  result = xisnan(result) ? INFINITY : result;
+  result *= (x > 0 ? 1 : (!yisint ? NAN : (yisodd ? -1 : 1)));
+
+  double efx = mulsign(fabsk(x) - 1, y);
+  if (xisinf(y)) result = efx < 0 ? 0.0 : (efx == 0 ? 1.0 : INFINITY);
+  if (xisinf(x) || x == 0) result = (yisodd ? sign(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITY);
+  if (xisnan(x) || xisnan(y)) result = NAN;
+  if (y == 0 || x == 1) result = 1;
+
+  return result;
+}
+
+EXPORT CONST double xpown(double x, int y) {
+  return xpow(x, (double)y);
+}
+
+EXPORT CONST double xpowr(double x, double y) {
+  if (x < 0.0)
+    return NAN;
+  if (isnan(y))
+    return y;
+  return xpow(x, y);
+}
+
+static INLINE CONST Sleef_double2 expk2(Sleef_double2 d) {
+  int q = (int)rintk((d.x + d.y) * R_LN2);
+  Sleef_double2 s, t;
+  double u;
+
+  s = ddadd2_d2_d2_d(d, q * -L2U);
+  s = ddadd2_d2_d2_d(s, q * -L2L);
+
+  u = +0.1602472219709932072e-9;
+  u = mla(u, s.x, +0.2092255183563157007e-8);
+  u = mla(u, s.x, +0.2505230023782644465e-7);
+  u = mla(u, s.x, +0.2755724800902135303e-6);
+  u = mla(u, s.x, +0.2755731892386044373e-5);
+  u = mla(u, s.x, +0.2480158735605815065e-4);
+  u = mla(u, s.x, +0.1984126984148071858e-3);
+  u = mla(u, s.x, +0.1388888888886763255e-2);
+  u = mla(u, s.x, +0.8333333333333347095e-2);
+  u = mla(u, s.x, +0.4166666666666669905e-1);
+
+  t = ddadd2_d2_d2_d(ddmul_d2_d2_d(s, u), +0.1666666666666666574e+0);
+  t = ddadd2_d2_d2_d(ddmul_d2_d2_d2(s, t), 0.5);
+  t = ddadd2_d2_d2_d2(s, ddmul_d2_d2_d2(ddsqu_d2_d2(s), t));
+
+  t = ddadd2_d2_d_d2(1, t);
+
+  t.x = ldexp2k(t.x, q);
+  t.y = ldexp2k(t.y, q);
+
+  return d.x < -1000 ? dd(0, 0) : t;
+}
+
+EXPORT CONST double xsinh(double x) {
+  double y = fabsk(x);
+  Sleef_double2 d = expk2(dd(y, 0));
+  d = ddsub_d2_d2_d2(d, ddrec_d2_d2(d));
+  y = (d.x + d.y) * 0.5;
+
+  y = fabsk(x) > 710 ? INFINITY : y;
+  y = xisnan(y) ? INFINITY : y;
+  y = mulsign(y, x);
+  y = xisnan(x) ? NAN : y;
+
+  return y;
+}
+
+EXPORT CONST double xcosh(double x) {
+  double y = fabsk(x);
+  Sleef_double2 d = expk2(dd(y, 0));
+  d = ddadd_d2_d2_d2(d, ddrec_d2_d2(d));
+  y = (d.x + d.y) * 0.5;
+
+  y = fabsk(x) > 710 ? INFINITY : y;
+  y = xisnan(y) ? INFINITY : y;
+  y = xisnan(x) ? NAN : y;
+
+  return y;
+}
+
+EXPORT CONST double xtanh(double x) {
+  double y = fabsk(x);
+  Sleef_double2 d = expk2(dd(y, 0));
+  Sleef_double2 e = ddrec_d2_d2(d);
+  d = dddiv_d2_d2_d2(ddsub_d2_d2_d2(d, e), ddadd_d2_d2_d2(d, e));
+  y = d.x + d.y;
+
+  y = fabsk(x) > 18.714973875 ? 1.0 : y;
+  y = xisnan(y) ? 1.0 : y;
+  y = mulsign(y, x);
+  y = xisnan(x) ? NAN : y;
+
+  return y;
+}
+
+static INLINE CONST Sleef_double2 logk2(Sleef_double2 d) {
+  Sleef_double2 x, x2, m, s;
+  double t;
+  int e;
+
+  e = ilogbk(d.x * (1.0/0.75));
+
+  m.x = ldexp2k(d.x, -e);
+  m.y = ldexp2k(d.y, -e);
+
+  x = dddiv_d2_d2_d2(ddadd2_d2_d2_d(m, -1), ddadd2_d2_d2_d(m, 1));
+  x2 = ddsqu_d2_d2(x);
+
+  t = 0.13860436390467167910856;
+  t = mla(t, x2.x, 0.131699838841615374240845);
+  t = mla(t, x2.x, 0.153914168346271945653214);
+  t = mla(t, x2.x, 0.181816523941564611721589);
+  t = mla(t, x2.x, 0.22222224632662035403996);
+  t = mla(t, x2.x, 0.285714285511134091777308);
+  t = mla(t, x2.x, 0.400000000000914013309483);
+  t = mla(t, x2.x, 0.666666666666664853302393);
+
+  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
+  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
+  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d(ddmul_d2_d2_d2(x2, x), t));
+
+  return s;
+}
+
+EXPORT CONST double xasinh(double x) {
+  double y = fabsk(x);
+  Sleef_double2 d;
+
+  d = y > 1 ? ddrec_d2_d(x) : dd(y, 0);
+  d = ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(d), 1));
+  d = y > 1 ? ddmul_d2_d2_d(d, y) : d;
+
+  d = logk2(ddnormalize_d2_d2(ddadd_d2_d2_d(d, x)));
+  y = d.x + d.y;
+
+  y = (fabsk(x) > SQRT_DBL_MAX || xisnan(y)) ? mulsign(INFINITY, x) : y;
+  y = xisnan(x) ? NAN : y;
+  y = xisnegzero(x) ? -0.0 : y;
+
+  return y;
+}
+
+EXPORT CONST double xacosh(double x) {
+  Sleef_double2 d = logk2(ddadd2_d2_d2_d(ddmul_d2_d2_d2(ddsqrt_d2_d2(ddadd2_d2_d_d(x, 1)), ddsqrt_d2_d2(ddadd2_d2_d_d(x, -1))), x));
+  double y = d.x + d.y;
+
+  y = (x > SQRT_DBL_MAX || xisnan(y)) ? INFINITY : y;
+  y = x == 1.0 ? 0.0 : y;
+  y = x < 1.0 ? NAN : y;
+  y = xisnan(x) ? NAN : y;
+
+  return y;
+}
+
+EXPORT CONST double xatanh(double x) {
+  double y = fabsk(x);
+  Sleef_double2 d = logk2(dddiv_d2_d2_d2(ddadd2_d2_d_d(1, y), ddadd2_d2_d_d(1, -y)));
+  y = y > 1.0 ? NAN : (y == 1.0 ? INFINITY : (d.x + d.y) * 0.5);
+
+  y = mulsign(y, x);
+  y = (xisinf(x) || xisnan(y)) ? NAN : y;
+
+  return y;
+}
+
+//
+
+EXPORT CONST double xcbrt(double d) { // max error : 2 ulps
+  double x, y, q = 1.0;
+  int e, r;
+
+  e = ilogbk(fabsk(d))+1;
+  d = ldexp2k(d, -e);
+  r = (e + 6144) % 3;
+  q = (r == 1) ? 1.2599210498948731647672106 : q;
+  q = (r == 2) ? 1.5874010519681994747517056 : q;
+  q = ldexp2k(q, (e + 6144) / 3 - 2048);
+
+  q = mulsign(q, d);
+  d = fabsk(d);
+
+  x = -0.640245898480692909870982;
+  x = mla(x, d, 2.96155103020039511818595);
+  x = mla(x, d, -5.73353060922947843636166);
+  x = mla(x, d, 6.03990368989458747961407);
+  x = mla(x, d, -3.85841935510444988821632);
+  x = mla(x, d, 2.2307275302496609725722);
+
+  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
+  y = d * x * x;
+  y = (y - (2.0 / 3.0) * y * (y * x - 1)) * q;
+
+  return y;
+}
+
+EXPORT CONST double xcbrt_u1(double d) {
+  double x, y, z;
+  Sleef_double2 q2 = dd(1, 0), u, v;
+  int e, r;
+
+  e = ilogbk(fabsk(d))+1;
+  d = ldexp2k(d, -e);
+  r = (e + 6144) % 3;
+  q2 = (r == 1) ? dd(1.2599210498948731907, -2.5899333753005069177e-17) : q2;
+  q2 = (r == 2) ? dd(1.5874010519681995834, -1.0869008194197822986e-16) : q2;
+
+  q2.x = mulsign(q2.x, d); q2.y = mulsign(q2.y, d);
+  d = fabsk(d);
+
+  x = -0.640245898480692909870982;
+  x = mla(x, d, 2.96155103020039511818595);
+  x = mla(x, d, -5.73353060922947843636166);
+  x = mla(x, d, 6.03990368989458747961407);
+  x = mla(x, d, -3.85841935510444988821632);
+  x = mla(x, d, 2.2307275302496609725722);
+
+  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0);
+
+  z = x;
+
+  u = ddmul_d2_d_d(x, x);
+  u = ddmul_d2_d2_d2(u, u);
+  u = ddmul_d2_d2_d(u, d);
+  u = ddadd2_d2_d2_d(u, -x);
+  y = u.x + u.y;
+
+  y = -2.0 / 3.0 * y * z;
+  v = ddadd2_d2_d2_d(ddmul_d2_d_d(z, z), y);
+  v = ddmul_d2_d2_d(v, d);
+  v = ddmul_d2_d2_d2(v, q2);
+  z = ldexp2k(v.x + v.y, (e + 6144) / 3 - 2048);
+
+  if (xisinf(d)) { z = mulsign(INFINITY, q2.x); }
+  if (d == 0) { z = mulsign(0, q2.x); }
+
+  return z;
+}
+
+EXPORT CONST double xexp2(double d) {
+  int q = (int)rintk(d);
+  double s, u;
+
+  s = d - q;
+
+  u = +0.4434359082926529454e-9;
+  u = mla(u, s, +0.7073164598085707425e-8);
+  u = mla(u, s, +0.1017819260921760451e-6);
+  u = mla(u, s, +0.1321543872511327615e-5);
+  u = mla(u, s, +0.1525273353517584730e-4);
+  u = mla(u, s, +0.1540353045101147808e-3);
+  u = mla(u, s, +0.1333355814670499073e-2);
+  u = mla(u, s, +0.9618129107597600536e-2);
+  u = mla(u, s, +0.5550410866482046596e-1);
+  u = mla(u, s, +0.2402265069591012214e+0);
+  u = mla(u, s, +0.6931471805599452862e+0);
+  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;
+
+  u = ldexp2k(u, q);
+
+  if (d >= 1024) u = INFINITY;
+  if (d < -2000) u = 0;
+  
+  return u;
+}
+
+EXPORT CONST double xexp10(double d) {
+  int q = (int)rintk(d * LOG10_2);
+  double s, u;
+  
+  s = mla(q, -L10U, d);
+  s = mla(q, -L10L, s);
+  
+  u = +0.2411463498334267652e-3;
+  u = mla(u, s, +0.1157488415217187375e-2);
+  u = mla(u, s, +0.5013975546789733659e-2);
+  u = mla(u, s, +0.1959762320720533080e-1);
+  u = mla(u, s, +0.6808936399446784138e-1);
+  u = mla(u, s, +0.2069958494722676234e+0);
+  u = mla(u, s, +0.5393829292058536229e+0);
+  u = mla(u, s, +0.1171255148908541655e+1);
+  u = mla(u, s, +0.2034678592293432953e+1);
+  u = mla(u, s, +0.2650949055239205876e+1);
+  u = mla(u, s, +0.2302585092994045901e+1);
+  u = ddnormalize_d2_d2(ddadd_d2_d_d2(1, ddmul_d2_d_d(u, s))).x;
+  
+  u = ldexp2k(u, q);
+  
+  if (d > 308.25471555991671) u = INFINITY; // log10(DBL_MAX)
+  if (d < -350) u = 0;
+  
+  return u;
+}
+
+EXPORT CONST double xexpm1(double a) {
+  Sleef_double2 d = ddadd2_d2_d2_d(expk2(dd(a, 0)), -1.0);
+  double x = d.x + d.y;
+  if (a > 709.782712893383996732223) x = INFINITY; // log(DBL_MAX)
+  if (a < -36.736800569677101399113302437) x = -1; // log(1 - nexttoward(1, 0))
+  if (xisnegzero(a)) x = -0.0;
+  return x;
+}
+
+EXPORT CONST double xlog10(double d) {
+  Sleef_double2 x, s;
+  double m, t, x2;
+  int e;
+
+  int o = d < DBL_MIN;
+  if (o) d *= (double)(1LL << 32) * (double)(1LL << 32);
+      
+  e = ilogb2k(d * (1.0/0.75));
+  m = ldexp3k(d, -e);
+
+  if (o) e -= 64;
+
+  x = dddiv_d2_d2_d2(ddadd2_d2_d_d(-1, m), ddadd2_d2_d_d(1, m));
+  x2 = x.x * x.x;
+
+  t = +0.6653725819576758460e-1;
+  t = mla(t, x2, +0.6625722782820833712e-1);
+  t = mla(t, x2, +0.7898105214313944078e-1);
+  t = mla(t, x2, +0.9650955035715275132e-1);
+  t = mla(t, x2, +0.1240841409721444993e+0);
+  t = mla(t, x2, +0.1737177927454605086e+0);
+  t = mla(t, x2, +0.2895296546021972617e+0);
+  
+  s = ddmul_d2_d2_d(dd(0.30102999566398119802, -2.803728127785170339e-18), (double)e);
+  s = ddadd_d2_d2_d2(s, ddmul_d2_d2_d2(x, dd(0.86858896380650363334, 1.1430059694096389311e-17)));
+  s = ddadd_d2_d2_d(s, x2 * x.x * t);
+
+  double r = s.x + s.y;
+  
+  if (xisinf(d)) r = INFINITY;
+  if (d < 0 || xisnan(d)) r = NAN;
+  if (d == 0) r = -INFINITY;
+
+  return r;
+}
+
+static INLINE CONST double xlog1p_fast(double d) {
+  Sleef_double2 x, s;
+  double m, t, x2;
+  int e;
+
+  double dp1 = d + 1;
+
+  int o = dp1 < DBL_MIN;
+  if (o) dp1 *= (double)(1LL << 32) * (double)(1LL << 32);
+
+  e = ilogb2k(dp1 * (1.0/0.75));
+
+  t = ldexp3k(1, -e);
+  m = mla(d, t, t - 1);
+
+  if (o) e -= 64;
+
+  x = dddiv_d2_d2_d2(dd(m, 0), ddadd_d2_d_d(2, m));
+  x2 = x.x * x.x;
+
+  t = 0.1532076988502701353e+0;
+  t = mla(t, x2, 0.1525629051003428716e+0);
+  t = mla(t, x2, 0.1818605932937785996e+0);
+  t = mla(t, x2, 0.2222214519839380009e+0);
+  t = mla(t, x2, 0.2857142932794299317e+0);
+  t = mla(t, x2, 0.3999999999635251990e+0);
+  t = mla(t, x2, 0.6666666666667333541e+0);
+
+  s = ddmul_d2_d2_d(dd(0.693147180559945286226764, 2.319046813846299558417771e-17), (double)e);
+  s = ddadd_d2_d2_d2(s, ddscale_d2_d2_d(x, 2));
+  s = ddadd_d2_d2_d(s, x2 * x.x * t);
+
+  double r = s.x + s.y;
+
+  if (d == INFINITY) r = INFINITY;
+  if (d < -1) r = NAN;
+  if (d == -1) r = -INFINITY;
+  if (xisnegzero(d)) r = -0.0;
+  if (xisnan(d)) r = d;
+
+  return r;
+}
+
+EXPORT CONST double xlog1p(double d) {
+  if (d > 0x1.0p+1000)
+    return xlog(d);
+  else
+    return xlog1p_fast(d);
+}
+
+//
+
+EXPORT CONST double xfma(double x, double y, double z) {
+#if __has_builtin(__builtin_fma)
+  return __builtin_fma(x, y, z);
+#else
+#warning Using software FMA
+  double h2 = x * y + z, q = 1;
+  if (fabsk(h2) < 1e-300) {
+    const double c0 = 1ULL << 54, c1 = c0 * c0, c2 = c1 * c1;
+    x *= c1;
+    y *= c1;
+    z *= c2;
+    q = 1.0 / c2;
+  }
+  if (fabsk(h2) > 1e+300) {
+    const double c0 = 1ULL << 54, c1 = c0 * c0, c2 = c1 * c1;
+    x *= 1.0 / c1;
+    y *= 1.0 / c1;
+    z *= 1. / c2;
+    q = c2;
+  }
+  Sleef_double2 d = ddmul_d2_d_d(x, y);
+  d = ddadd2_d2_d2_d(d, z);
+  double ret = (x == 0 || y == 0) ? z : (d.x + d.y);
+  if ((xisinf(z) && !xisinf(x) && !xisnan(x) && !xisinf(y) && !xisnan(y))) h2 = z;
+  return (xisinf(h2) || xisnan(h2)) ? h2 : ret*q;
+#endif
+}
+
+EXPORT CONST double xsqrt_u05(double d) {
+#if __has_builtin(__builtin_sqrt)
+  return __builtin_sqrt(d);
+#else
+#warning Using software SQRT
+  double q = 0.5;
+
+  d = d < 0 ? NAN : d;
+
+  if (d < 8.636168555094445E-78) {
+    d *= 1.157920892373162E77;
+    q = 2.9387358770557188E-39 * 0.5;
+  }
+
+  if (d > 1.3407807929942597e+154) {
+    d *= 7.4583407312002070e-155;
+    q = 1.1579208923731620e+77 * 0.5;
+  }
+
+  // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+  double x = longBitsToDouble(0x5fe6ec85e7de30da - (doubleToRawLongBits(d + 1e-320) >> 1));
+
+  x = x * (1.5 - 0.5 * d * x * x);
+  x = x * (1.5 - 0.5 * d * x * x);
+  x = x * (1.5 - 0.5 * d * x * x) * d;
+
+  Sleef_double2 d2 = ddmul_d2_d2_d2(ddadd2_d2_d_d2(d, ddmul_d2_d_d(x, x)), ddrec_d2_d(x));
+
+  double ret = (d2.x + d2.y) * q;
+
+  ret = d == INFINITY ? INFINITY : ret;
+  ret = d == 0 ? d : ret;
+
+  return ret;
+#endif
+}
+
+EXPORT CONST double xfabs(double x) { return fabsk(x); }
+
+EXPORT CONST double xcopysign(double x, double y) { return copysignk(x, y); }
+
+EXPORT CONST double xfmax(double x, double y) {
+  return y != y ? x : (x > y ? x : y);
+}
+
+EXPORT CONST double xfmin(double x, double y) {
+  return y != y ? x : (x < y ? x : y);
+}
+
+EXPORT CONST double xfdim(double x, double y) {
+  double ret = x - y;
+  if (ret < 0 || x == y) ret = 0;
+  return ret;
+}
+
+EXPORT CONST double xtrunc(double x) {
+  double fr = x - (double)(1LL << 31) * (int32_t)(x * (1.0 / (1LL << 31)));
+  fr = fr - (int32_t)fr;
+  return (xisinf(x) || fabsk(x) >= (double)(1LL << 52)) ? x : copysignk(x - fr, x);
+}
+
+EXPORT CONST double xfloor(double x) {
+  double fr = x - (double)(1LL << 31) * (int32_t)(x * (1.0 / (1LL << 31)));
+  fr = fr - (int32_t)fr;
+  fr = fr < 0 ? fr+1.0 : fr;
+  return (xisinf(x) || fabsk(x) >= (double)(1LL << 52)) ? x : copysignk(x - fr, x);
+}
+
+EXPORT CONST double xceil(double x) {
+  double fr = x - (double)(1LL << 31) * (int32_t)(x * (1.0 / (1LL << 31)));
+  fr = fr - (int32_t)fr;
+  fr = fr <= 0 ? fr : fr-1.0;
+  return (xisinf(x) || fabsk(x) >= (double)(1LL << 52)) ? x : copysignk(x - fr, x);
+}
+
+EXPORT CONST double xround(double d) {
+  double x = d + 0.5;
+  double fr = x - (double)(1LL << 31) * (int32_t)(x * (1.0 / (1LL << 31)));
+  fr = fr - (int32_t)fr;
+  if (fr == 0 && x <= 0) x--;
+  fr = fr < 0 ? fr+1.0 : fr;
+  x = d == 0.49999999999999994449 ? 0 : x;  // nextafter(0.5, 0)
+  return (xisinf(d) || fabsk(d) >= (double)(1LL << 52)) ? d : copysignk(x - fr, d);
+}
+
+EXPORT CONST double xrint(double d) {
+  double x = d + 0.5;
+  double fr = x - (double)(1LL << 31) * (int32_t)(x * (1.0 / (1LL << 31)));
+  int32_t isodd = (1 & (int32_t)fr) != 0;
+  fr = fr - (int32_t)fr;
+  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0 : fr;
+  x = d == 0.50000000000000011102 ? 0 : x;  // nextafter(0.5, 1)
+  return (xisinf(d) || fabsk(d) >= (double)(1LL << 52)) ? d : copysignk(x - fr, d);
+}
+
+EXPORT CONST double xhypot_u05(double x, double y) {
+  x = fabsk(x);
+  y = fabsk(y);
+  double min = fmink(x, y), n = min;
+  double max = fmaxk(x, y), d = max;
+
+  if (max < DBL_MIN) { n *= 1ULL << 54; d *= 1ULL << 54; }
+  Sleef_double2 t = dddiv_d2_d2_d2(dd(n, 0), dd(d, 0));
+  t = ddmul_d2_d2_d(ddsqrt_d2_d2(ddadd2_d2_d2_d(ddsqu_d2_d2(t), 1)), max);
+  double ret = t.x + t.y;
+  if (xisnan(ret)) ret = INFINITY;
+  if (min == 0) ret = max;
+  if (xisnan(x) || xisnan(y)) ret = NAN;
+  if (x == INFINITY || y == INFINITY) ret = INFINITY;
+  return ret;
+}
+
+EXPORT CONST double xhypot_u35(double x, double y) {
+  x = fabsk(x);
+  y = fabsk(y);
+  double min = fmink(x, y);
+  double max = fmaxk(x, y);
+
+  double t = min / max;
+  double ret = max * sqrt(1 + t*t);
+  if (min == 0) ret = max;
+  if (xisnan(x) || xisnan(y)) ret = NAN;
+  if (x == INFINITY || y == INFINITY) ret = INFINITY;
+  return ret;
+}
+
+EXPORT CONST double xnextafter(double x, double y) {
+  union {
+    double f;
+    int64_t i;
+  } cx;
+
+  x = x == 0 ? mulsign(0, y) : x;
+  cx.f = x;
+  int c = (cx.i < 0) == (y < x);
+  if (c) cx.i = -(cx.i ^ (1ULL << 63));
+
+  if (x != y) cx.i--;
+
+  if (c) cx.i = -(cx.i ^ (1ULL << 63));
+
+  if (cx.f == 0 && x != 0) cx.f = mulsign(0, x);
+  if (x == 0 && y == 0) cx.f = y;
+  if (xisnan(x) || xisnan(y)) cx.f = NAN;
+
+  return cx.f;
+}
+
+EXPORT CONST double xfrfrexp(double x) {
+  union {
+    double f;
+    uint64_t u;
+  } cx;
+
+  if (xisnan(x)) return x;
+
+  if (fabsk(x) < DBL_MIN) x *= (1ULL << 63);
+
+  cx.f = x;
+  cx.u &= ~0x7ff0000000000000ULL;
+  cx.u |=  0x3fe0000000000000ULL;
+
+  if (xisinf(x)) cx.f = mulsign(INFINITY, x);
+  if (x == 0) cx.f = x;
+
+  return cx.f;
+}
+
+EXPORT CONST int xexpfrexp(double x) {
+  union {
+    double f;
+    uint64_t u;
+  } cx;
+
+  int ret = 0;
+
+  if (fabsk(x) < DBL_MIN) { x *= (1ULL << 63); ret = -63; }
+
+  cx.f = x;
+  ret += (int32_t)(((cx.u >> 52) & 0x7ff)) - 0x3fe;
+
+  if (x == 0 || xisnan(x) || xisinf(x)) ret = 0;
+
+  return ret;
+}
+
+static INLINE CONST double toward0(double d) {
+  return d == 0 ? 0 : longBitsToDouble(doubleToRawLongBits(d)-1);
+}
+
+static INLINE CONST double removelsb(double d) {
+  return longBitsToDouble(doubleToRawLongBits(d) & 0xfffffffffffffffeLL);
+}
+
+static INLINE CONST double ptrunc(double x) {
+  double fr = mla(-(double)(1LL << 31), (int32_t)(x * (1.0 / (1LL << 31))), x);
+  return fabsk(x) >= (double)(1LL << 52) ? x : (x - (fr - (int32_t)fr));
+}
+
+EXPORT CONST double xfmod(double x, double y) {
+  double nu = fabsk(x), de = fabsk(y), s = 1, q;
+  if (de < DBL_MIN) { nu *= 1ULL << 54; de *= 1ULL << 54; s = 1.0 / (1ULL << 54); }
+  Sleef_double2 r = dd(nu, 0);
+  double rde = toward0(1.0 / de);
+
+  for(int i=0;i < 21;i++) { // ceil(log2(DBL_MAX) / 51) + 1
+    q = (de+de > r.x && r.x >= de) ? 1 : (toward0(r.x) * rde);
+    r = ddnormalize_d2_d2(ddadd2_d2_d2_d2(r, ddmul_d2_d_d(removelsb(ptrunc(q)), -de)));
+    if (r.x < de) break;
+  }
+
+  double ret = r.x * s;
+  if (r.x + r.y == de) ret = 0;
+  ret = mulsign(ret, x);
+  if (nu < de) ret = x;
+  if (de == 0) ret = NAN;
+
+  return ret;
+}
+
+
+EXPORT CONST Sleef_double2 xmodf(double x) {
+  double fr = x - (double)(1LL << 31) * (int32_t)(x * (1.0 / (1LL << 31)));
+  fr = fr - (int32_t)fr;
+  fr = fabsk(x) >= (double)(1LL << 52) ? 0 : fr;
+  Sleef_double2 ret = { copysignk(fr, x), copysignk(x - fr, x) };
+  return ret;
+}
+
+typedef struct {
+  Sleef_double2 a, b;
+} dd2;
+
+static CONST dd2 gammak(double a) {
+  Sleef_double2 clc = dd(0, 0), clln = dd(1, 0), clld = dd(1, 0), v = dd(1, 0), x, y, z;
+  double t, u;
+
+  int otiny = fabsk(a) < 1e-306, oref = a < 0.5;
+
+  x = otiny ? dd(0, 0) : (oref ? ddadd2_d2_d_d(1, -a) : dd(a, 0));
+
+  int o0 = (0.5 <= x.x && x.x <= 1.1), o2 = 2.3 < x.x;
+
+  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 1), x));
+  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 2), y));
+  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 3), y));
+  y = ddnormalize_d2_d2(ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, 4), y));
+
+  clln = (o2 && x.x <= 7) ? y : clln;
+
+  x = (o2 && x.x <= 7) ? ddadd2_d2_d2_d(x, 5) : x;
+  t = o2 ? (1.0 / x.x) : ddnormalize_d2_d2(ddadd2_d2_d2_d(x, o0 ? -1 : -2)).x;
+
+  u = o2 ? -156.801412704022726379848862 : (o0 ? +0.2947916772827614196e+2 : +0.7074816000864609279e-7);
+  u = mla(u, t, o2 ? +1.120804464289911606838558160000 : (o0 ? +0.1281459691827820109e+3 : +0.4009244333008730443e-6));
+  u = mla(u, t, o2 ? +13.39798545514258921833306020000 : (o0 ? +0.2617544025784515043e+3 : +0.1040114641628246946e-5));
+  u = mla(u, t, o2 ? -0.116546276599463200848033357000 : (o0 ? +0.3287022855685790432e+3 : +0.1508349150733329167e-5));
+  u = mla(u, t, o2 ? -1.391801093265337481495562410000 : (o0 ? +0.2818145867730348186e+3 : +0.1288143074933901020e-5));
+  u = mla(u, t, o2 ? +0.015056113040026424412918973400 : (o0 ? +0.1728670414673559605e+3 : +0.4744167749884993937e-6));
+  u = mla(u, t, o2 ? +0.179540117061234856098844714000 : (o0 ? +0.7748735764030416817e+2 : -0.6554816306542489902e-7));
+  u = mla(u, t, o2 ? -0.002481743600264997730942489280 : (o0 ? +0.2512856643080930752e+2 : -0.3189252471452599844e-6));
+  u = mla(u, t, o2 ? -0.029527880945699120504851034100 : (o0 ? +0.5766792106140076868e+1 : +0.1358883821470355377e-6));
+  u = mla(u, t, o2 ? +0.000540164767892604515196325186 : (o0 ? +0.7270275473996180571e+0 : -0.4343931277157336040e-6));
+  u = mla(u, t, o2 ? +0.006403362833808069794787256200 : (o0 ? +0.8396709124579147809e-1 : +0.9724785897406779555e-6));
+  u = mla(u, t, o2 ? -0.000162516262783915816896611252 : (o0 ? -0.8211558669746804595e-1 : -0.2036886057225966011e-5));
+  u = mla(u, t, o2 ? -0.001914438498565477526465972390 : (o0 ? +0.6828831828341884458e-1 : +0.4373363141819725815e-5));
+  u = mla(u, t, o2 ? +7.20489541602001055898311517e-05 : (o0 ? -0.7712481339961671511e-1 : -0.9439951268304008677e-5));
+  u = mla(u, t, o2 ? +0.000839498720672087279971000786 : (o0 ? +0.8337492023017314957e-1 : +0.2050727030376389804e-4));
+  u = mla(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? -0.9094964931456242518e-1 : -0.4492620183431184018e-4));
+  u = mla(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.1000996313575929358e+0 : +0.9945751236071875931e-4));
+  u = mla(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.1113342861544207724e+0 : -0.2231547599034983196e-3));
+  u = mla(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1255096673213020875e+0 : +0.5096695247101967622e-3));
+  u = mla(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1440498967843054368e+0 : -0.1192753911667886971e-2));
+  u = mla(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1695571770041949811e+0 : +0.2890510330742210310e-2));
+  u = mla(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2073855510284092762e+0 : -0.7385551028674461858e-2));
+  u = mla(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ? +0.2705808084277815939e+0 : +0.2058080842778455335e-1));
+
+  y = ddmul_d2_d2_d2(ddadd2_d2_d2_d(x, -0.5), logk2(x));
+  y = ddadd2_d2_d2_d2(y, ddneg_d2_d2(x));
+  y = ddadd2_d2_d2_d2(y, dd(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)
+
+  z = ddadd2_d2_d2_d(ddmul_d2_d_d (u, t), o0 ? -0.4006856343865314862e+0 : -0.6735230105319810201e-1);
+  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? +0.8224670334241132030e+0 : +0.3224670334241132030e+0);
+  z = ddadd2_d2_d2_d(ddmul_d2_d2_d(z, t), o0 ? -0.5772156649015328655e+0 : +0.4227843350984671345e+0);
+  z = ddmul_d2_d2_d(z, t);
+
+  clc = o2 ? y : z;
+
+  clld = o2 ? ddadd2_d2_d2_d(ddmul_d2_d_d(u, t), 1) : clld;
+
+  y = clln;
+
+  clc = otiny ? dd(83.1776616671934334590333, 3.67103459631568507221878e-15) : // log(2^120)
+    (oref ? ddadd2_d2_d2_d2(dd(1.1447298858494001639, 1.026595116270782638e-17), ddneg_d2_d2(clc)) : clc); // log(M_PI)
+  clln = otiny ? dd(1, 0) : (oref ? clln : clld);
+
+  if (oref) x = ddmul_d2_d2_d2(clld, sinpik(a - (double)(1LL << 28) * (int32_t)(a * (1.0 / (1LL << 28)))));
+
+  clld = otiny ? dd(a*((1LL << 60)*(double)(1LL << 60)), 0) : (oref ? x : y);
+
+  dd2 ret = { clc, dddiv_d2_d2_d2(clln, clld) };
+
+  return ret;
+}
+
+EXPORT CONST double xtgamma_u1(double a) {
+  dd2 d = gammak(a);
+  Sleef_double2 y = ddmul_d2_d2_d2(expk2(d.a), d.b);
+  double r = y.x + y.y;
+  r = (a == -INFINITY || (a < 0 && xisint(a)) || (xisnumber(a) && a < 0 && xisnan(r))) ? NAN : r;
+  r = ((a == INFINITY || xisnumber(a)) && a >= -DBL_MIN && (a == 0 || a > 200 || xisnan(r))) ? mulsign(INFINITY, a) : r;
+  return r;
+}
+
+EXPORT CONST double xlgamma_u1(double a) {
+  dd2 d = gammak(a);
+  Sleef_double2 y = ddadd2_d2_d2_d2(d.a, logk2(ddabs_d2_d2(d.b)));
+  double r = y.x + y.y;
+  r = (xisinf(a) || (a <= 0 && xisint(a)) || (xisnumber(a) && xisnan(r))) ? INFINITY : r;
+  return r;
+}
+
+EXPORT CONST Sleef_double2 xlgamma_r_u1(double a) {
+  dd2 d = gammak(a);
+  Sleef_double2 y = ddadd2_d2_d2_d2(d.a, logk2(ddabs_d2_d2(d.b)));
+  double r = y.x + y.y;
+  r = (xisinf(a) || (a <= 0 && xisint(a)) || (xisnumber(a) && xisnan(r))) ? INFINITY : r;
+  Sleef_double2 ret;
+  ret.x = r;
+  ret.y = longBitsToDouble((doubleToRawLongBits(d.b.x) & (1L << 63)) | (0x3ff0000000000000L));
+  return ret;
+}
+
+
+EXPORT CONST double xerf_u1(double a) {
+  double s = a, t, u;
+  Sleef_double2 d;
+
+  a = fabsk(a);
+  int o0 = a < 1.0, o1 = a < 3.7, o2 = a < 6.0;
+  u = o0 ? (a*a) : a;
+
+  t = o0 ? +0.6801072401395392157e-20 : o1 ? +0.2830954522087717660e-13 : -0.5846750404269610493e-17;
+  t = mla(t, u, o0 ? -0.2161766247570056391e-18 : o1 ? -0.1509491946179481940e-11 : +0.6076691048812607898e-15);
+  t = mla(t, u, o0 ? +0.4695919173301598752e-17 : o1 ? +0.3827857177807173152e-10 : -0.3007518609604893831e-13);
+  t = mla(t, u, o0 ? -0.9049140419888010819e-16 : o1 ? -0.6139733921558987241e-09 : +0.9427906260824646063e-12);
+  t = mla(t, u, o0 ? +0.1634018903557411517e-14 : o1 ? +0.6985387934608038824e-08 : -0.2100110908269393629e-10);
+  t = mla(t, u, o0 ? -0.2783485786333455216e-13 : o1 ? -0.5988224513034371474e-07 : +0.3534639523461223473e-09);
+  t = mla(t, u, o0 ? +0.4463221276786412722e-12 : o1 ? +0.4005716952355346640e-06 : -0.4664967728285395926e-08);
+  t = mla(t, u, o0 ? -0.6711366622850138987e-11 : o1 ? -0.2132190104575784400e-05 : +0.4943823283769000532e-07);
+  t = mla(t, u, o0 ? +0.9422759050232658346e-10 : o1 ? +0.9092461304042630325e-05 : -0.4271203394761148254e-06);
+  t = mla(t, u, o0 ? -0.1229055530100228477e-08 : o1 ? -0.3079188080966205457e-04 : +0.3034067677404915895e-05);
+  t = mla(t, u, o0 ? +0.1480719281585085023e-07 : o1 ? +0.7971413443082370762e-04 : -0.1776295289066871135e-04);
+  t = mla(t, u, o0 ? -0.1636584469123402714e-06 : o1 ? -0.1387853215225442864e-03 : +0.8524547630559505050e-04);
+  t = mla(t, u, o0 ? +0.1646211436588923363e-05 : o1 ? +0.6469678026257590965e-04 : -0.3290582944961784398e-03);
+  t = mla(t, u, o0 ? -0.1492565035840624866e-04 : o1 ? +0.4996645280372945860e-03 : +0.9696966068789101157e-03);
+  t = mla(t, u, o0 ? +0.1205533298178966496e-03 : o1 ? -0.1622802482842520535e-02 : -0.1812527628046986137e-02);
+  t = mla(t, u, o0 ? -0.8548327023450851166e-03 : o1 ? +0.1615320557049377171e-03 : -0.4725409828123619017e-03);
+  t = mla(t, u, o0 ? +0.5223977625442188799e-02 : o1 ? +0.1915262325574875607e-01 : +0.2090315427924229266e-01);
+  t = mla(t, u, o0 ? -0.2686617064513125569e-01 : o1 ? -0.1027818298486033455e+00 : -0.1052041921842776645e+00);
+  t = mla(t, u, o0 ? +0.1128379167095512753e+00 : o1 ? -0.6366172819842503827e+00 : -0.6345351808766568347e+00);
+  t = mla(t, u, o0 ? -0.3761263890318375380e+00 : o1 ? -0.1128379590648910469e+01 : -0.1129442929103524396e+01);
+  d = ddmul_d2_d_d(t, u);
+  d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) :
+          o1 ? dd(3.4110644736196137587e-08, -2.4875650708323294246e-24) :
+          dd(0.00024963035690526438285, -5.4362665034856259795e-21));
+  d = o0 ? ddmul_d2_d2_d(d, a) : ddadd_d2_d_d2(1.0, ddneg_d2_d2(expk2(d)));
+  u = mulsign(o2 ? (d.x + d.y) : 1, s);
+  u = xisnan(a) ? NAN : u;
+  return u;
+}
+
+EXPORT CONST double xerfc_u15(double a) {
+  double s = a, r = 0, t;
+  Sleef_double2 u, d, x;
+  a = fabsk(a);
+  int o0 = a < 1.0, o1 = a < 2.2, o2 = a < 4.2, o3 = a < 27.3;
+  u = o0 ? ddmul_d2_d_d(a, a) : o1 ? dd(a, 0) : dddiv_d2_d2_d2(dd(1, 0), dd(a, 0));
+
+  t = o0 ? +0.6801072401395386139e-20 : o1 ? +0.3438010341362585303e-12 : o2 ? -0.5757819536420710449e+2 : +0.2334249729638701319e+5;
+  t = mla(t, u.x, o0 ? -0.2161766247570055669e-18 : o1 ? -0.1237021188160598264e-10 : o2 ? +0.4669289654498104483e+3 : -0.4695661044933107769e+5);
+  t = mla(t, u.x, o0 ? +0.4695919173301595670e-17 : o1 ? +0.2117985839877627852e-09 : o2 ? -0.1796329879461355858e+4 : +0.3173403108748643353e+5);
+  t = mla(t, u.x, o0 ? -0.9049140419888007122e-16 : o1 ? -0.2290560929177369506e-08 : o2 ? +0.4355892193699575728e+4 : +0.3242982786959573787e+4);
+  t = mla(t, u.x, o0 ? +0.1634018903557410728e-14 : o1 ? +0.1748931621698149538e-07 : o2 ? -0.7456258884965764992e+4 : -0.2014717999760347811e+5);
+  t = mla(t, u.x, o0 ? -0.2783485786333451745e-13 : o1 ? -0.9956602606623249195e-07 : o2 ? +0.9553977358167021521e+4 : +0.1554006970967118286e+5);
+  t = mla(t, u.x, o0 ? +0.4463221276786415752e-12 : o1 ? +0.4330010240640327080e-06 : o2 ? -0.9470019905444229153e+4 : -0.6150874190563554293e+4);
+  t = mla(t, u.x, o0 ? -0.6711366622850136563e-11 : o1 ? -0.1435050600991763331e-05 : o2 ? +0.7387344321849855078e+4 : +0.1240047765634815732e+4);
+  t = mla(t, u.x, o0 ? +0.9422759050232662223e-10 : o1 ? +0.3460139479650695662e-05 : o2 ? -0.4557713054166382790e+4 : -0.8210325475752699731e+2);
+  t = mla(t, u.x, o0 ? -0.1229055530100229098e-08 : o1 ? -0.4988908180632898173e-05 : o2 ? +0.2207866967354055305e+4 : +0.3242443880839930870e+2);
+  t = mla(t, u.x, o0 ? +0.1480719281585086512e-07 : o1 ? -0.1308775976326352012e-05 : o2 ? -0.8217975658621754746e+3 : -0.2923418863833160586e+2);
+  t = mla(t, u.x, o0 ? -0.1636584469123399803e-06 : o1 ? +0.2825086540850310103e-04 : o2 ? +0.2268659483507917400e+3 : +0.3457461732814383071e+0);
+  t = mla(t, u.x, o0 ? +0.1646211436588923575e-05 : o1 ? -0.6393913713069986071e-04 : o2 ? -0.4633361260318560682e+2 : +0.5489730155952392998e+1);
+  t = mla(t, u.x, o0 ? -0.1492565035840623511e-04 : o1 ? -0.2566436514695078926e-04 : o2 ? +0.9557380123733945965e+1 : +0.1559934132251294134e-2);
+  t = mla(t, u.x, o0 ? +0.1205533298178967851e-03 : o1 ? +0.5895792375659440364e-03 : o2 ? -0.2958429331939661289e+1 : -0.1541741566831520638e+1);
+  t = mla(t, u.x, o0 ? -0.8548327023450850081e-03 : o1 ? -0.1695715579163588598e-02 : o2 ? +0.1670329508092765480e+0 : +0.2823152230558364186e-5);
+  t = mla(t, u.x, o0 ? +0.5223977625442187932e-02 : o1 ? +0.2089116434918055149e-03 : o2 ? +0.6096615680115419211e+0 : +0.6249999184195342838e+0);
+  t = mla(t, u.x, o0 ? -0.2686617064513125222e-01 : o1 ? +0.1912855949584917753e-01 : o2 ? +0.1059212443193543585e-2 : +0.1741749416408701288e-8);
+
+  d = ddmul_d2_d2_d(u, t);
+  d = ddadd2_d2_d2_d2(d, o0 ? dd(0.11283791670955126141, -4.0175691625932118483e-18) :
+          o1 ? dd(-0.10277263343147646779, -6.2338714083404900225e-18) :
+          o2 ? dd(-0.50005180473999022439, 2.6362140569041995803e-17) :
+          dd(-0.5000000000258444377, -4.0074044712386992281e-17));
+  d = ddmul_d2_d2_d2(d, u);
+  d = ddadd2_d2_d2_d2(d, o0 ? dd(-0.37612638903183753802, 1.3391897206042552387e-17) :
+          o1 ? dd(-0.63661976742916359662, 7.6321019159085724662e-18) :
+          o2 ? dd(1.601106273924963368e-06, 1.1974001857764476775e-23) :
+          dd(2.3761973137523364792e-13, -1.1670076950531026582e-29));
+  d = ddmul_d2_d2_d2(d, u);
+  d = ddadd2_d2_d2_d2(d, o0 ? dd(1.1283791670955125586, 1.5335459613165822674e-17) :
+          o1 ? dd(-1.1283791674717296161, 8.0896847755965377194e-17) :
+          o2 ? dd(-0.57236496645145429341, 3.0704553245872027258e-17) :
+          dd(-0.57236494292470108114, -2.3984352208056898003e-17));
+
+  x = ddmul_d2_d2_d(o1 ? d : dd(-a, 0), a);
+  x = o1 ? x : ddadd2_d2_d2_d2(x, d);
+  x = o0 ? ddsub_d2_d2_d2(dd(1, 0), x) : expk2(x);
+  x = o1 ? x : ddmul_d2_d2_d2(x, u);
+
+  r = o3 ? (x.x + x.y) : 0;
+  if (s < 0) r = 2 - r;
+  r = xisnan(s) ? NAN : r;
+  return r;
+}
+
+#ifdef ENABLE_MAIN
+// gcc -w -DENABLE_MAIN -I../common sleefdp.c -lm
+#include <stdlib.h>
+int main(int argc, char **argv) {
+  double d1 = atof(argv[1]);
+  printf("arg1 = %.20g\n", d1);
+  //int i1 = atoi(argv[1]);
+  //double d2 = atof(argv[2]);
+  //printf("arg2 = %.20g\n", d2);
+  //printf("%d\n", (int)d2);
+#if 0
+  double d3 = atof(argv[3]);
+  printf("arg3 = %.20g\n", d3);
+#endif
+  //printf("%g\n", pow2i(i1));
+  //int exp = xexpfrexp(d1);
+  //double r = xnextafter(d1, d2);
+  //double r = xfma(d1, d2, d3);
+  printf("test = %.20g\n", xcos_u1(d1));
+  //printf("test = %.20g\n", xlog(d1));
+  //r = nextafter(d1, d2);
+  printf("corr = %.20g\n", cos(d1));
+  //printf("%.20g %.20g\n", xround(d1), xrint(d1));
+  //Sleef_double2 r = xsincospi_u35(d);
+  //printf("%g, %g\n", (double)r.x, (double)r.y);
+}
+#endif
diff --git a/lib/kernel/sleef/libm/sleefsimddp.c b/lib/kernel/sleef/libm/sleefsimddp.c
new file mode 100644
index 0000000..f04e484
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleefsimddp.c
@@ -0,0 +1,2551 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+// Always use -ffp-contract=off option to compile SLEEF.
+
+#include <stdint.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+
+#include "misc.h"
+
+#if (defined(_MSC_VER))
+#pragma fp_contract (off)
+#endif
+
+#include "helpers.h"
+#include "dd.h"
+
+static INLINE vopmask vnot_vo64_vo64(vopmask x) {
+  return vxor_vo_vo_vo(x, veq64_vo_vm_vm(vcast_vm_i_i(0, 0), vcast_vm_i_i(0, 0)));
+}
+
+static INLINE CONST vopmask vsignbit_vo_vd(vdouble d) {
+  return veq64_vo_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
+}
+
+// return d0 < d1 ? x : y
+static INLINE CONST vint vsel_vi_vd_vd_vi_vi(vdouble d0, vdouble d1, vint x, vint y) { return vsel_vi_vo_vi_vi(vcast_vo32_vo64(vlt_vo_vd_vd(d0, d1)), x, y); }
+
+// return d0 < 0 ? x : 0
+static INLINE CONST vint vsel_vi_vd_vi(vdouble d, vint x) { return vand_vi_vo_vi(vcast_vo32_vo64(vsignbit_vo_vd(d)), x); }
+
+static INLINE CONST vopmask visnegzero_vo_vd(vdouble d) {
+  return veq64_vo_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
+}
+
+static INLINE CONST vopmask visnumber_vo_vd(vdouble x) {
+  return vandnot_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, x));
+}
+
+static INLINE CONST vmask vsignbit_vm_vd(vdouble d) {
+  return vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0)));
+}
+
+static INLINE CONST vdouble vmulsign_vd_vd_vd(vdouble x, vdouble y) {
+  return vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(x), vsignbit_vm_vd(y)));
+}
+
+static INLINE CONST vdouble vcopysign_vd_vd_vd(vdouble x, vdouble y) {
+  return vreinterpret_vd_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(x)),
+            vand_vm_vm_vm   (vreinterpret_vm_vd(vcast_vd_d(-0.0)), vreinterpret_vm_vd(y))));
+}
+
+static INLINE CONST vdouble vsign_vd_vd(vdouble d) {
+  return vmulsign_vd_vd_vd(vcast_vd_d(1.0), d);
+}
+
+static INLINE CONST vdouble vpow2i_vd_vi(vint q) {
+  q = vadd_vi_vi_vi(vcast_vi_i(0x3ff), q);
+  vint2 r = vcastu_vi2_vi(q);
+  return vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));
+}
+
+static INLINE CONST vdouble vldexp_vd_vd_vi(vdouble x, vint q) {
+  vint m = vsra_vi_vi_i(q, 31);
+  m = vsll_vi_vi_i(vsub_vi_vi_vi(vsra_vi_vi_i(vadd_vi_vi_vi(m, q), 9), m), 7);
+  q = vsub_vi_vi_vi(q, vsll_vi_vi_i(m, 2));
+  m = vadd_vi_vi_vi(vcast_vi_i(0x3ff), m);
+  m = vandnot_vi_vo_vi(vgt_vo_vi_vi(vcast_vi_i(0), m), m);
+  m = vsel_vi_vo_vi_vi(vgt_vo_vi_vi(m, vcast_vi_i(0x7ff)), vcast_vi_i(0x7ff), m);
+  vint2 r = vcastu_vi2_vi(m);
+  vdouble y = vreinterpret_vd_vi2(vsll_vi2_vi2_i(r, 20));
+  return vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(x, y), y), y), y), vpow2i_vd_vi(q));
+}
+
+static INLINE CONST vdouble vldexp2_vd_vd_vi(vdouble d, vint e) {
+  return vmul_vd_vd_vd(vmul_vd_vd_vd(d, vpow2i_vd_vi(vsra_vi_vi_i(e, 1))), vpow2i_vd_vi(vsub_vi_vi_vi(e, vsra_vi_vi_i(e, 1))));
+}
+
+static INLINE CONST vdouble vldexp3_vd_vd_vi(vdouble d, vint q) {
+  return vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vd(d), vsll_vi2_vi2_i(vcastu_vi2_vi(q), 20)));
+}
+
+#ifndef ENABLE_AVX512F
+static INLINE CONST vint vilogbk_vi_vd(vdouble d) {
+  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(4.9090934652977266E-91));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(vcast_vd_d(2.037035976334486E90), d), d);
+  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));
+  q = vand_vi_vi_vi(q, vcast_vi_i(((1 << 12)-1) << 20));
+  q = vsrl_vi_vi_i(q, 20);
+  q = vsub_vi_vi_vi(q, vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vcast_vi_i(300 + 0x3ff), vcast_vi_i(0x3ff)));
+  return q;
+}
+
+static INLINE CONST vint vilogb2k_vi_vd(vdouble d) {
+  vint q = vcastu_vi_vi2(vreinterpret_vi2_vd(d));
+  q = vsrl_vi_vi_i(q, 20);
+  q = vand_vi_vi_vi(q, vcast_vi_i(0x7ff));
+  q = vsub_vi_vi_vi(q, vcast_vi_i(0x3ff));
+  return q;
+}
+#endif
+
+static INLINE CONST vopmask visint_vo_vd(vdouble d) {
+  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (1LL << 31))));
+  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(1LL << 31)), x, d);
+  return vor_vo_vo_vo(veq_vo_vd_vd(vtruncate_vd_vd(x), x),
+          vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1LL << 53)));
+}
+
+static INLINE CONST vopmask visodd_vo_vd(vdouble d) {
+  vdouble x = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0 / (1LL << 31))));
+  x = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(1LL << 31)), x, d);
+
+  return vand_vo_vo_vo(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vtruncate_vi_vd(x), vcast_vi_i(1)), vcast_vi_i(1))),
+           vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1LL << 53)));
+}
+
+EXPORT CONST vdouble xldexp(vdouble x, vint q) {
+  // TODO this is probably possible to do more elegantly
+  vint q_plus = vadd_vi_vi_vi(q, vcast_vi_i(10));
+  q = vsel_vi_vo_vi_vi(
+    veq_vo_vi_vi(q, vcast_vi_i(-2147483648)),
+    q_plus,
+    q);
+
+  vdouble res = vldexp_vd_vd_vi(x, q);
+  vdouble zero = vcast_vd_d(0.0);
+
+  res = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vabs_vd_vd(x), zero), x, res);
+  res = vsel_vd_vo_vd_vd(visinf_vo_vd(x), x, res);
+  res = vsel_vd_vo_vd_vd(visnan_vo_vd(x), x, res);
+
+  return res;
+}
+
+EXPORT CONST vint xilogb(vdouble d) {
+  vdouble e = vcast_vd_vi(vilogbk_vi_vd(vabs_vd_vd(d)));
+  e = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(FP_ILOGB0), e);
+  e = vsel_vd_vo_vd_vd(visnan_vo_vd(d), vcast_vd_d(FP_ILOGBNAN), e);
+  e = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(INT_MAX), e);
+  return vrint_vi_vd(e);
+}
+
+EXPORT CONST vdouble xsin(vdouble d) {
+  vdouble u, s, r = d;
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
+  dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+  vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));
+  vint ql = vrint_vi_vd(dql);
+
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C), d);
+  d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D), d);
+
+  s = vmul_vd_vd_vd(d, d);
+
+  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));
+
+  u = vcast_vd_d(-7.97255955009037868891952e-18);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);
+
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(r),
+          vor_vo_vo_vo(visnegzero_vo_vd(r),
+                 vgt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX)))),
+           vcast_vd_d(-0.0), u);
+
+  return u;
+}
+
+EXPORT CONST vdouble xsin_u1(vdouble d) {
+  vdouble u;
+  vdouble2 s, t, x;
+  vint ql;
+
+  if (vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)))) {
+    const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)));
+    ql = vrint_vi_vd(dql);
+    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2), d);
+    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2)));
+  } else {
+    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 24))));
+    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+    const vdouble dql = vrint_vd_vd(vmlapn_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), dqh));
+    ql = vrint_vi_vd(dql);
+
+    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A), d);
+    s = ddadd_vd2_vd_vd  (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C)));
+    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D)));
+  }
+
+  t = s;
+  s = ddsqu_vd2_vd2(s);
+
+  u = vcast_vd_d(2.72052416138529567917983e-15);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922));
+
+  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s));
+
+  u = ddmul_vd_vd2_vd2(t, x);
+
+  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1))),
+                   vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d), vor_vo_vo_vo(visnegzero_vo_vd(d),
+                      vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)))),
+           vcast_vd_d(-0.0), u);
+
+  return u;
+}
+
+EXPORT CONST vdouble xcos(vdouble d) {
+  vdouble u, s, r = d;
+  vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
+  vint ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
+              vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
+  dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+  ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
+  vdouble dql = vcast_vd_vi(ql);
+
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), d);
+  d = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), d);
+  d = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), d);
+  d = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), d);
+
+  s = vmul_vd_vd_vd(d, d);
+
+  d = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(d)));
+
+  u = vcast_vd_d(-7.97255955009037868891952e-18);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.81009972710863200091251e-15));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-7.64712219118158833288484e-13));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.60590430605664501629054e-10));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50521083763502045810755e-08));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573192239198747630416e-06));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698412696162806809));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333333332974823815));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666657414808));
+
+  u = vadd_vd_vd_vd(vmul_vd_vd_vd(s, vmul_vd_vd_vd(u, d)), d);
+
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(r), vgt_vo_vd_vd(vabs_vd_vd(r), vcast_vd_d(TRIGRANGEMAX))), vcast_vd_d(1), u);
+
+  return u;
+}
+
+EXPORT CONST vdouble xcos_u1(vdouble d) {
+  vdouble u;
+  vdouble2 s, t, x;
+  vint ql;
+
+  if (vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)))) {
+    vdouble dql = vrint_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI), vcast_vd_d(-0.5)));
+    dql = vmla_vd_vd_vd_vd(vcast_vd_d(2), dql, vcast_vd_d(1));
+    ql = vrint_vi_vd(dql);
+    s = ddadd2_vd2_vd_vd(d, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5)));
+    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
+  } else {
+    vdouble dqh = vtruncate_vd_vd(vmla_vd_vd_vd_vd(d, vcast_vd_d(M_1_PI / (1 << 23)), vcast_vd_d(-M_1_PI / (1 << 24))));
+    ql = vrint_vi_vd(vadd_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(M_1_PI)),
+          vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-(1 << 23)), vcast_vd_d(-0.5))));
+    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+    ql = vadd_vi_vi_vi(vadd_vi_vi_vi(ql, ql), vcast_vi_i(1));
+    const vdouble dql = vcast_vd_vi(ql);
+
+    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
+    s = ddadd2_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
+    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
+  }
+
+  t = s;
+  s = ddsqu_vd2_vd2(s);
+
+  u = vcast_vd_d(2.72052416138529567917983e-15);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-7.6429259411395447190023e-13));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(1.60589370117277896211623e-10));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.5052106814843123359368e-08));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573192104428224777379e-06));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698412046454654947));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333333318056201922));
+
+  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(-0.166666666666666657414808), vmul_vd_vd_vd(u, s.x)), s));
+
+  u = ddmul_vd_vd2_vd2(t, x);
+
+  u = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(0))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(u)));
+
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d), vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX))), vcast_vd_d(1), u);
+
+  return u;
+}
+
+#ifdef ENABLE_GNUABI
+#define TYPE2_FUNCATR static INLINE CONST
+#define TYPE6_FUNCATR static INLINE CONST
+#define XSINCOS sincosk
+#define XSINCOS_U1 sincosk_u1
+#define XSINCOSPI_U05 sincospik_u05
+#define XSINCOSPI_U35 sincospik_u35
+#define XMODF modfk
+#else
+#define TYPE2_FUNCATR EXPORT
+#define TYPE6_FUNCATR EXPORT CONST
+#define XSINCOS xsincos
+#define XSINCOS_U1 xsincos_u1
+#define XSINCOSPI_U05 xsincospi_u05
+#define XSINCOSPI_U35 xsincospi_u35
+#define XMODF xmodf
+#endif
+
+TYPE2_FUNCATR vdouble2 XSINCOS(vdouble d) {
+  vopmask o;
+  vdouble u, s, t, rx, ry;
+  vdouble2 r;
+
+  s = d;
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+  dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+  vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
+  vint ql = vrint_vi_vd(dql);
+
+  s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), s);
+  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), s);
+  s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), s);
+  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), s);
+  s = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), s);
+  s = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), s);
+  s = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), s);
+
+  t = s;
+
+  s = vmul_vd_vd_vd(s, s);
+
+  u = vcast_vd_d(1.58938307283228937328511e-10);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.50506943502539773349318e-08));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573131776846360512547e-06));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000198412698278911770864914));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0083333333333191845961746));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.166666666666666130709393));
+
+  rx = vmla_vd_vd_vd_vd(vmul_vd_vd_vd(u, s), t, t);
+  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);
+
+  u = vcast_vd_d(-1.13615350239097429531523e-11);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.08757471207040055479366e-09));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.75573144028847567498567e-07));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.48015872890001867311915e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.00138888888888714019282329));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665519592062));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.5));
+
+  ry = vmla_vd_vd_vd_vd(s, u, vcast_vd_d(1));
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
+  r.x = vsel_vd_vo_vd_vd(o, rx, ry);
+  r.y = vsel_vd_vo_vd_vd(o, ry, rx);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
+  r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
+  r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
+
+  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
+  r.x = vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vsel_vd_vo_vd_vd(o, vcast_vd_d(1), r.y);
+
+  o = visinf_vo_vd(d);
+  r.x = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.y)));
+
+  return r;
+}
+
+TYPE2_FUNCATR vdouble2 XSINCOS_U1(vdouble d) {
+  vopmask o;
+  vdouble u, rx, ry;
+  vdouble2 r, s, t, x;
+  vint ql;
+
+  if (vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)))) {
+    const vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
+    ql = vrint_vi_vd(dql);
+    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
+    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
+  } else {
+    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+    const vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
+    ql = vrint_vi_vd(dql);
+
+    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
+    s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5)));
+    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
+  }
+
+  t = s;
+
+  s.x = ddsqu_vd_vd2(s);
+
+  u = vcast_vd_d(1.58938307283228937328511e-10);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.50506943502539773349318e-08));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75573131776846360512547e-06));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.000198412698278911770864914));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0083333333333191845961746));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.166666666666666130709393));
+
+  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(s.x, t.x));
+
+  x = ddadd_vd2_vd2_vd(t, u);
+  rx = vadd_vd_vd_vd(x.x, x.y);
+
+  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);
+
+  u = vcast_vd_d(-1.13615350239097429531523e-11);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.08757471207040055479366e-09));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.75573144028847567498567e-07));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48015872890001867311915e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.00138888888888714019282329));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666666665519592062));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-0.5));
+
+  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(s.x, u));
+  ry = vadd_vd_vd_vd(x.x, x.y);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(0)));
+  r.x = vsel_vd_vo_vd_vd(o, rx, ry);
+  r.y = vsel_vd_vo_vd_vd(o, ry, rx);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(2)), vcast_vi_i(2)));
+  r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(2)), vcast_vi_i(2)));
+  r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
+
+  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX));
+  r.x = vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vsel_vd_vo_vd_vd(o, vcast_vd_d(1), r.y);
+
+  o = visinf_vo_vd(d);
+  r.x = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.y)));
+
+  return r;
+}
+
+TYPE2_FUNCATR vdouble2 XSINCOSPI_U05(vdouble d) {
+  vopmask o;
+  vdouble u, s, t, rx, ry;
+  vdouble2 r, x, s2;
+
+  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
+  vint q = vand_vi_vi_vi(vrint_vi_vd(vadd_vd_vd_vd(u, vcast_vd_d(0.5))), vcast_vi_i(~1));
+  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
+
+  t = s;
+  s = vmul_vd_vd_vd(s, s);
+  s2 = ddmul_vd2_vd_vd(t, t);
+
+  //
+
+  u = vcast_vd_d(-2.02461120785182399295868e-14);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(6.94821830580179461327784e-12));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-1.75724749952853179952664e-09));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.13361688966868392878422e-07));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.6576204182161551920361e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00249039457019271850274356));
+  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(-0.0807455121882807852484731, 3.61852475067037104849987e-18));
+  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(0.785398163397448278999491, 3.06287113727155002607105e-17));
+
+  x = ddmul_vd2_vd2_vd(x, t);
+  rx = vadd_vd_vd_vd(x.x, x.y);
+
+  rx = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), rx);
+
+  //
+
+  u = vcast_vd_d(9.94480387626843774090208e-16);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-3.89796226062932799164047e-13));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1.15011582539996035266901e-10));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-2.4611369501044697495359e-08));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(3.59086044859052754005062e-06));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000325991886927389905997954));
+  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s), vcast_vd2_d_d(0.0158543442438155018914259, -1.04693272280631521908845e-18));
+  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x), vcast_vd2_d_d(-0.308425137534042437259529, -1.95698492133633550338345e-17));
+
+  x = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x, s2), vcast_vd_d(1));
+  ry = vadd_vd_vd_vd(x.x, x.y);
+
+  //
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
+  r.x = vsel_vd_vo_vd_vd(o, rx, ry);
+  r.y = vsel_vd_vo_vd_vd(o, ry, rx);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
+  r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
+  r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
+
+  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
+  r.x = vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vsel_vd_vo_vd_vd(o, vcast_vd_d(1), r.y);
+
+  o = visinf_vo_vd(d);
+  r.x = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.y)));
+
+  return r;
+}
+
+TYPE2_FUNCATR vdouble2 XSINCOSPI_U35(vdouble d) {
+  vopmask o;
+  vdouble u, s, t, rx, ry;
+  vdouble2 r;
+
+  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
+  vint q = vtruncate_vi_vd(u);
+  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
+  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
+
+  t = s;
+  s = vmul_vd_vd_vd(s, s);
+
+  //
+
+  u = vcast_vd_d(+0.6880638894766060136e-11);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.1757159564542310199e-8));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3133616327257867311e-6));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3657620416388486452e-4));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2490394570189932103e-2));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.8074551218828056320e-1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.7853981633974482790e+0));
+
+  rx = vmul_vd_vd_vd(u, t);
+
+  //
+
+  u = vcast_vd_d(-0.3860141213683794352e-12);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1150057888029681415e-9));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.2461136493006663553e-7));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.3590860446623516713e-5));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3259918869269435942e-3));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1585434424381541169e-1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.3084251375340424373e+0));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(1));
+
+  ry = u;
+
+  //
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
+  r.x = vsel_vd_vo_vd_vd(o, rx, ry);
+  r.y = vsel_vd_vo_vd_vd(o, ry, rx);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
+  r.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.x)));
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
+  r.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(r.y)));
+
+  o = vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4));
+  r.x = vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(o, vreinterpret_vm_vd(r.y)));
+
+  o = visinf_vo_vd(d);
+  r.x = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.x)));
+  r.y = vreinterpret_vd_vm(vor_vm_vo64_vm(o, vreinterpret_vm_vd(r.y)));
+
+  return r;
+}
+
+TYPE6_FUNCATR vdouble2 XMODF(vdouble x) {
+  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(1LL << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31)))))));
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  fr = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(1LL << 52)), vcast_vd_d(0), fr);
+
+  vdouble2 ret;
+
+  ret.x = vcopysign_vd_vd_vd(fr, x);
+  ret.y = vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x);
+
+  return ret;
+}
+
+#ifdef ENABLE_GNUABI
+EXPORT void xsincos(vdouble a, double *ps, double *pc) {
+  vdouble2 r = sincosk(a);
+  vstoreu_v_p_vd(ps, r.x);
+  vstoreu_v_p_vd(pc, r.y);
+}
+
+EXPORT void xsincos_u1(vdouble a, double *ps, double *pc) {
+  vdouble2 r = sincosk_u1(a);
+  vstoreu_v_p_vd(ps, r.x);
+  vstoreu_v_p_vd(pc, r.y);
+}
+
+EXPORT void xsincospi_u05(vdouble a, double *ps, double *pc) {
+  vdouble2 r = sincospik_u05(a);
+  vstoreu_v_p_vd(ps, r.x);
+  vstoreu_v_p_vd(pc, r.y);
+}
+
+EXPORT void xsincospi_u35(vdouble a, double *ps, double *pc) {
+  vdouble2 r = sincospik_u35(a);
+  vstoreu_v_p_vd(ps, r.x);
+  vstoreu_v_p_vd(pc, r.y);
+}
+
+EXPORT CONST vdouble xmodf(vdouble a, double *iptr) {
+  vdouble2 r = modfk(a);
+  vstoreu_v_p_vd(iptr, r.y);
+  return r.x;
+}
+#endif // #ifdef ENABLE_GNUABI
+
+static INLINE CONST vdouble2 sinpik(vdouble d) {
+  vopmask o;
+  vdouble u, s, t;
+  vdouble2 x, s2;
+
+  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
+  vint q = vtruncate_vi_vd(u);
+  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2)));
+
+  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
+  t = s;
+  s = vmul_vd_vd_vd(s, s);
+  s2 = ddmul_vd2_vd_vd(t, t);
+
+  //
+
+  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
+  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s),
+      vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18,
+              -0.0807455121882807852484731, 3.61852475067037104849987e-18));
+  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x),
+       vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17,
+               0.785398163397448278999491, 3.06287113727155002607105e-17));
+
+  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
+  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(4)), vcast_vi_i(4)));
+  x.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x.x)));
+  x.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x.y)));
+
+  return x;
+}
+
+EXPORT CONST vdouble xsinpi_u05(vdouble d) {
+  vdouble2 x = sinpik(d);
+  vdouble r = vadd_vd_vd_vd(x.x, x.y);
+
+  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);
+  r = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vreinterpret_vm_vd(r)));
+  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));
+
+  return r;
+}
+
+static INLINE CONST vdouble2 cospik(vdouble d) {
+  vopmask o;
+  vdouble u, s, t;
+  vdouble2 x, s2;
+
+  u = vmul_vd_vd_vd(d, vcast_vd_d(4.0));
+  vint q = vtruncate_vi_vd(u);
+  q = vand_vi_vi_vi(vadd_vi_vi_vi(q, vxor_vi_vi_vi(vsrl_vi_vi_i(q, 31), vcast_vi_i(1))), vcast_vi_i(~1));
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(0)));
+
+  s = vsub_vd_vd_vd(u, vcast_vd_vi(q));
+  t = s;
+  s = vmul_vd_vd_vd(s, s);
+  s2 = ddmul_vd2_vd_vd(t, t);
+
+  //
+
+  u = vsel_vd_vo_d_d(o, 9.94480387626843774090208e-16, -2.02461120785182399295868e-14);
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -3.89796226062932799164047e-13, 6.948218305801794613277840e-12));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 1.150115825399960352669010e-10, -1.75724749952853179952664e-09));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -2.46113695010446974953590e-08, 3.133616889668683928784220e-07));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, 3.590860448590527540050620e-06, -3.65762041821615519203610e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vsel_vd_vo_d_d(o, -0.000325991886927389905997954, 0.0024903945701927185027435600));
+  x = ddadd2_vd2_vd_vd2(vmul_vd_vd_vd(u, s),
+      vsel_vd2_vo_d_d_d_d(o, 0.0158543442438155018914259, -1.04693272280631521908845e-18,
+              -0.0807455121882807852484731, 3.61852475067037104849987e-18));
+  x = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(s2, x),
+       vsel_vd2_vo_d_d_d_d(o, -0.308425137534042437259529, -1.95698492133633550338345e-17,
+               0.785398163397448278999491, 3.06287113727155002607105e-17));
+
+  x = ddmul_vd2_vd2_vd2(x, vsel_vd2_vo_vd2_vd2(o, s2, vcast_vd2_vd_vd(t, vcast_vd_d(0))));
+  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vadd_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(4)), vcast_vi_i(4)));
+  x.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x.x)));
+  x.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x.y)));
+
+  return x;
+}
+
+EXPORT CONST vdouble xcospi_u05(vdouble d) {
+  vdouble2 x = cospik(d);
+  vdouble r = vadd_vd_vd_vd(x.x, x.y);
+
+  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX3/4)), vcast_vd_d(1), r);
+  r = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(r)));
+
+  return r;
+}
+
+EXPORT CONST vdouble xtan(vdouble d) {
+  vdouble u, s, x;
+  vopmask o;
+  vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+  dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+  vdouble dql = vrint_vd_vd(vsub_vd_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI)), dqh));
+  vint ql = vrint_vi_vd(dql);
+
+  x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
+  x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A * 0.5), x);
+  x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_B * 0.5), x);
+  x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_B * 0.5), x);
+  x = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_C * 0.5), x);
+  x = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_C * 0.5), x);
+  x = vmla_vd_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D * 0.5), x);
+
+  s = vmul_vd_vd_vd(x, x);
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
+  x = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(x)));
+
+  u = vcast_vd_d(9.99583485362149960784268e-06);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-4.31184585467324750724175e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000103573238391744000389851));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-0.000137892809714281708733524));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000157624358465342784274554));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(-6.07500301486087879295969e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000148898734751616411290179));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000219040550724571513561967));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000595799595197098359744547));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00145461240472358871965441));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0035923150771440177410343));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00886321546662684547901456));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0218694899718446938985394));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0539682539049961967903002));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.133333333334818976423364));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.333333333333320047664472));
+
+  u = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(u, x), x);
+
+  u = vsel_vd_vo_vd_vd(o, vrec_vd_vd(u), u);
+
+#ifndef ENABLE_AVX512F
+  u = vreinterpret_vd_vm(vor_vm_vo64_vm(visinf_vo_vd(d), vreinterpret_vm_vd(u)));
+#else
+  u = vfixup_vd_vd_vd_vi2_i(u, d, vcast_vi2_i((3 << (4*4)) | (3 << (5*4))), 0);
+#endif
+  u = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), u);
+
+  return u;
+}
+
+EXPORT CONST vdouble xtan_u1(vdouble d) {
+  vdouble u;
+  vdouble2 s, t, x;
+  vopmask o;
+  vint ql;
+
+  if (vtestallones_i_vo64(vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX2)))) {
+    vdouble dql = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2 * M_1_PI)));
+    ql = vrint_vi_vd(dql);
+    u = vmla_vd_vd_vd_vd(dql, vcast_vd_d(-PI_A2*0.5), d);
+    s = ddadd_vd2_vd_vd (u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B2*0.5)));
+  } else {
+    vdouble dqh = vtruncate_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(2*M_1_PI / (1 << 24))));
+    dqh = vmul_vd_vd_vd(dqh, vcast_vd_d(1 << 24));
+    s = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(vcast_vd2_d_d(M_2_PI_H, M_2_PI_L), d),
+        vsub_vd_vd_vd(vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)),
+               vcast_vd_d(-0.5), vcast_vd_d(0.5)), dqh));
+    const vdouble dql = vtruncate_vd_vd(vadd_vd_vd_vd(s.x, s.y));
+    ql = vrint_vi_vd(dql);
+
+    u = vmla_vd_vd_vd_vd(dqh, vcast_vd_d(-PI_A * 0.5), d);
+    s = ddadd_vd2_vd_vd(u, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_A*0.5            )));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_B*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_B*0.5            )));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dqh, vcast_vd_d(-PI_C*0.5)));
+    s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dql, vcast_vd_d(-PI_C*0.5            )));
+    s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vadd_vd_vd_vd(dqh, dql), vcast_vd_d(-PI_D*0.5)));
+  }
+
+  o = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(ql, vcast_vi_i(1)), vcast_vi_i(1)));
+  vmask n = vand_vm_vo64_vm(o, vreinterpret_vm_vd(vcast_vd_d(-0.0)));
+  s.x = vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(s.x), n));
+  s.y = vreinterpret_vd_vm(vxor_vm_vm_vm(vreinterpret_vm_vd(s.y), n));
+
+  t = s;
+  s = ddsqu_vd2_vd2(s);
+
+  u = vcast_vd_d(1.01419718511083373224408e-05);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-2.59519791585924697698614e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(5.23388081915899855325186e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(-3.05033014433946488225616e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(7.14707504084242744267497e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(8.09674518280159187045078e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000244884931879331847054404));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000588505168743587154904506));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00145612788922812427978848));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00359208743836906619142924));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00886323944362401618113356));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0218694882853846389592078));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0539682539781298417636002));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.133333333333125941821962));
+
+  x = ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd2_vd2(ddadd_vd2_vd_vd(vcast_vd_d(0.333333333333334980164153), vmul_vd_vd_vd(u, s.x)), s));
+  x = ddmul_vd2_vd2_vd2(t, x);
+
+  x = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd2(x), x);
+
+  u = vadd_vd_vd_vd(x.x, x.y);
+
+  u = vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(visinf_vo_vd(d),
+          vor_vo_vo_vo(visnegzero_vo_vd(d),
+                 vgt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(TRIGRANGEMAX)))),
+           vcast_vd_d(-0.0), u);
+
+  return u;
+}
+
+static INLINE CONST vdouble atan2k(vdouble y, vdouble x) {
+  vdouble s, t, u;
+  vint q;
+  vopmask p;
+
+  q = vsel_vi_vd_vi(x, vcast_vi_i(-2));
+  x = vabs_vd_vd(x);
+
+  q = vsel_vi_vd_vd_vi_vi(x, y, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
+  p = vlt_vo_vd_vd(x, y);
+  s = vsel_vd_vo_vd_vd(p, vneg_vd_vd(x), y);
+  t = vmax_vd_vd_vd(x, y);
+
+  s = vdiv_vd_vd_vd(s, t);
+  t = vmul_vd_vd_vd(s, s);
+
+  u = vcast_vd_d(-1.88796008463073496563746e-05);
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
+  t = vmla_vd_vd_vd_vd(vcast_vd_vi(q), vcast_vd_d(M_PI/2), t);
+
+  return t;
+}
+
+static INLINE CONST vdouble2 atan2k_u1(vdouble2 y, vdouble2 x) {
+  vdouble u;
+  vdouble2 s, t;
+  vint q;
+  vopmask p;
+
+  q = vsel_vi_vd_vi(x.x, vcast_vi_i(-2));
+  p = vlt_vo_vd_vd(x.x, vcast_vd_d(0));
+  vmask b = vand_vm_vo64_vm(p, vreinterpret_vm_vd(vcast_vd_d(-0.0)));
+  x.x = vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(x.x)));
+  x.y = vreinterpret_vd_vm(vxor_vm_vm_vm(b, vreinterpret_vm_vd(x.y)));
+
+  q = vsel_vi_vd_vd_vi_vi(x.x, y.x, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
+  p = vlt_vo_vd_vd(x.x, y.x);
+  s = vsel_vd2_vo_vd2_vd2(p, ddneg_vd2_vd2(x), y);
+  t = vsel_vd2_vo_vd2_vd2(p, y, x);
+
+  s = dddiv_vd2_vd2_vd2(s, t);
+  t = ddsqu_vd2_vd2(s);
+  t = ddnormalize_vd2_vd2(t);
+
+  u = vcast_vd_d(1.06298484191448746607415e-05);
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.000125620649967286867384336));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00070557664296393412389774));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.00251865614498713360352999));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.00646262899036991172313504));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0128281333663399031014274));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0208024799924145797902497));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0289002344784740315686289));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0359785005035104590853656));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.041848579703592507506027));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0470843011653283988193763));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0524914210588448421068719));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0587946590969581003860434));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0666620884778795497194182));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.0769225330296203768654095));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.0909090442773387574781907));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.111111108376896236538123));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.142857142756268568062339));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(0.199999999997977351284817));
+  u = vmla_vd_vd_vd_vd(u, t.x, vcast_vd_d(-0.333333333333317605173818));
+
+  t = ddmul_vd2_vd2_vd(t, u);
+  t = ddmul_vd2_vd2_vd2(s, ddadd_vd2_vd_vd2(vcast_vd_d(1), t));
+  t = ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(vcast_vd2_d_d(1.570796326794896557998982, 6.12323399573676603586882e-17), vcast_vd_vi(q)), t);
+
+  return t;
+}
+
+static INLINE CONST vdouble visinf2_vd_vd_vd(vdouble d, vdouble m) {
+  return vreinterpret_vd_vm(vand_vm_vo64_vm(visinf_vo_vd(d), vor_vm_vm_vm(vand_vm_vm_vm(vreinterpret_vm_vd(d), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(m))));
+}
+
+EXPORT CONST vdouble xatan2(vdouble y, vdouble x) {
+  vdouble r = atan2k(vabs_vd_vd(y), x);
+
+  r = vmulsign_vd_vd_vd(r, x);
+  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);
+  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);
+  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);
+
+  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));
+  return r;
+}
+
+EXPORT CONST vdouble xatan2_u1(vdouble y, vdouble x) {
+  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(5.5626846462680083984e-309)); // nexttoward((1.0 / DBL_MAX), 1)
+  x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(1ULL << 53)), x);
+  y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(1ULL << 53)), y);
+
+  vdouble2 d = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(y), vcast_vd_d(0)), vcast_vd2_vd_vd(x, vcast_vd_d(0)));
+  vdouble r = vadd_vd_vd_vd(d.x, d.y);
+
+  r = vmulsign_vd_vd_vd(r, x);
+  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/2), x))), r);
+  r = vsel_vd_vo_vd_vd(visinf_vo_vd(y), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), visinf2_vd_vd_vd(x, vmulsign_vd_vd_vd(vcast_vd_d(M_PI/4), x))), r);
+  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(0.0)), vreinterpret_vd_vm(vand_vm_vo64_vm(vsignbit_vo_vd(x), vreinterpret_vm_vd(vcast_vd_d(M_PI)))), r);
+
+  r = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(vmulsign_vd_vd_vd(r, y))));
+  return r;
+}
+
+EXPORT CONST vdouble xasin(vdouble d) {
+  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
+  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5)));
+  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2)), u;
+
+  u = vcast_vd_d(+0.3161587650653934628e-1);
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0));
+  u = vmla_vd_vd_vd_vd(u, vmul_vd_vd_vd(x, x2), x);
+
+  vdouble r = vsel_vd_vo_vd_vd(o, u, vmla_vd_vd_vd_vd(u, vcast_vd_d(-2), vcast_vd_d(M_PI/2)));
+  return vmulsign_vd_vd_vd(r, d);
+}
+
+EXPORT CONST vdouble xasin_u1(vdouble d) {
+  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
+  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
+  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));
+  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x);
+
+  u = vcast_vd_d(+0.3161587650653934628e-1);
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0));
+  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x.x));
+
+  vdouble2 y = ddsub_vd2_vd2_vd(ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/4, 1.2246467991473532072e-16/4), x), u);
+
+  vdouble r = vsel_vd_vo_vd_vd(o, vadd_vd_vd_vd(u, x.x),
+             vmul_vd_vd_vd(vadd_vd_vd_vd(y.x, y.y), vcast_vd_d(2)));
+  return vmulsign_vd_vd_vd(r, d);
+}
+
+EXPORT CONST vdouble xacos(vdouble d) {
+  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
+  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d),
+                                vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
+  vdouble x = vsel_vd_vo_vd_vd(o, vabs_vd_vd(d), vsqrt_vd_vd(x2));
+  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd_d(0), x);
+
+  u = vcast_vd_d(+0.3161587650653934628e-1);
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0));
+  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x));
+
+  vdouble y = vsub_vd_vd_vd(vcast_vd_d(M_PI/2), vadd_vd_vd_vd(vmulsign_vd_vd_vd(x, d), vmulsign_vd_vd_vd(u, d)));
+  x = vadd_vd_vd_vd(x, u);
+  vdouble r = vsel_vd_vo_vd_vd(o, y, vmul_vd_vd_vd(x, vcast_vd_d(2)));
+  return vsel_vd_vo_vd_vd(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),
+                          ddadd_vd2_vd2_vd(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16),
+                                           vneg_vd_vd(r)).x, r);
+}
+
+EXPORT CONST vdouble xacos_u1(vdouble d) {
+  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(0.5));
+  vdouble x2 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, d), vmul_vd_vd_vd(vsub_vd_vd_vd(vcast_vd_d(1), vabs_vd_vd(d)), vcast_vd_d(0.5))), u;
+  vdouble2 x = vsel_vd2_vo_vd2_vd2(o, vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), ddsqrt_vd2_vd(x2));
+  x = vsel_vd2_vo_vd2_vd2(veq_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1.0)), vcast_vd2_d_d(0, 0), x);
+
+  u = vcast_vd_d(+0.3161587650653934628e-1);
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(-0.1581918243329996643e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1929045477267910674e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.6606077476277170610e-2));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1215360525577377331e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1388715184501609218e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1735956991223614604e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.2237176181932048341e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.3038195928038132237e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.4464285681377102438e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.7500000000378581611e-1));
+  u = vmla_vd_vd_vd_vd(u, x2, vcast_vd_d(+0.1666666666666497543e+0));
+  u = vmul_vd_vd_vd(u, vmul_vd_vd_vd(x2, x.x));
+
+  vdouble2 y = ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116/2, 1.2246467991473532072e-16/2),
+                                 ddadd_vd2_vd_vd(vmulsign_vd_vd_vd(x.x, d), vmulsign_vd_vd_vd(u, d)));
+  x = ddadd_vd2_vd2_vd(x, u);
+
+  y = vsel_vd2_vo_vd2_vd2(o, y, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
+
+  y = vsel_vd2_vo_vd2_vd2(vandnot_vo_vo_vo(o, vlt_vo_vd_vd(d, vcast_vd_d(0))),
+                          ddsub_vd2_vd2_vd2(vcast_vd2_d_d(3.141592653589793116, 1.2246467991473532072e-16), y), y);
+
+  return vadd_vd_vd_vd(y.x, y.y);
+}
+
+EXPORT CONST vdouble xatan_u1(vdouble d) {
+  vdouble2 d2 = atan2k_u1(vcast_vd2_vd_vd(vabs_vd_vd(d), vcast_vd_d(0)), vcast_vd2_d_d(1, 0));
+  vdouble r = vadd_vd_vd_vd(d2.x, d2.y);
+  r = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vcast_vd_d(1.570796326794896557998982), r);
+  return vmulsign_vd_vd_vd(r, d);
+}
+
+EXPORT CONST vdouble xatan(vdouble s) {
+  vdouble t, u;
+  vint q;
+
+  q = vsel_vi_vd_vi(s, vcast_vi_i(2));
+  s = vabs_vd_vd(s);
+
+  q = vsel_vi_vd_vd_vi_vi(vcast_vd_d(1), s, vadd_vi_vi_vi(q, vcast_vi_i(1)), q);
+  s = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vcast_vd_d(1), s), vrec_vd_vd(s), s);
+
+  t = vmul_vd_vd_vd(s, s);
+
+  u = vcast_vd_d(-1.88796008463073496563746e-05);
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.000209850076645816976906797));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00110611831486672482563471));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.00370026744188713119232403));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.00889896195887655491740809));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.016599329773529201970117));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0254517624932312641616861));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0337852580001353069993897));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0407629191276836500001934));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0466667150077840625632675));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0523674852303482457616113));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0587666392926673580854313));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.0666573579361080525984562));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.0769219538311769618355029));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.090908995008245008229153));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.111111105648261418443745));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.14285714266771329383765));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(0.199999999996591265594148));
+  u = vmla_vd_vd_vd_vd(u, t, vcast_vd_d(-0.333333333333311110369124));
+
+  t = vmla_vd_vd_vd_vd(s, vmul_vd_vd_vd(t, u), s);
+
+  t = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(1)), vcast_vi_i(1))), vsub_vd_vd_vd(vcast_vd_d(M_PI/2), t), t);
+  t = vreinterpret_vd_vm(vxor_vm_vm_vm(vand_vm_vo64_vm(vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(q, vcast_vi_i(2)), vcast_vi_i(2))), vreinterpret_vm_vd(vcast_vd_d(-0.0))), vreinterpret_vm_vd(t)));
+
+  return t;
+}
+
+EXPORT CONST vdouble xlog(vdouble d) {
+  vdouble x, x2;
+  vdouble t, m;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(1LL << 32) * (double)(1LL << 32))), d);
+  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
+  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
+#else
+  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
+  m = vgetmant_vd_vd(d);
+#endif
+
+  x = vdiv_vd_vd_vd(vadd_vd_vd_vd(vcast_vd_d(-1), m), vadd_vd_vd_vd(vcast_vd_d(1), m));
+  x2 = vmul_vd_vd_vd(x, x);
+
+  t = vcast_vd_d(0.153487338491425068243146);
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.152519917006351951593857));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.181863266251982985677316));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.222221366518767365905163));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.285714294746548025383248));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.399999999950799600689777));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.6666666666667778740063));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(2));
+
+#ifndef ENABLE_AVX512F
+  x = vmla_vd_vd_vd_vd(x, t, vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_vi(e)));
+
+  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(INFINITY), x);
+  x = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(NAN), x);
+  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), x);
+#else
+  x = vmla_vd_vd_vd_vd(x, t, vmul_vd_vd_vd(vcast_vd_d(0.693147180559945286226764), e));
+  x = vfixup_vd_vd_vd_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);
+#endif
+
+  return x;
+}
+
+EXPORT CONST vdouble xexp(vdouble d) {
+  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(R_LN2))), s;
+  vint q = vrint_vi_vd(u);
+
+  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2U), d);
+  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L2L), s);
+
+#ifdef ENABLE_FMA_DP
+  u = vcast_vd_d(+0.2081276378237164457e-8);
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2511210703042288022e-7));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2755762628169491192e-6));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2755723402025388239e-5));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2480158687479686264e-4));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1984126989855865850e-3));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1388888888914497797e-2));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.8333333333314938210e-2));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.4166666666666602598e-1));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1666666666666669072e+0));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5000000000000000000e+0));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1000000000000000000e+1));
+#else
+  u = vcast_vd_d(2.08860621107283687536341e-09);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.51112930892876518610661e-08));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75573911234900471893338e-07));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.75572362911928827629423e-06));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(2.4801587159235472998791e-05));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.000198412698960509205564975));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00138888888889774492207962));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.00833333333331652721664984));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.0416666666666665047591422));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.166666666666666851703837));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(0.5));
+
+  u = vadd_vd_vd_vd(vcast_vd_d(1), vmla_vd_vd_vd_vd(vmul_vd_vd_vd(s, s), u, s));
+#endif
+
+  u = vldexp2_vd_vd_vi(u, q);
+
+  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(INFINITY), u);
+  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-1000)), vreinterpret_vm_vd(u)));
+
+  return u;
+}
+
+static INLINE CONST vdouble2 logk(vdouble d) {
+  vdouble2 x, x2, s;
+  vdouble t, m;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(1LL << 32) * (double)(1LL << 32))), d);
+  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
+  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
+#else
+  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
+  m = vgetmant_vd_vd(d);
+#endif
+
+  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
+  x2 = ddsqu_vd2_vd2(x);
+
+  t = vcast_vd_d(0.116255524079935043668677);
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.103239680901072952701192));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.117754809412463995466069));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.13332981086846273921509));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153846227114512262845736));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181818180850050775676507));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.222222222230083560345903));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285714249172087875));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000000077715612));
+  vdouble2 c = vcast_vd2_d_d(0.666666666666666629659233, 3.80554962542412056336616e-17);
+
+#ifndef ENABLE_AVX512F
+  s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));
+#else
+  s = ddmul_vd2_vd2_vd(vcast_vd2_vd_vd(vcast_vd_d(0.693147180559945286226764), vcast_vd_d(2.319046813846299558417771e-17)), e);
+#endif
+
+  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
+  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(ddmul_vd2_vd2_vd2(x2, x),
+                                            ddadd2_vd2_vd2_vd2(ddmul_vd2_vd2_vd(x2, t), c)));
+  return s;
+}
+
+EXPORT CONST vdouble xlog_u1(vdouble d) {
+  vdouble2 x;
+  vdouble t, m, x2;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(1LL << 32) * (double)(1LL << 32))), d);
+  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
+  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
+#else
+  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
+  m = vgetmant_vd_vd(d);
+#endif
+
+  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
+  x2 = vmul_vd_vd_vd(x.x, x.x);
+
+  t = vcast_vd_d(0.1532076988502701353e+0);
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1525629051003428716e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1818605932937785996e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2222214519839380009e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2857142932794299317e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.3999999999635251990e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.6666666666667333541e+0));
+
+#ifndef ENABLE_AVX512F
+  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));
+#else
+  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
+#endif
+
+  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
+  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, x.x), t));
+
+  vdouble r = vadd_vd_vd_vd(s.x, s.y);
+
+#ifndef ENABLE_AVX512F
+  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(INFINITY), r);
+  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(NAN), r);
+  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), r);
+#else
+  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
+#endif
+
+  return r;
+}
+
+static INLINE CONST vdouble expk(vdouble2 d) {
+  vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(R_LN2));
+  vdouble dq = vrint_vd_vd(u);
+  vint q = vrint_vi_vd(dq);
+  vdouble2 s, t;
+
+  s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U)));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L)));
+
+  s = ddnormalize_vd2_vd2(s);
+
+  u = vcast_vd_d(2.51069683420950419527139e-08);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.76286166770270649116855e-07));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.75572496725023574143864e-06));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(2.48014973989819794114153e-05));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.000198412698809069797676111));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0013888888939977128960529));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.00833333333332371417601081));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.0416666666665409524128449));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.166666666666666740681535));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(0.500000000000000999200722));
+
+  t = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddsqu_vd2_vd2(s), u));
+
+  t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t);
+  u = vadd_vd_vd_vd(t.x, t.y);
+  u = vldexp2_vd_vd_vi(u, q);
+
+  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d.x, vcast_vd_d(-1000)), vreinterpret_vm_vd(u)));
+
+  return u;
+}
+
+EXPORT CONST vdouble xpow(vdouble x, vdouble y) {
+#if 1
+  vopmask yisint = visint_vo_vd(y);
+  vopmask yisodd = vand_vo_vo_vo(visodd_vo_vd(y), yisint);
+
+  vdouble2 d = ddmul_vd2_vd2_vd(logk(vabs_vd_vd(x)), y);
+  vdouble result = expk(d);
+  result = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d.x, vcast_vd_d(709.78271114955742909217217426)), vcast_vd_d(INFINITY), result);
+
+  result = vmul_vd_vd_vd(result,
+       vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, vcast_vd_d(0)),
+            vcast_vd_d(1),
+            vsel_vd_vo_vd_vd(yisint, vsel_vd_vo_vd_vd(yisodd, vcast_vd_d(-1.0), vcast_vd_d(1)), vcast_vd_d(NAN))));
+
+  vdouble efx = vmulsign_vd_vd_vd(vsub_vd_vd_vd(vabs_vd_vd(x), vcast_vd_d(1)), y);
+
+  result = vsel_vd_vo_vd_vd(visinf_vo_vd(y),
+          vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(efx, vcast_vd_d(0.0)),
+                  vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(efx, vcast_vd_d(0.0)),
+                              vcast_vd_d(1.0),
+                              vcast_vd_d(INFINITY))))),
+          result);
+
+  result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), veq_vo_vd_vd(x, vcast_vd_d(0.0))),
+          vmul_vd_vd_vd(vsel_vd_vo_vd_vd(yisodd, vsign_vd_vd(x), vcast_vd_d(1.0)),
+            vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0.0)), vneg_vd_vd(y), y), vcast_vd_d(0.0)),
+                    vreinterpret_vm_vd(vcast_vd_d(INFINITY))))),
+          result);
+
+  result = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(result)));
+
+  result = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(y, vcast_vd_d(0)), veq_vo_vd_vd(x, vcast_vd_d(1))), vcast_vd_d(1), result);
+
+  return result;
+#else
+  return expk(ddmul_vd2_vd2_vd(logk(x), y));
+#endif
+}
+
+EXPORT CONST vdouble xpown(vdouble x, vint y) {
+    vdouble res = xpow(x, vcast_vd_vi(y));
+
+    vint is_odd = vand_vi_vi_vi(y, vcast_vi_i(1));
+
+    vopmask is_odd_o =
+      vgt_cvt_vo_vi_vi(is_odd, vcast_vi_i(0));
+
+    vopmask is_lt0_o = vlt_vo_vd_vd(x, vcast_vd_d(0.0));
+
+    res = vabs_vd_vd(res);
+    vdouble neg = vneg_vd_vd(res);
+
+    res = vsel_vd_vo_vd_vd(
+              vand_vo_vo_vo(is_lt0_o, is_odd_o),
+              neg,
+              res);
+
+    //pown ( ±0, n ) is ±∞ for odd n < 0.
+    //pown ( ±0, n ) is +∞ for even n < 0.
+    //pown ( ±0, n ) is +0 for even n > 0.
+    //pown ( ±0, n ) is ±0 for odd n > 0.
+
+    vdouble xiszero = vsel_vd_vo_vd_vd(
+                  vgt_cvt_vo_vi_vi(y, vcast_vi_i(0)),
+                  vcast_vd_d(0.0),
+                  vcast_vd_d(INFINITY));
+
+    vdouble with_sig = vcopysign_vd_vd_vd(xiszero, x);
+
+    xiszero = vsel_vd_vo_vd_vd(is_odd_o, with_sig, xiszero);
+
+    res = vsel_vd_vo_vd_vd(
+            veq_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(0.0)),
+            xiszero,
+            res);
+
+    // pown ( x, 0 ) is 1 for any x
+    res = vsel_vd_vo_vd_vd(
+            veq_cvt_vo_vi_vi(y, vcast_vi_i(0)),
+            vcast_vd_d(1.0),
+            res);
+
+    res = vsel_vd_vo_vd_vd(
+            veq_cvt_vo_vi_vi(y, vcast_vi_i(1)),
+            x,
+            res);
+
+    return res;
+}
+
+
+EXPORT CONST vdouble xpowr(vdouble x, vdouble y) {
+    vdouble res = xpow(x, y);
+
+    vdouble ax = vabs_vd_vd(x);
+    vdouble ay = vabs_vd_vd(y);
+    vdouble zeroes = vcast_vd_d(0.0);
+
+    //powr ( ±0, y ) is +0 for y > 0.
+    //powr ( ±0, y ) is +∞ for finite y < 0.
+    vdouble r_Xzero = vsel_vd_vo_vd_vd(
+                       vlt_vo_vd_vd(y, zeroes),
+                       vcast_vd_d(INFINITY),
+                       zeroes);
+
+    //powr ( ±0, -∞) is +∞.
+    r_Xzero = vsel_vd_vo_vd_vd(
+                veq_vo_vd_vd(y, vcast_vd_d(-INFINITY)),
+                vcast_vd_d(INFINITY),
+                r_Xzero);
+
+    res = vsel_vd_vo_vd_vd(
+            veq_vo_vd_vd(ax, zeroes),
+            r_Xzero,
+            res);
+
+    //powr ( ±0, ±0 ) returns NaN.
+    vdouble r_Yzero = vsel_vd_vo_vd_vd(
+                        veq_vo_vd_vd(ax, zeroes),
+                        vcast_vd_d(NAN),
+                        zeroes);
+    //powr ( x, ±0 ) is 1 for finite x > 0.
+    r_Yzero = vsel_vd_vo_vd_vd(
+                vgt_vo_vd_vd(x, zeroes),
+                vcast_vd_d(1.0),
+                r_Yzero);
+
+    //powr ( +∞, ±0 ) returns NaN.
+    r_Yzero = vsel_vd_vo_vd_vd(
+                veq_vo_vd_vd(x, vcast_vd_d(INFINITY)),
+                vcast_vd_d(NAN),
+                r_Yzero);
+
+    res = vsel_vd_vo_vd_vd(
+            veq_vo_vd_vd(ay, zeroes),
+            r_Yzero,
+            res);
+
+
+    //powr ( +1, y ) is 1 for finite y.
+    //powr ( +1, ±∞ ) returns NaN.
+    vdouble r_Xone = vsel_vd_vo_vd_vd(
+                      veq_vo_vd_vd(ay, vcast_vd_d(INFINITY)),
+                      vcast_vd_d(NAN),
+                      vcast_vd_d(1.0));
+
+    res = vsel_vd_vo_vd_vd(
+            veq_vo_vd_vd(ax, vcast_vd_d(1.0)),
+            r_Xone,
+            res);
+
+    // powr(x, 1) is x
+    res = vsel_vd_vo_vd_vd(
+            veq_vo_vd_vd(y, vcast_vd_d(1.0)),
+            x,
+            res);
+
+    //powr ( x, y ) returns NaN for x < 0.
+    res = vsel_vd_vo_vd_vd(
+            vlt_vo_vd_vd(x, zeroes),
+            vcast_vd_d(NAN),
+            res);
+
+    //powr ( NaN, y ) returns the NaN
+    res = vsel_vd_vo_vd_vd(
+            visnan_vo_vd(x),
+            x,
+            res);
+
+    //powr ( x, NaN ) returns the NaN for x >= 0.
+    res = vsel_vd_vo_vd_vd(
+            visnan_vo_vd(y),
+            y,
+            res);
+    return res;
+}
+
+
+static INLINE CONST vdouble2 expk2(vdouble2 d) {
+  vdouble u = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(R_LN2));
+  vdouble dq = vrint_vd_vd(u);
+  vint q = vrint_vi_vd(dq);
+  vdouble2 s, t;
+
+  s = ddadd2_vd2_vd2_vd(d, vmul_vd_vd_vd(dq, vcast_vd_d(-L2U)));
+  s = ddadd2_vd2_vd2_vd(s, vmul_vd_vd_vd(dq, vcast_vd_d(-L2L)));
+
+  u = vcast_vd_d(+0.1602472219709932072e-9);
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2092255183563157007e-8));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2505230023782644465e-7));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2755724800902135303e-6));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2755731892386044373e-5));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.2480158735605815065e-4));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.1984126984148071858e-3));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.1388888888886763255e-2));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.8333333333333347095e-2));
+  u = vmla_vd_vd_vd_vd(u, s.x, vcast_vd_d(+0.4166666666666669905e-1));
+
+  t = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(s, u), vcast_vd_d(+0.1666666666666666574e+0));
+  t = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(s, t), vcast_vd_d(0.5));
+  t = ddadd2_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(ddsqu_vd2_vd2(s), t));
+
+  t = ddadd_vd2_vd_vd2(vcast_vd_d(1), t);
+
+  t.x = vldexp2_vd_vd_vi(t.x, q);
+  t.y = vldexp2_vd_vd_vi(t.y, q);
+
+  t.x = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d.x, vcast_vd_d(-1000)), vreinterpret_vm_vd(t.x)));
+  t.y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d.x, vcast_vd_d(-1000)), vreinterpret_vm_vd(t.y)));
+
+  return t;
+}
+
+EXPORT CONST vdouble xsinh(vdouble x) {
+  vdouble y = vabs_vd_vd(x);
+  vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0)));
+  d = ddsub_vd2_vd2_vd2(d, ddrec_vd2_vd2(d));
+  y = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5));
+
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(INFINITY), y);
+  y = vmulsign_vd_vd_vd(y, x);
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));
+
+  return y;
+}
+
+EXPORT CONST vdouble xcosh(vdouble x) {
+  vdouble y = vabs_vd_vd(x);
+  vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0)));
+  d = ddadd_vd2_vd2_vd2(d, ddrec_vd2_vd2(d));
+  y = vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5));
+
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(710)), visnan_vo_vd(y)), vcast_vd_d(INFINITY), y);
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));
+
+  return y;
+}
+
+EXPORT CONST vdouble xtanh(vdouble x) {
+  vdouble y = vabs_vd_vd(x);
+  vdouble2 d = expk2(vcast_vd2_vd_vd(y, vcast_vd_d(0)));
+  vdouble2 e = ddrec_vd2_vd2(d);
+  d = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd2(d, ddneg_vd2_vd2(e)), ddadd2_vd2_vd2_vd2(d, e));
+  y = vadd_vd_vd_vd(d.x, d.y);
+
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(18.714973875)), visnan_vo_vd(y)), vcast_vd_d(1.0), y);
+  y = vmulsign_vd_vd_vd(y, x);
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));
+
+  return y;
+}
+
+static INLINE CONST vdouble2 logk2(vdouble2 d) {
+  vdouble2 x, x2, m, s;
+  vdouble t;
+  vint e;
+
+  e = vilogbk_vi_vd(vmul_vd_vd_vd(d.x, vcast_vd_d(1.0/0.75)));
+
+  m.x = vldexp2_vd_vd_vi(d.x, vneg_vi_vi(e));
+  m.y = vldexp2_vd_vd_vi(d.y, vneg_vi_vi(e));
+
+  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(m, vcast_vd_d(-1)), ddadd2_vd2_vd2_vd(m, vcast_vd_d(1)));
+  x2 = ddsqu_vd2_vd2(x);
+
+  t = vcast_vd_d(0.13860436390467167910856);
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.131699838841615374240845));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.153914168346271945653214));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.181816523941564611721589));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.22222224632662035403996));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.285714285511134091777308));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.400000000000914013309483));
+  t = vmla_vd_vd_vd_vd(t, x2.x, vcast_vd_d(0.666666666666664853302393));
+
+  s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));
+  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
+  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd(ddmul_vd2_vd2_vd2(x2, x), t));
+
+  return  s;
+}
+
+EXPORT CONST vdouble xasinh(vdouble x) {
+  vdouble y = vabs_vd_vd(x);
+  vopmask o = vgt_vo_vd_vd(y, vcast_vd_d(1));
+  vdouble2 d;
+
+  d = vsel_vd2_vo_vd2_vd2(o, ddrec_vd2_vd(x), vcast_vd2_vd_vd(y, vcast_vd_d(0)));
+  d = ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(d), vcast_vd_d(1)));
+  d = vsel_vd2_vo_vd2_vd2(o, ddmul_vd2_vd2_vd(d, y), d);
+
+  d = logk2(ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(d, x)));
+  y = vadd_vd_vd_vd(d.x, d.y);
+
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)),
+            visnan_vo_vd(y)),
+           vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), x), y);
+
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));
+  y = vsel_vd_vo_vd_vd(visnegzero_vo_vd(x), vcast_vd_d(-0.0), y);
+
+  return y;
+}
+
+EXPORT CONST vdouble xacosh(vdouble x) {
+  vdouble2 d = logk2(ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd2(ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(1))), ddsqrt_vd2_vd2(ddadd2_vd2_vd_vd(x, vcast_vd_d(-1)))), x));
+  vdouble y = vadd_vd_vd_vd(d.x, d.y);
+
+  y = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vgt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(SQRT_DBL_MAX)),
+            visnan_vo_vd(y)),
+           vcast_vd_d(INFINITY), y);
+  y = vreinterpret_vd_vm(vandnot_vm_vo64_vm(veq_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y)));
+
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(vlt_vo_vd_vd(x, vcast_vd_d(1.0)), vreinterpret_vm_vd(y)));
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));
+
+  return y;
+}
+
+EXPORT CONST vdouble xatanh(vdouble x) {
+  vdouble y = vabs_vd_vd(x);
+  vdouble2 d = logk2(dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(1), y), ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(y))));
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(vgt_vo_vd_vd(y, vcast_vd_d(1.0)), vreinterpret_vm_vd(vsel_vd_vo_vd_vd(veq_vo_vd_vd(y, vcast_vd_d(1.0)), vcast_vd_d(INFINITY), vmul_vd_vd_vd(vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(0.5))))));
+
+  y = vmulsign_vd_vd_vd(y, x);
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(y)), vreinterpret_vm_vd(y)));
+  y = vreinterpret_vd_vm(vor_vm_vo64_vm(visnan_vo_vd(x), vreinterpret_vm_vd(y)));
+
+  return y;
+}
+
+EXPORT CONST vdouble xcbrt(vdouble d) {
+  vdouble x, y, q = vcast_vd_d(1.0);
+  vint e, qu, re;
+  vdouble t;
+
+#ifdef ENABLE_AVX512F
+  vdouble s = d;
+#endif
+  e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1));
+  d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e));
+
+  t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144));
+  qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0)));
+  re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3))));
+
+  q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd_d(1.2599210498948731647672106), q);
+  q = vsel_vd_vo_vd_vd(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd_d(1.5874010519681994747517056), q);
+  q = vldexp2_vd_vd_vi(q, vsub_vi_vi_vi(qu, vcast_vi_i(2048)));
+
+  q = vmulsign_vd_vd_vd(q, d);
+
+  d = vabs_vd_vd(d);
+
+  x = vcast_vd_d(-0.640245898480692909870982);
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722));
+
+  y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0)));
+  y = vmul_vd_vd_vd(vmul_vd_vd_vd(d, x), x);
+  y = vmul_vd_vd_vd(vsub_vd_vd_vd(y, vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(2.0 / 3.0), y), vmla_vd_vd_vd_vd(y, x, vcast_vd_d(-1.0)))), q);
+
+#ifdef ENABLE_AVX512F
+  y = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), s), y);
+  y = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), y);
+#endif
+
+  y = vsel_vd_vo_vd_vd(visnan_vo_vd(d), d, y);
+
+  return y;
+}
+
+EXPORT CONST vdouble xcbrt_u1(vdouble d) {
+  vdouble x, y, z, t;
+  vdouble2 q2 = vcast_vd2_d_d(1, 0), u, v;
+  vint e, qu, re;
+
+#ifdef ENABLE_AVX512F
+  vdouble s = d;
+#endif
+  e = vadd_vi_vi_vi(vilogbk_vi_vd(vabs_vd_vd(d)), vcast_vi_i(1));
+  d = vldexp2_vd_vd_vi(d, vneg_vi_vi(e));
+
+  t = vadd_vd_vd_vd(vcast_vd_vi(e), vcast_vd_d(6144));
+  qu = vtruncate_vi_vd(vmul_vd_vd_vd(t, vcast_vd_d(1.0/3.0)));
+  re = vtruncate_vi_vd(vsub_vd_vd_vd(t, vmul_vd_vd_vd(vcast_vd_vi(qu), vcast_vd_d(3))));
+
+  q2 = vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(1))), vcast_vd2_d_d(1.2599210498948731907, -2.5899333753005069177e-17), q2);
+  q2 = vsel_vd2_vo_vd2_vd2(vcast_vo64_vo32(veq_vo_vi_vi(re, vcast_vi_i(2))), vcast_vd2_d_d(1.5874010519681995834, -1.0869008194197822986e-16), q2);
+
+  q2.x = vmulsign_vd_vd_vd(q2.x, d); q2.y = vmulsign_vd_vd_vd(q2.y, d);
+  d = vabs_vd_vd(d);
+
+  x = vcast_vd_d(-0.640245898480692909870982);
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.96155103020039511818595));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-5.73353060922947843636166));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(6.03990368989458747961407));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(-3.85841935510444988821632));
+  x = vmla_vd_vd_vd_vd(x, d, vcast_vd_d(2.2307275302496609725722));
+
+  y = vmul_vd_vd_vd(x, x); y = vmul_vd_vd_vd(y, y); x = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vmlapn_vd_vd_vd_vd(d, y, x), vcast_vd_d(1.0 / 3.0)));
+
+  z = x;
+
+  u = ddmul_vd2_vd_vd(x, x);
+  u = ddmul_vd2_vd2_vd2(u, u);
+  u = ddmul_vd2_vd2_vd(u, d);
+  u = ddadd2_vd2_vd2_vd(u, vneg_vd_vd(x));
+  y = vadd_vd_vd_vd(u.x, u.y);
+
+  y = vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(-2.0 / 3.0), y), z);
+  v = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(z, z), y);
+  v = ddmul_vd2_vd2_vd(v, d);
+  v = ddmul_vd2_vd2_vd2(v, q2);
+  z = vldexp2_vd_vd_vi(vadd_vd_vd_vd(v.x, v.y), vsub_vi_vi_vi(qu, vcast_vi_i(2048)));
+
+#ifndef ENABLE_AVX512F
+  z = vsel_vd_vo_vd_vd(visinf_vo_vd(d), vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), q2.x), z);
+  z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vreinterpret_vd_vm(vsignbit_vm_vd(q2.x)), z);
+#else
+  z = vsel_vd_vo_vd_vd(visinf_vo_vd(s), vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), s), z);
+  z = vsel_vd_vo_vd_vd(veq_vo_vd_vd(s, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), s), z);
+#endif
+
+  return z;
+}
+
+EXPORT CONST vdouble xexp2(vdouble d) {
+  vdouble u = vrint_vd_vd(d), s;
+  vint q = vrint_vi_vd(u);
+
+  s = vsub_vd_vd_vd(d, u);
+
+  u = vcast_vd_d(+0.4434359082926529454e-9);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.7073164598085707425e-8));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1017819260921760451e-6));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1321543872511327615e-5));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1525273353517584730e-4));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1540353045101147808e-3));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1333355814670499073e-2));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.9618129107597600536e-2));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5550410866482046596e-1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2402265069591012214e+0));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6931471805599452862e+0));
+
+#ifdef ENABLE_FMA_DP
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1));
+#else
+  u = ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s))).x;
+#endif
+  
+  u = vldexp2_vd_vd_vi(u, q);
+
+  u = vsel_vd_vo_vd_vd(vge_vo_vd_vd(d, vcast_vd_d(1024)), vcast_vd_d(INFINITY), u);
+  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-2000)), vreinterpret_vm_vd(u)));
+
+  return u;
+}
+
+EXPORT CONST vdouble xexp10(vdouble d) {
+  vdouble u = vrint_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(LOG10_2))), s;
+  vint q = vrint_vi_vd(u);
+
+  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10U), d);
+  s = vmla_vd_vd_vd_vd(u, vcast_vd_d(-L10L), s);
+
+  u = vcast_vd_d(+0.2411463498334267652e-3);
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1157488415217187375e-2));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5013975546789733659e-2));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1959762320720533080e-1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.6808936399446784138e-1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2069958494722676234e+0));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.5393829292058536229e+0));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.1171255148908541655e+1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2034678592293432953e+1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2650949055239205876e+1));
+  u = vmla_vd_vd_vd_vd(u, s, vcast_vd_d(+0.2302585092994045901e+1));
+
+#ifdef ENABLE_FMA_DP
+  u = vfma_vd_vd_vd_vd(u, s, vcast_vd_d(1));
+#else
+  u = ddnormalize_vd2_vd2(ddadd_vd2_vd_vd2(vcast_vd_d(1), ddmul_vd2_vd_vd(u, s))).x;
+#endif
+  
+  u = vldexp2_vd_vd_vi(u, q);
+
+  u = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(308.25471555991671)), vcast_vd_d(INFINITY), u);
+  u = vreinterpret_vd_vm(vandnot_vm_vo64_vm(vlt_vo_vd_vd(d, vcast_vd_d(-350)), vreinterpret_vm_vd(u)));
+
+  return u;
+}
+
+EXPORT CONST vdouble xexpm1(vdouble a) {
+  vdouble2 d = ddadd2_vd2_vd2_vd(expk2(vcast_vd2_vd_vd(a, vcast_vd_d(0))), vcast_vd_d(-1.0));
+  vdouble x = vadd_vd_vd_vd(d.x, d.y);
+  x = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(a, vcast_vd_d(709.782712893383996732223)), vcast_vd_d(INFINITY), x);
+  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(a, vcast_vd_d(-36.736800569677101399113302437)), vcast_vd_d(-1), x);
+  x = vsel_vd_vo_vd_vd(visnegzero_vo_vd(a), vcast_vd_d(-0.0), x);
+  return x;
+}
+
+EXPORT CONST vdouble xlog10(vdouble d) {
+  vdouble2 x;
+  vdouble t, m, x2;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vd_vd(d, vcast_vd_d(DBL_MIN));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d((double)(1LL << 32) * (double)(1LL << 32))), d);
+  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  m = vldexp3_vd_vd_vi(d, vneg_vi_vi(e));
+  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
+#else
+  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(d, vcast_vd_d(1.0/0.75)));
+  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
+  m = vgetmant_vd_vd(d);
+#endif
+
+  x = dddiv_vd2_vd2_vd2(ddadd2_vd2_vd_vd(vcast_vd_d(-1), m), ddadd2_vd2_vd_vd(vcast_vd_d(1), m));
+  x2 = vmul_vd_vd_vd(x.x, x.x);
+
+  t = vcast_vd_d(+0.6653725819576758460e-1);
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.6625722782820833712e-1));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.7898105214313944078e-1));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.9650955035715275132e-1));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.1240841409721444993e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.1737177927454605086e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(+0.2895296546021972617e+0));
+  
+#ifndef ENABLE_AVX512F
+  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), vcast_vd_vi(e));
+#else
+  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.30102999566398119802, -2.803728127785170339e-18), e);
+#endif
+
+  s = ddadd_vd2_vd2_vd2(s, ddmul_vd2_vd2_vd2(x, vcast_vd2_d_d(0.86858896380650363334, 1.1430059694096389311e-17)));
+  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, x.x), t));
+
+  vdouble r = vadd_vd_vd_vd(s.x, s.y);
+
+#ifndef ENABLE_AVX512F
+  r = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(INFINITY), r);
+  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(0)), visnan_vo_vd(d)), vcast_vd_d(NAN), r);
+  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(-INFINITY), r);
+#else
+  r = vfixup_vd_vd_vd_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
+#endif
+  
+  return r;
+}
+
+EXPORT CONST vdouble xlog1p_fast(vdouble d) {
+  vdouble2 x;
+  vdouble t, m, x2;
+
+  vdouble dp1 = vadd_vd_vd_vd(d, vcast_vd_d(1));
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vd_vd(dp1, vcast_vd_d(DBL_MIN));
+  dp1 = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(dp1, vcast_vd_d((double)(1LL << 32) * (double)(1LL << 32))), dp1);
+  vint e = vilogb2k_vi_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
+  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(e));
+  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
+  e = vsel_vi_vo_vi_vi(vcast_vo32_vo64(o), vsub_vi_vi_vi(e, vcast_vi_i(64)), e);
+  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), vcast_vd_vi(e));
+#else
+  vdouble e = vgetexp_vd_vd(vmul_vd_vd_vd(dp1, vcast_vd_d(1.0/0.75)));
+  e = vsel_vd_vo_vd_vd(vispinf_vo_vd(e), vcast_vd_d(1024.0), e);
+  t = vldexp3_vd_vd_vi(vcast_vd_d(1), vneg_vi_vi(vrint_vi_vd(e)));
+  m = vmla_vd_vd_vd_vd(d, t, vsub_vd_vd_vd(t, vcast_vd_d(1)));
+  vdouble2 s = ddmul_vd2_vd2_vd(vcast_vd2_d_d(0.693147180559945286226764, 2.319046813846299558417771e-17), e);
+#endif
+
+  x = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(m, vcast_vd_d(0)), ddadd_vd2_vd_vd(vcast_vd_d(2), m));
+  x2 = vmul_vd_vd_vd(x.x, x.x);
+
+  t = vcast_vd_d(0.1532076988502701353e+0);
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1525629051003428716e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.1818605932937785996e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2222214519839380009e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.2857142932794299317e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.3999999999635251990e+0));
+  t = vmla_vd_vd_vd_vd(t, x2, vcast_vd_d(0.6666666666667333541e+0));
+
+  s = ddadd_vd2_vd2_vd2(s, ddscale_vd2_vd2_vd(x, vcast_vd_d(2)));
+  s = ddadd_vd2_vd2_vd(s, vmul_vd_vd_vd(vmul_vd_vd_vd(x2, x.x), t));
+
+  vdouble r = vadd_vd_vd_vd(s.x, s.y);
+
+  r = vsel_vd_vo_vd_vd(vgt_vo_vd_vd(d, vcast_vd_d(1e+307)), vcast_vd_d(INFINITY), r);
+  r = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(d, vcast_vd_d(-1)), visnan_vo_vd(d)), vcast_vd_d(NAN), r);
+  r = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(-1)), vcast_vd_d(-INFINITY), r);
+  r = vsel_vd_vo_vd_vd(visnegzero_vo_vd(d), vcast_vd_d(-0.0), r);
+
+  return r;
+}
+
+
+EXPORT CONST vdouble xlog1p(vdouble a) {
+  vdouble log1_small = xlog1p_fast(a);
+
+  vdouble cutoff = vcast_vd_d(1e52);
+  if (vall_lte64_i_vd_vd(a, cutoff))
+    return log1_small;
+
+  vopmask gt_cutoff = vgt_vo_vd_vd(a, cutoff);
+  vdouble log1_big = xlog(a);
+  return vsel_vd_vo_vd_vd(gt_cutoff, log1_big, log1_small);
+}
+
+//
+
+static INLINE CONST vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); }
+static INLINE CONST vint2 vrev21_vi2_vi2(vint2 i) { return vreinterpret_vi2_vf(vrev21_vf_vf(vreinterpret_vf_vi2(i))); }
+
+EXPORT CONST vdouble xfabs(vdouble x) { return vabs_vd_vd(x); }
+
+EXPORT CONST vdouble xcopysign(vdouble x, vdouble y) { return vcopysign_vd_vd_vd(x, y); }
+
+EXPORT CONST vdouble xfmax(vdouble x, vdouble y) {
+#if SLEEF_DOUBLE_MINMAXNUM_AVAILABLE
+  return vmaxnum_vd_vd_vd(x, y);
+#else
+  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(x, y), x, y));
+#endif
+}
+
+EXPORT CONST vdouble xfmin(vdouble x, vdouble y) {
+#if SLEEF_DOUBLE_MINMAXNUM_AVAILABLE
+  return vminnum_vd_vd_vd(x, y);
+#else
+  return vsel_vd_vo_vd_vd(visnan_vo_vd(y), x, vsel_vd_vo_vd_vd(vgt_vo_vd_vd(y, x), x, y));
+#endif
+}
+
+EXPORT CONST vdouble xfdim(vdouble x, vdouble y) {
+  vdouble ret = vsub_vd_vd_vd(x, y);
+  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(ret, vcast_vd_d(0)), veq_vo_vd_vd(x, y)), vcast_vd_d(0), ret);
+  return ret;
+}
+
+EXPORT CONST vdouble xtrunc(vdouble x) {
+  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(1LL << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31)))))));
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(1LL << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
+}
+
+EXPORT CONST vdouble xfloor(vdouble x) {
+  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(1LL << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31)))))));
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
+  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(1LL << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
+}
+
+EXPORT CONST vdouble xceil(vdouble x) {
+  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(1LL << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31)))))));
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  fr = vsel_vd_vo_vd_vd(vle_vo_vd_vd(fr, vcast_vd_d(0)), fr, vsub_vd_vd_vd(fr, vcast_vd_d(1.0)));
+  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(x), vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(1LL << 52))), x, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), x));
+}
+
+EXPORT CONST vdouble xround(vdouble d) {
+  vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
+  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(1LL << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31)))))));
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  x = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vle_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(fr, vcast_vd_d(0))), vsub_vd_vd_vd(x, vcast_vd_d(1.0)), x);
+  fr = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
+  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.49999999999999994449)), vcast_vd_d(0), x);
+  return vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1LL << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
+}
+
+EXPORT CONST vdouble xrint(vdouble d) {
+  vdouble x = vadd_vd_vd_vd(d, vcast_vd_d(0.5));
+  vdouble fr = vsub_vd_vd_vd(x, vmul_vd_vd_vd(vcast_vd_d(1LL << 31), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31)))))));
+  vopmask isodd = vcast_vo64_vo32(veq_vo_vi_vi(vand_vi_vi_vi(vcast_vi_i(1), vtruncate_vi_vd(fr)), vcast_vi_i(1)));
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  fr = vsel_vd_vo_vd_vd(vor_vo_vo_vo(vlt_vo_vd_vd(fr, vcast_vd_d(0)), vand_vo_vo_vo(veq_vo_vd_vd(fr, vcast_vd_d(0)), isodd)), vadd_vd_vd_vd(fr, vcast_vd_d(1.0)), fr);
+  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0.50000000000000011102)), vcast_vd_d(0), x);
+  vdouble ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visinf_vo_vd(d), vge_vo_vd_vd(vabs_vd_vd(d), vcast_vd_d(1LL << 52))), d, vcopysign_vd_vd_vd(vsub_vd_vd_vd(x, fr), d));
+  return ret;
+}
+
+EXPORT CONST vdouble xnextafter(vdouble x, vdouble y) {
+  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vmulsign_vd_vd_vd(vcast_vd_d(0), y), x);
+  vint2 t, xi2 = vreinterpret_vi2_vd(x);
+  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vd(x), vge_vo_vd_vd(y, x));
+
+  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
+  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
+  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));
+
+  xi2 = vsub_vi2_vi2_vi2(xi2, vcast_vi2_vm(vand_vm_vo64_vm(vneq_vo_vd_vd(x, y), vcast_vm_i_i(0, 1))));
+
+  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(vneq_vo_vd_vd(x, y),
+               vreinterpret_vd_vi2(vadd_vi2_vi2_vi2(xi2, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, -1), veq_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0, -1)))))),
+               vreinterpret_vd_vi2(xi2)));
+
+  t = vadd_vi2_vi2_vi2(vxor_vi2_vi2_vi2(xi2, vcast_vi2_i_i(0x7fffffff, 0xffffffff)), vcast_vi2_i_i(0, 1));
+  t = vadd_vi2_vi2_vi2(t, vrev21_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i_i(0, 1), veq_vi2_vi2_vi2(t, vcast_vi2_i_i(-1, 0)))));
+  xi2 = vreinterpret_vi2_vd(vsel_vd_vo_vd_vd(c, vreinterpret_vd_vi2(t), vreinterpret_vd_vi2(xi2)));
+
+  vdouble ret = vreinterpret_vd_vi2(xi2);
+
+  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(ret, vcast_vd_d(0)), vneq_vo_vd_vd(x, vcast_vd_d(0))),
+       vmulsign_vd_vd_vd(vcast_vd_d(0), x), ret);
+
+  ret = vsel_vd_vo_vd_vd(vand_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), y, ret);
+
+  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(NAN), ret);
+
+  return ret;
+}
+
+EXPORT CONST vdouble xfrfrexp(vdouble x) {
+  vdouble j = x;
+  x = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(DBL_MIN)),
+                       vmul_vd_vd_vd(x, vcast_vd_d((double)(1ULL << 63))),
+                       x);
+
+  vmask xm = vreinterpret_vm_vd(x);
+  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7ff00000, ~0));
+  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3fe00000,  0));
+
+  vdouble ret = vreinterpret_vd_vm(xm);
+
+  ret = vsel_vd_vo_vd_vd(visinf_vo_vd(x),
+                         vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), x),
+                         ret);
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0.0)), x, ret);
+
+  ret = vsel_vd_vo_vd_vd(visnan_vo_vd(j), j, ret);
+  return ret;
+}
+
+EXPORT CONST vmask xexpfrexp(vdouble x) {
+  vopmask isnan = vor_vo_vo_vo(visinf_vo_vd(x), visnan_vo_vd(x));
+  vdouble mul = vmul_vd_vd_vd(x, vcast_vd_d(0x1p+63));
+  vopmask is_denorm = vlt_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(0x1p-1022));
+
+  x = vsel_vd_vo_vd_vd(is_denorm, mul, x);
+  const vint2 m63 = vcast_vi2_i64(-63);
+  const vint2 zeros = vcast_vi2_i(0);
+  vint2 correct = vsel_vi2_vo_vi2_vi2(is_denorm, m63, zeros);
+
+  vint2 ret = vreinterpret_vi2_vd(x);
+
+#if defined(ENABLE_NEON32) || defined(ENABLE_ADVSIMD)
+  ret = vsrl64_vi2_vi_52(ret);
+#else
+  ret = vsrl64_vi2_vi(ret, 52);
+#endif
+  ret = vand_vi2_vi2_vi2(ret, vcast_vi2_i64(0x7ff));
+  ret = vsub64_vi2_vi2_vi2(ret, vcast_vi2_i64(0x3fe));
+  ret = vadd64_vi2_vi2_vi2(ret, correct);
+
+  ret = vsel_vi2_vo_vi2_vi2(
+            veq_vo_vd_vd(x, vreinterpret_vd_vi2(zeros)),
+            zeros,
+            ret);
+  ret = vsel_vi2_vo_vi2_vi2(isnan, zeros, ret);
+
+  return vcast_vm_vi2(ret);
+}
+
+EXPORT CONST vdouble xfma(vdouble x, vdouble y, vdouble z) {
+#ifdef ENABLE_FMA_DP
+  return vmla_vd_vd_vd_vd(x, y, z);
+#else
+  vdouble h2 = vadd_vd_vd_vd(vmul_vd_vd_vd(x, y), z), q = vcast_vd_d(1);
+  vopmask o = vlt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e-300));
+  {
+    const double c0 = 1ULL << 54, c1 = c0 * c0, c2 = c1 * c1;
+    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(c1)), x);
+    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(c1)), y);
+    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(c2)), z);
+    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.0 / c2), q);
+  }
+  o = vgt_vo_vd_vd(vabs_vd_vd(h2), vcast_vd_d(1e+300));
+  {
+    const double c0 = 1ULL << 54, c1 = c0 * c0, c2 = c1 * c1;
+    x = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(x, vcast_vd_d(1.0 / c1)), x);
+    y = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(y, vcast_vd_d(1.0 / c1)), y);
+    z = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(z, vcast_vd_d(1.0 / c2)), z);
+    q = vsel_vd_vo_vd_vd(o, vcast_vd_d(c2), q);
+  }
+  vdouble2 d = ddmul_vd2_vd_vd(x, y);
+  d = ddadd2_vd2_vd2_vd(d, z);
+  vdouble ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(0)), veq_vo_vd_vd(y, vcast_vd_d(0))), z, vadd_vd_vd_vd(d.x, d.y));
+  o = visinf_vo_vd(z);
+  o = vandnot_vo_vo_vo(visinf_vo_vd(x), o);
+  o = vandnot_vo_vo_vo(visnan_vo_vd(x), o);
+  o = vandnot_vo_vo_vo(visinf_vo_vd(y), o);
+  o = vandnot_vo_vo_vo(visnan_vo_vd(y), o);
+  h2 = vsel_vd_vo_vd_vd(o, z, h2);
+
+  o = vor_vo_vo_vo(visinf_vo_vd(h2), visnan_vo_vd(h2));
+
+  return vsel_vd_vo_vd_vd(o, h2, vmul_vd_vd_vd(ret, q));
+#endif
+}
+
+EXPORT CONST vdouble xsqrt_u05(vdouble d) {
+#if 1
+  return vsqrt_vd_vd(d);
+#else
+  vdouble q;
+  vopmask o;
+
+  d = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(d, vcast_vd_d(0)), vcast_vd_d(NAN), d);
+
+  o = vlt_vo_vd_vd(d, vcast_vd_d(8.636168555094445E-78));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1.157920892373162E77)), d);
+  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(2.9387358770557188E-39*0.5), vcast_vd_d(0.5));
+
+  o = vgt_vo_vd_vd(d, vcast_vd_d(1.3407807929942597e+154));
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(7.4583407312002070e-155)), d);
+  q = vsel_vd_vo_vd_vd(o, vcast_vd_d(1.1579208923731620e+77*0.5), q);
+
+  vdouble x = vreinterpret_vd_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i_i(0x5fe6ec86, 0), vsrl_vi2_vi2_i(vreinterpret_vi2_vd(vadd_vd_vd_vd(d, vcast_vd_d(1e-320))), 1)));
+
+  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
+  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
+  x = vmul_vd_vd_vd(x, vsub_vd_vd_vd(vcast_vd_d(1.5), vmul_vd_vd_vd(vmul_vd_vd_vd(vmul_vd_vd_vd(vcast_vd_d(0.5), d), x), x)));
+  x = vmul_vd_vd_vd(x, d);
+
+  vdouble2 d2 = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd_vd2(d, ddmul_vd2_vd_vd(x, x)), ddrec_vd2_vd(x));
+
+  x = vmul_vd_vd_vd(vadd_vd_vd_vd(d2.x, d2.y), q);
+
+  x = vsel_vd_vo_vd_vd(vispinf_vo_vd(d), vcast_vd_d(INFINITY), x);
+  x = vsel_vd_vo_vd_vd(veq_vo_vd_vd(d, vcast_vd_d(0)), d, x);
+  x = vsel_vd_vo_vd_vd(visnan_vo_vd(d), d, x);
+
+  return x;
+#endif
+}
+
+EXPORT CONST vdouble xsqrt_u35(vdouble d) { return xsqrt_u05(d); }
+
+EXPORT CONST vdouble xhypot_u05(vdouble x, vdouble y) {
+  x = vabs_vd_vd(x);
+  y = vabs_vd_vd(y);
+  vdouble min = vmin_vd_vd_vd(x, y), n = min;
+  vdouble max = vmax_vd_vd_vd(x, y), d = max;
+
+  vopmask o = vlt_vo_vd_vd(max, vcast_vd_d(DBL_MIN));
+  n = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(n, vcast_vd_d(1ULL << 54)), n);
+  d = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(d, vcast_vd_d(1ULL << 54)), d);
+
+  vdouble2 t = dddiv_vd2_vd2_vd2(vcast_vd2_vd_vd(n, vcast_vd_d(0)), vcast_vd2_vd_vd(d, vcast_vd_d(0)));
+  t = ddmul_vd2_vd2_vd(ddsqrt_vd2_vd2(ddadd2_vd2_vd2_vd(ddsqu_vd2_vd2(t), vcast_vd_d(1))), max);
+  vdouble ret = vadd_vd_vd_vd(t.x, t.y);
+  ret = vsel_vd_vo_vd_vd(visnan_vo_vd(ret), vcast_vd_d(INFINITY), ret);
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);
+  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(NAN), ret);
+  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(INFINITY))), vcast_vd_d(INFINITY), ret);
+
+  return ret;
+}
+
+EXPORT CONST vdouble xhypot_u35(vdouble x, vdouble y) {
+  x = vabs_vd_vd(x);
+  y = vabs_vd_vd(y);
+  vdouble min = vmin_vd_vd_vd(x, y);
+  vdouble max = vmax_vd_vd_vd(x, y);
+
+  vdouble t = vdiv_vd_vd_vd(min, max);
+  vdouble ret = vmul_vd_vd_vd(max, vsqrt_vd_vd(vmla_vd_vd_vd_vd(t, t, vcast_vd_d(1))));
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(min, vcast_vd_d(0)), max, ret);
+  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(visnan_vo_vd(x), visnan_vo_vd(y)), vcast_vd_d(NAN), ret);
+  ret = vsel_vd_vo_vd_vd(vor_vo_vo_vo(veq_vo_vd_vd(x, vcast_vd_d(INFINITY)), veq_vo_vd_vd(y, vcast_vd_d(INFINITY))), vcast_vd_d(INFINITY), ret);
+
+  return ret;
+}
+
+static INLINE CONST vdouble vtoward0(vdouble x) { // returns nextafter(x, 0)
+  vdouble t = vreinterpret_vd_vm(vadd64_vm_vm_vm(vreinterpret_vm_vd(x), vcast_vm_i_i(-1, -1)));
+  return vsel_vd_vo_vd_vd(veq_vo_vd_vd(x, vcast_vd_d(0)), vcast_vd_d(0), t);
+}
+
+static INLINE CONST vdouble vptrunc(vdouble x) { // round to integer toward 0, positive argument only
+#ifdef FULL_FP_ROUNDING
+  return vtruncate_vd_vd(x);
+#else
+  vdouble fr = vmla_vd_vd_vd_vd(vcast_vd_d(-(double)(1LL << 31)), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(x, vcast_vd_d(1.0 / (1LL << 31))))), x);
+  fr = vsub_vd_vd_vd(fr, vcast_vd_vi(vtruncate_vi_vd(fr)));
+  return vsel_vd_vo_vd_vd(vge_vo_vd_vd(vabs_vd_vd(x), vcast_vd_d(1LL << 52)), x, vsub_vd_vd_vd(x, fr));
+#endif
+}
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+EXPORT CONST vdouble xfmod(vdouble x, vdouble y) {
+  vdouble nu = vabs_vd_vd(x), de = vabs_vd_vd(y), s = vcast_vd_d(1), q;
+  vopmask o = vlt_vo_vd_vd(de, vcast_vd_d(DBL_MIN));
+  nu = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(nu, vcast_vd_d(1ULL << 54)), nu);
+  de = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(de, vcast_vd_d(1ULL << 54)), de);
+  s  = vsel_vd_vo_vd_vd(o, vmul_vd_vd_vd(s , vcast_vd_d(1.0 / (1ULL << 54))), s);
+  vdouble rde = vtoward0(vrec_vd_vd(de));
+  vdouble2 r = vcast_vd2_vd_vd(nu, vcast_vd_d(0));
+
+  for(int i=0;i<21;i++) { // ceil(log2(DBL_MAX) / 51) + 1
+    q = vsel_vd_vo_vd_vd(vand_vo_vo_vo(vgt_vo_vd_vd(vadd_vd_vd_vd(de, de), r.x),
+				       vge_vo_vd_vd(r.x, de)),
+			 vcast_vd_d(1), vmul_vd_vd_vd(vtoward0(r.x), rde));
+    q = vreinterpret_vd_vm(vand_vm_vm_vm(vreinterpret_vm_vd(vptrunc(q)), vcast_vm_i_i(0xffffffff, 0xfffffffe)));
+    r = ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd2(r, ddmul_vd2_vd_vd(q, vneg_vd_vd(de))));
+    if (vtestallones_i_vo64(vlt_vo_vd_vd(r.x, de))) break;
+  }
+
+  vdouble ret = vmul_vd_vd_vd(r.x, s);
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(vadd_vd_vd_vd(r.x, r.y), de), vcast_vd_d(0), ret);
+
+  ret = vmulsign_vd_vd_vd(ret, x);
+
+  ret = vsel_vd_vo_vd_vd(vlt_vo_vd_vd(nu, de), x, ret);
+  ret = vsel_vd_vo_vd_vd(veq_vo_vd_vd(de, vcast_vd_d(0)), vcast_vd_d(NAN), ret);
+
+  return ret;
+}
+
+typedef struct {
+  vdouble2 a, b;
+} dd2;
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+static CONST dd2 gammak(vdouble a) {
+  vdouble2 clc = vcast_vd2_d_d(0, 0), clln = vcast_vd2_d_d(1, 0), clld = vcast_vd2_d_d(1, 0);
+  vdouble2 v = vcast_vd2_d_d(1, 0), x, y, z;
+  vdouble t, u;
+
+  vopmask otiny = vlt_vo_vd_vd(vabs_vd_vd(a), vcast_vd_d(1e-306)), oref = vlt_vo_vd_vd(a, vcast_vd_d(0.5));
+
+  x = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(0, 0),
+        vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd_vd(vcast_vd_d(1), vneg_vd_vd(a)),
+                vcast_vd2_vd_vd(a, vcast_vd_d(0))));
+
+  vopmask o0 = vand_vo_vo_vo(vle_vo_vd_vd(vcast_vd_d(0.5), x.x), vle_vo_vd_vd(x.x, vcast_vd_d(1.1)));
+  vopmask o2 = vle_vo_vd_vd(vcast_vd_d(2.3), x.x);
+
+  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(1)), x));
+  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(2)), y));
+  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(3)), y));
+  y = ddnormalize_vd2_vd2(ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(4)), y));
+
+  vopmask o = vand_vo_vo_vo(o2, vle_vo_vd_vd(x.x, vcast_vd_d(7)));
+  clln = vsel_vd2_vo_vd2_vd2(o, y, clln);
+
+  x = vsel_vd2_vo_vd2_vd2(o, ddadd2_vd2_vd2_vd(x, vcast_vd_d(5)), x);
+
+  t = vsel_vd_vo_vd_vd(o2, vrec_vd_vd(x.x), ddnormalize_vd2_vd2(ddadd2_vd2_vd2_vd(x, vsel_vd_vo_d_d(o0, -1, -2))).x);
+
+  u = vsel_vd_vo_vo_d_d_d(o2, o0, -156.801412704022726379848862, +0.2947916772827614196e+2, +0.7074816000864609279e-7);
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +1.120804464289911606838558160000, +0.1281459691827820109e+3, +0.4009244333008730443e-6));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +13.39798545514258921833306020000, +0.2617544025784515043e+3, +0.1040114641628246946e-5));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.116546276599463200848033357000, +0.3287022855685790432e+3, +0.1508349150733329167e-5));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -1.391801093265337481495562410000, +0.2818145867730348186e+3, +0.1288143074933901020e-5));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.015056113040026424412918973400, +0.1728670414673559605e+3, +0.4744167749884993937e-6));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.179540117061234856098844714000, +0.7748735764030416817e+2, -0.6554816306542489902e-7));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002481743600264997730942489280, +0.2512856643080930752e+2, -0.3189252471452599844e-6));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.029527880945699120504851034100, +0.5766792106140076868e+1, +0.1358883821470355377e-6));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000540164767892604515196325186, +0.7270275473996180571e+0, -0.4343931277157336040e-6));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.006403362833808069794787256200, +0.8396709124579147809e-1, +0.9724785897406779555e-6));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000162516262783915816896611252, -0.8211558669746804595e-1, -0.2036886057225966011e-5));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.001914438498565477526465972390, +0.6828831828341884458e-1, +0.4373363141819725815e-5));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +7.20489541602001055898311517e-05, -0.7712481339961671511e-1, -0.9439951268304008677e-5));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000839498720672087279971000786, +0.8337492023017314957e-1, +0.2050727030376389804e-4));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -5.17179090826059219329394422e-05, -0.9094964931456242518e-1, -0.4492620183431184018e-4));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000592166437353693882857342347, +0.1000996313575929358e+0, +0.9945751236071875931e-4));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +6.97281375836585777403743539e-05, -0.1113342861544207724e+0, -0.2231547599034983196e-3));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.000784039221720066627493314301, +0.1255096673213020875e+0, +0.5096695247101967622e-3));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.000229472093621399176949318732, -0.1440498967843054368e+0, -0.1192753911667886971e-2));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, -0.002681327160493827160473958490, +0.1695571770041949811e+0, +0.2890510330742210310e-2));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.003472222222222222222175164840, -0.2073855510284092762e+0, -0.7385551028674461858e-2));
+  u = vmla_vd_vd_vd_vd(u, t, vsel_vd_vo_vo_d_d_d(o2, o0, +0.083333333333333333335592087900, +0.2705808084277815939e+0, +0.2058080842778455335e-1));
+
+  y = ddmul_vd2_vd2_vd2(ddadd2_vd2_vd2_vd(x, vcast_vd_d(-0.5)), logk2(x));
+  y = ddadd2_vd2_vd2_vd2(y, ddneg_vd2_vd2(x));
+  y = ddadd2_vd2_vd2_vd2(y, vcast_vd2_d_d(0.91893853320467278056, -3.8782941580672414498e-17)); // 0.5*log(2*M_PI)
+
+  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd (u, t), vsel_vd_vo_d_d(o0, -0.4006856343865314862e+0, -0.6735230105319810201e-1));
+  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, +0.8224670334241132030e+0, +0.3224670334241132030e+0));
+  z = ddadd2_vd2_vd2_vd(ddmul_vd2_vd2_vd(z, t), vsel_vd_vo_d_d(o0, -0.5772156649015328655e+0, +0.4227843350984671345e+0));
+  z = ddmul_vd2_vd2_vd(z, t);
+
+  clc = vsel_vd2_vo_vd2_vd2(o2, y, z);
+
+  clld = vsel_vd2_vo_vd2_vd2(o2, ddadd2_vd2_vd2_vd(ddmul_vd2_vd_vd(u, t), vcast_vd_d(1)), clld);
+
+  y = clln;
+
+  clc = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(83.1776616671934334590333, 3.67103459631568507221878e-15), // log(2^120)
+          vsel_vd2_vo_vd2_vd2(oref, ddadd2_vd2_vd2_vd2(vcast_vd2_d_d(1.1447298858494001639, 1.026595116270782638e-17), ddneg_vd2_vd2(clc)), clc)); // log(M_PI)
+  clln = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_d_d(1, 0), vsel_vd2_vo_vd2_vd2(oref, clln, clld));
+
+  if (!vtestallones_i_vo64(vnot_vo64_vo64(oref))) {
+    t = vsub_vd_vd_vd(a, vmul_vd_vd_vd(vcast_vd_d(1LL << 28), vcast_vd_vi(vtruncate_vi_vd(vmul_vd_vd_vd(a, vcast_vd_d(1.0 / (1LL << 28)))))));
+    x = ddmul_vd2_vd2_vd2(clld, sinpik(t));
+  }
+
+  clld = vsel_vd2_vo_vd2_vd2(otiny, vcast_vd2_vd_vd(vmul_vd_vd_vd(a, vcast_vd_d((1LL << 60)*(double)(1LL << 60))), vcast_vd_d(0)),
+           vsel_vd2_vo_vd2_vd2(oref, x, y));
+
+  dd2 ret = { clc, dddiv_vd2_vd2_vd2(clln, clld) };
+
+  return ret;
+}
+
+EXPORT CONST vdouble xtgamma_u1(vdouble a) {
+  dd2 d = gammak(a);
+  vdouble2 y = ddmul_vd2_vd2_vd2(expk2(d.a), d.b);
+  vdouble r = vadd_vd_vd_vd(y.x, y.y);
+  vopmask o;
+
+  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(-INFINITY)),
+        vand_vo_vo_vo(vlt_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a))),
+       vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vd(a), vlt_vo_vd_vd(a, vcast_vd_d(0))), visnan_vo_vd(r)));
+  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(NAN), r);
+
+  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(INFINITY)), visnumber_vo_vd(a)),
+          vge_vo_vd_vd(a, vcast_vd_d(-DBL_MIN))),
+        vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vd_vd(a, vcast_vd_d(0)), vgt_vo_vd_vd(a, vcast_vd_d(200))), visnan_vo_vd(r)));
+  r = vsel_vd_vo_vd_vd(o, vmulsign_vd_vd_vd(vcast_vd_d(INFINITY), a), r);
+
+  return r;
+}
+
+EXPORT CONST vdouble2 xlgamma_r_u1(vdouble a) {
+  dd2 d = gammak(a);
+  vdouble2 y = ddadd2_vd2_vd2_vd2(d.a, logk2(ddabs_vd2_vd2(d.b)));
+  vdouble r = vadd_vd_vd_vd(y.x, y.y);
+  vopmask o;
+
+  o = vor_vo_vo_vo(visinf_vo_vd(a),
+       vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a)),
+        vand_vo_vo_vo(visnumber_vo_vd(a), visnan_vo_vd(r))));
+  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(INFINITY), r);
+
+  vdouble2 ret;
+  ret.x = r;
+  ret.y = vreinterpret_vd_vm(vor_vm_vm_vm(
+                               vand_vm_vm_vm(vreinterpret_vm_vd(d.b.x),
+                                 vreinterpret_vm_vd(vcast_vd_d(-0.0))),
+                               vreinterpret_vm_vd(vcast_vd_d(1.0)))
+                            );
+
+  return ret;
+}
+
+EXPORT CONST vdouble xlgamma_u1(vdouble a) {
+  dd2 d = gammak(a);
+  vdouble2 y = ddadd2_vd2_vd2_vd2(d.a, logk2(ddabs_vd2_vd2(d.b)));
+  vdouble r = vadd_vd_vd_vd(y.x, y.y);
+  vopmask o;
+
+  o = vor_vo_vo_vo(visinf_vo_vd(a),
+       vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vd_vd(a, vcast_vd_d(0)), visint_vo_vd(a)),
+        vand_vo_vo_vo(visnumber_vo_vd(a), visnan_vo_vd(r))));
+  r = vsel_vd_vo_vd_vd(o, vcast_vd_d(INFINITY), r);
+
+  return r;
+}
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+EXPORT CONST vdouble xerf_u1(vdouble a) {
+  vdouble s = a, t, u;
+  vdouble2 d;
+
+  a = vabs_vd_vd(a);
+  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));
+  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(3.7));
+  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(6.0));
+  u = vsel_vd_vo_vd_vd(o0, vmul_vd_vd_vd(a, a), a);
+
+  t = vsel_vd_vo_vo_d_d_d(o0, o1, +0.6801072401395392157e-20, +0.2830954522087717660e-13, -0.5846750404269610493e-17);
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2161766247570056391e-18, -0.1509491946179481940e-11, +0.6076691048812607898e-15));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4695919173301598752e-17, +0.3827857177807173152e-10, -0.3007518609604893831e-13));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.9049140419888010819e-16, -0.6139733921558987241e-09, +0.9427906260824646063e-12));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1634018903557411517e-14, +0.6985387934608038824e-08, -0.2100110908269393629e-10));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2783485786333455216e-13, -0.5988224513034371474e-07, +0.3534639523461223473e-09));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.4463221276786412722e-12, +0.4005716952355346640e-06, -0.4664967728285395926e-08));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.6711366622850138987e-11, -0.2132190104575784400e-05, +0.4943823283769000532e-07));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.9422759050232658346e-10, +0.9092461304042630325e-05, -0.4271203394761148254e-06));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1229055530100228477e-08, -0.3079188080966205457e-04, +0.3034067677404915895e-05));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1480719281585085023e-07, +0.7971413443082370762e-04, -0.1776295289066871135e-04));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1636584469123402714e-06, -0.1387853215225442864e-03, +0.8524547630559505050e-04));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1646211436588923363e-05, +0.6469678026257590965e-04, -0.3290582944961784398e-03));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.1492565035840624866e-04, +0.4996645280372945860e-03, +0.9696966068789101157e-03));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1205533298178966496e-03, -0.1622802482842520535e-02, -0.1812527628046986137e-02));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.8548327023450851166e-03, +0.1615320557049377171e-03, -0.4725409828123619017e-03));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.5223977625442188799e-02, +0.1915262325574875607e-01, +0.2090315427924229266e-01));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.2686617064513125569e-01, -0.1027818298486033455e+00, -0.1052041921842776645e+00));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, +0.1128379167095512753e+00, -0.6366172819842503827e+00, -0.6345351808766568347e+00));
+  t = vmla_vd_vd_vd_vd(t, u, vsel_vd_vo_vo_d_d_d(o0, o1, -0.3761263890318375380e+00, -0.1128379590648910469e+01, -0.1129442929103524396e+01));
+  d = ddmul_vd2_vd_vd(t, u);
+
+  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_d_d_d(o0, o1, 1.1283791670955125586, 3.4110644736196137587e-08, 0.00024963035690526438285),
+              vsel_vd_vo_vo_d_d_d(o0, o1, 1.5335459613165822674e-17, -2.4875650708323294246e-24, -5.4362665034856259795e-21)));
+  d = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd2_vd(d, a), ddadd_vd2_vd_vd2(vcast_vd_d(1.0), ddneg_vd2_vd2(expk2(d))));
+
+  u = vmulsign_vd_vd_vd(vsel_vd_vo_vd_vd(o2, vadd_vd_vd_vd(d.x, d.y), vcast_vd_d(1)), s);
+  u = vsel_vd_vo_vd_vd(visnan_vo_vd(a), vcast_vd_d(NAN), u);
+
+  return u;
+}
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+EXPORT CONST vdouble xerfc_u15(vdouble a) {
+  vdouble s = a, r = vcast_vd_d(0), t;
+  vdouble2 u, d, x;
+  a = vabs_vd_vd(a);
+  vopmask o0 = vlt_vo_vd_vd(a, vcast_vd_d(1.0));
+  vopmask o1 = vlt_vo_vd_vd(a, vcast_vd_d(2.2));
+  vopmask o2 = vlt_vo_vd_vd(a, vcast_vd_d(4.2));
+  vopmask o3 = vlt_vo_vd_vd(a, vcast_vd_d(27.3));
+
+  u = vsel_vd2_vo_vd2_vd2(o0, ddmul_vd2_vd_vd(a, a), vsel_vd2_vo_vd2_vd2(o1, vcast_vd2_vd_vd(a, vcast_vd_d(0)), dddiv_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), vcast_vd2_vd_vd(a, vcast_vd_d(0)))));
+
+  t = vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.6801072401395386139e-20, +0.3438010341362585303e-12, -0.5757819536420710449e+2, +0.2334249729638701319e+5);
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2161766247570055669e-18, -0.1237021188160598264e-10, +0.4669289654498104483e+3, -0.4695661044933107769e+5));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4695919173301595670e-17, +0.2117985839877627852e-09, -0.1796329879461355858e+4, +0.3173403108748643353e+5));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.9049140419888007122e-16, -0.2290560929177369506e-08, +0.4355892193699575728e+4, +0.3242982786959573787e+4));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1634018903557410728e-14, +0.1748931621698149538e-07, -0.7456258884965764992e+4, -0.2014717999760347811e+5));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2783485786333451745e-13, -0.9956602606623249195e-07, +0.9553977358167021521e+4, +0.1554006970967118286e+5));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.4463221276786415752e-12, +0.4330010240640327080e-06, -0.9470019905444229153e+4, -0.6150874190563554293e+4));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.6711366622850136563e-11, -0.1435050600991763331e-05, +0.7387344321849855078e+4, +0.1240047765634815732e+4));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.9422759050232662223e-10, +0.3460139479650695662e-05, -0.4557713054166382790e+4, -0.8210325475752699731e+2));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1229055530100229098e-08, -0.4988908180632898173e-05, +0.2207866967354055305e+4, +0.3242443880839930870e+2));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1480719281585086512e-07, -0.1308775976326352012e-05, -0.8217975658621754746e+3, -0.2923418863833160586e+2));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1636584469123399803e-06, +0.2825086540850310103e-04, +0.2268659483507917400e+3, +0.3457461732814383071e+0));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1646211436588923575e-05, -0.6393913713069986071e-04, -0.4633361260318560682e+2, +0.5489730155952392998e+1));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.1492565035840623511e-04, -0.2566436514695078926e-04, +0.9557380123733945965e+1, +0.1559934132251294134e-2));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.1205533298178967851e-03, +0.5895792375659440364e-03, -0.2958429331939661289e+1, -0.1541741566831520638e+1));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.8548327023450850081e-03, -0.1695715579163588598e-02, +0.1670329508092765480e+0, +0.2823152230558364186e-5));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, +0.5223977625442187932e-02, +0.2089116434918055149e-03, +0.6096615680115419211e+0, +0.6249999184195342838e+0));
+  t = vmla_vd_vd_vd_vd(t, u.x, vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.2686617064513125222e-01, +0.1912855949584917753e-01, +0.1059212443193543585e-2, +0.1741749416408701288e-8));
+
+  d = ddmul_vd2_vd2_vd(u, t);
+  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 0.11283791670955126141, -0.10277263343147646779, -0.50005180473999022439, -0.5000000000258444377),
+              vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -4.0175691625932118483e-18, -6.2338714083404900225e-18, 2.6362140569041995803e-17, -4.0074044712386992281e-17)));
+  d = ddmul_vd2_vd2_vd2(d, u);
+  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.37612638903183753802, -0.63661976742916359662, 1.601106273924963368e-06, 2.3761973137523364792e-13),
+              vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.3391897206042552387e-17, 7.6321019159085724662e-18, 1.1974001857764476775e-23, -1.1670076950531026582e-29)));
+  d = ddmul_vd2_vd2_vd2(d, u);
+  d = ddadd2_vd2_vd2_vd2(d, vcast_vd2_vd_vd(vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.1283791670955125586, -1.1283791674717296161, -0.57236496645145429341, -0.57236494292470108114),
+              vsel_vd_vo_vo_vo_d_d_d_d(o0, o1, o2, 1.5335459613165822674e-17, 8.0896847755965377194e-17, 3.0704553245872027258e-17, -2.3984352208056898003e-17)));
+
+  x = ddmul_vd2_vd2_vd(vsel_vd2_vo_vd2_vd2(o1, d, vcast_vd2_vd_vd(vneg_vd_vd(a), vcast_vd_d(0))), a);
+  x = vsel_vd2_vo_vd2_vd2(o1, x, ddadd2_vd2_vd2_vd2(x, d));
+  x = vsel_vd2_vo_vd2_vd2(o0, ddsub_vd2_vd2_vd2(vcast_vd2_d_d(1, 0), x), expk2(x));
+  x = vsel_vd2_vo_vd2_vd2(o1, x, ddmul_vd2_vd2_vd2(x, u));
+
+  r = vsel_vd_vo_vd_vd(o3, vadd_vd_vd_vd(x.x, x.y), vcast_vd_d(0));
+  r = vsel_vd_vo_vd_vd(vsignbit_vo_vd(s), vsub_vd_vd_vd(vcast_vd_d(2), r), r);
+  r = vsel_vd_vo_vd_vd(visnan_vo_vd(s), vcast_vd_d(NAN), r);
+  return r;
+}
+
+#ifdef ENABLE_MAIN
+// gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch -DENABLE_AVX2 -mavx2 -mfma sleefsimddp.c ../common/common.c -lm
+#include <stdio.h>
+#include <stdlib.h>
+int main(int argc, char **argv) {
+  vdouble d1 = vcast_vd_d(atof(argv[1]));
+  vdouble d2 = vcast_vd_d(atof(argv[2]));
+  //vdouble d3 = vcast_vd_d(atof(argv[3]));
+  //vdouble r = xnextafter(d1, d2);
+  //int i;
+  //double fr = frexp(atof(argv[1]), &i);
+  //printf("%.20g\n", xfma(d1, d2, d3)[0]);;
+  //printf("test %.20g\n", xtgamma_u1(d1)[0]);
+  //printf("corr %.20g\n", tgamma(d1[0]));
+  //printf("test %.20g\n", xerf_u1(d1)[0]);
+  //printf("corr %.20g\n", erf(d1[0]));
+  //printf("test %.20g\n", xerfc_u15(d1)[0]);
+  //printf("corr %.20g\n", erfc(d1[0]));
+  //printf("%.20g\n", nextafter(d1[0], d2[0]));;
+  //printf("%.20g\n", vcast_d_vd(xhypot_u05(d1, d2)));
+  //printf("%.20g\n", fr);
+  printf("%.20g\n", fmod(atof(argv[1]), atof(argv[2])));
+  printf("%.20g\n", xfmod(d1, d2)[0]);
+  //vdouble2 r = xsincospi_u35(a);
+  //printf("%g, %g\n", vcast_d_vd(r.x), vcast_d_vd(r.y));
+}
+#endif
diff --git a/lib/kernel/sleef/libm/sleefsimdsp.c b/lib/kernel/sleef/libm/sleefsimdsp.c
new file mode 100644
index 0000000..d3d68da
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleefsimdsp.c
@@ -0,0 +1,2307 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+// Always use -ffp-contract=off option to compile SLEEF.
+
+#include <stdint.h>
+#include <assert.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+
+#include "misc.h"
+
+#if (defined(_MSC_VER))
+#pragma fp_contract (off)
+#endif
+
+//
+
+#include "helpers.h"
+#include "df.h"
+
+static INLINE CONST vopmask visnegzero_vo_vf(vfloat d) {
+  return veq_vo_vi2_vi2(vreinterpret_vi2_vf(d), vreinterpret_vi2_vf(vcast_vf_f(-0.0)));
+}
+
+static INLINE vopmask vnot_vo32_vo32(vopmask x) {
+  return vxor_vo_vo_vo(x, veq_vo_vi2_vi2(vcast_vi2_i(0), vcast_vi2_i(0)));
+}
+
+static INLINE CONST vmask vsignbit_vm_vf(vfloat f) {
+  return vand_vm_vm_vm(vreinterpret_vm_vf(f), vreinterpret_vm_vf(vcast_vf_f(-0.0f)));
+}
+
+static INLINE CONST vfloat vmulsign_vf_vf_vf(vfloat x, vfloat y) {
+  return vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x), vsignbit_vm_vf(y)));
+}
+
+static INLINE CONST vfloat vcopysign_vf_vf_vf(vfloat x, vfloat y) {
+  return vreinterpret_vf_vm(vxor_vm_vm_vm(vandnot_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(x)),
+            vand_vm_vm_vm   (vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(y))));
+}
+
+static INLINE CONST vfloat vsign_vf_vf(vfloat f) {
+  return vreinterpret_vf_vm(vor_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(1.0f)), vand_vm_vm_vm(vreinterpret_vm_vf(vcast_vf_f(-0.0f)), vreinterpret_vm_vf(f))));
+}
+
+static INLINE CONST vopmask vsignbit_vo_vf(vfloat d) {
+  return veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vcast_vi2_i(0x80000000)), vcast_vi2_i(0x80000000));
+}
+
+static INLINE CONST vint2 vsel_vi2_vf_vf_vi2_vi2(vfloat f0, vfloat f1, vint2 x, vint2 y) {
+  return vsel_vi2_vo_vi2_vi2(vlt_vo_vf_vf(f0, f1), x, y);
+}
+
+static INLINE CONST vint2 vsel_vi2_vf_vi2(vfloat d, vint2 x) {
+  return vand_vi2_vo_vi2(vsignbit_vo_vf(d), x);
+}
+
+static INLINE CONST vopmask visint_vo_vf(vfloat y) { return veq_vo_vf_vf(vtruncate_vf_vf(y), y); }
+
+static INLINE CONST vopmask visnumber_vo_vf(vfloat x) { return vnot_vo32_vo32(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x))); }
+
+#ifndef ENABLE_AVX512F
+static INLINE CONST vint2 vilogbk_vi2_vf(vfloat d) {
+  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(5.421010862427522E-20f));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(vcast_vf_f(1.8446744073709552E19f), d), d);
+  vint2 q = vand_vi2_vi2_vi2(vsrl_vi2_vi2_i(vcast_vi2_vm(vreinterpret_vm_vf(d)), 23), vcast_vi2_i(0xff));
+  q = vsub_vi2_vi2_vi2(q, vsel_vi2_vo_vi2_vi2(o, vcast_vi2_i(64 + 0x7f), vcast_vi2_i(0x7f)));
+  return q;
+}
+
+static INLINE CONST vint2 vilogb2k_vi2_vf(vfloat d) {
+  vint2 q = vreinterpret_vi2_vf(d);
+  q = vsrl_vi2_vi2_i(q, 23);
+  q = vand_vi2_vi2_vi2(q, vcast_vi2_i(0xff));
+  q = vsub_vi2_vi2_vi2(q, vcast_vi2_i(0x7f));
+  return q;
+}
+#endif
+
+//
+
+EXPORT CONST vmask xilogbf(vfloat d) {
+  vint2 e = vilogbk_vi2_vf(vabs_vf_vf(d));
+  e = vsel_vi2_vo_vi2_vi2(veq_vo_vf_vf(d, vcast_vf_f(0.0f)), vcast_vi2_i(FP_ILOGB0), e);
+  e = vsel_vi2_vo_vi2_vi2(visnan_vo_vf(d), vcast_vi2_i(FP_ILOGBNAN), e);
+  e = vsel_vi2_vo_vi2_vi2(visinf_vo_vf(d), vcast_vi2_i(INT_MAX), e);
+  return vcast_vm_vi2(e);
+}
+
+static INLINE CONST vfloat vpow2i_vf_vi2(vint2 q) {
+  return vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)));
+}
+
+static INLINE CONST vfloat vldexp_vf_vf_vi2(vfloat x, vint2 q) {
+  vfloat u;
+  vint2 m = vsra_vi2_vi2_i(q, 31);
+  m = vsll_vi2_vi2_i(vsub_vi2_vi2_vi2(vsra_vi2_vi2_i(vadd_vi2_vi2_vi2(m, q), 6), m), 4);
+  q = vsub_vi2_vi2_vi2(q, vsll_vi2_vi2_i(m, 2));
+  m = vadd_vi2_vi2_vi2(m, vcast_vi2_i(0x7f));
+  m = vand_vi2_vi2_vi2(vgt_vi2_vi2_vi2(m, vcast_vi2_i(0)), m);
+  vint2 n = vgt_vi2_vi2_vi2(m, vcast_vi2_i(0xff));
+  m = vor_vi2_vi2_vi2(vandnot_vi2_vi2_vi2(n, m), vand_vi2_vi2_vi2(n, vcast_vi2_i(0xff)));
+  u = vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(m, 23)));
+  x = vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(x, u), u), u), u);
+  u = vreinterpret_vf_vm(vcast_vm_vi2(vsll_vi2_vi2_i(vadd_vi2_vi2_vi2(q, vcast_vi2_i(0x7f)), 23)));
+  return vmul_vf_vf_vf(x, u);
+}
+
+static INLINE CONST vfloat vldexp2_vf_vf_vi2(vfloat d, vint2 e) {
+  return vmul_vf_vf_vf(vmul_vf_vf_vf(d, vpow2i_vf_vi2(vsra_vi2_vi2_i(e, 1))), vpow2i_vf_vi2(vsub_vi2_vi2_vi2(e, vsra_vi2_vi2_i(e, 1))));
+}
+
+static INLINE CONST vfloat vldexp3_vf_vf_vi2(vfloat d, vint2 q) {
+  return vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vreinterpret_vi2_vf(d), vsll_vi2_vi2_i(q, 23)));
+}
+
+EXPORT CONST vfloat xldexpf(vfloat x, vmask qm) {
+  vint2 q1 = vcast_vi2_vm(qm);
+  vint2 min = vcast_vi2_i(-2000);
+  vint2 mask = vgt_vi2_vi2_vi2(min, q1);
+  vint2 q = vor_vi2_vi2_vi2(vand_vi2_vi2_vi2(mask, min),
+                                vandnot_vi2_vi2_vi2(mask, q1));
+  vfloat res = vldexp_vf_vf_vi2(x, q);
+
+  res = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(0.0f)), x, res);
+  res = vsel_vf_vo_vf_vf(visinf_vo_vf(x), x, res);
+  res = vsel_vf_vo_vf_vf(visnan_vo_vf(x), x, res);
+  return res;
+}
+
+EXPORT CONST vfloat xsinf(vfloat d) {
+  vint2 q;
+  vfloat u, s, r = d;
+
+  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)));
+  u = vcast_vf_vi2(q);
+
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df), d);
+
+  s = vmul_vf_vf_vf(d, d);
+
+  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
+
+  u = vcast_vf_f(2.6083159809786593541503e-06f);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
+
+  u = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnegzero_vo_vf(r),
+            vgt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf))),
+           vcast_vf_f(-0.0), u);
+
+  u = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(u)));
+
+  return u;
+}
+
+EXPORT CONST vfloat xcosf(vfloat d) {
+  vint2 q;
+  vfloat u, s, r = d;
+
+  q = vrint_vi2_vf(vsub_vf_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_1_PI)), vcast_vf_f(0.5f)));
+  q = vadd_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, q), vcast_vi2_i(1));
+
+  u = vcast_vf_vi2(q);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), d);
+  d = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), d);
+
+  s = vmul_vf_vf_vf(d, d);
+
+  d = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(d)));
+
+  u = vcast_vf_f(2.6083159809786593541503e-06f);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.0001981069071916863322258f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833307858556509017944336f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666597127914428710938f));
+
+  u = vadd_vf_vf_vf(vmul_vf_vf_vf(s, vmul_vf_vf_vf(u, d)), d);
+
+  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(r), vcast_vf_f(TRIGRANGEMAXf)),
+              vreinterpret_vm_vf(u)));
+
+  u = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(u)));
+
+  return u;
+}
+
+EXPORT CONST vfloat xtanf(vfloat d) {
+  vint2 q;
+  vopmask o;
+  vfloat u, s, x;
+
+  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)(2 * M_1_PI))));
+
+  x = d;
+
+  u = vcast_vf_vi2(q);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), x);
+  x = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), x);
+
+  s = vmul_vf_vf_vf(x, x);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
+  x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(x)));
+
+  u = vcast_vf_f(0.00927245803177356719970703f);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00331984995864331722259521f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0242998078465461730957031f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0534495301544666290283203f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.133383005857467651367188f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.333331853151321411132812f));
+
+  u = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(u, x), x);
+
+  u = vsel_vf_vo_vf_vf(o, vrec_vf_vf(u), u);
+
+#ifndef ENABLE_AVX512F
+  u = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(u)));
+#else
+  u = vfixup_vf_vf_vf_vi2_i(u, d, vcast_vi2_i((3 << (4*4)) | (3 << (5*4))), 0);
+#endif
+
+  return u;
+}
+
+EXPORT CONST vfloat xsinf_u1(vfloat d) {
+  vint2 q;
+  vfloat u, v;
+  vfloat2 s, t, x;
+
+  if (vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f)))) {
+    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(M_1_PI)));
+    q = vrint_vi2_vf(u);
+    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f), d);
+    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f)));
+    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f)));
+  } else {
+    vfloat2 dfq = dfmul_vf2_vf2_vf(vcast_vf2_f_f(M_1_PI, M_1_PI - (float)M_1_PI), d);
+    vfloat t = vrint_vf_vf(vmul_vf_vf_vf(dfq.x, vcast_vf_f(1.0f / (1 << 16))));
+    dfq.y = vrint_vf_vf(vadd_vf_vf_vf(vmla_vf_vf_vf_vf(t, vcast_vf_f(-(1 << 16)), dfq.x), dfq.y));
+    q = vrint_vi2_vf(dfq.y);
+    dfq.x = vmul_vf_vf_vf(t, vcast_vf_f(1 << 16));
+    dfq = dfnormalize_vf2_vf2(dfq);
+
+    s = dfadd2_vf2_vf_vf2 (d, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_A3f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_B3f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_C3f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_D3f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_E3f)));
+    s = dfnormalize_vf2_vf2(s);
+  }
+
+  t = s;
+  s = dfsqu_vf2_vf2(s);
+
+  u = vcast_vf_f(2.6083159809786593541503e-06f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.0001981069071916863322258f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833307858556509017944336f));
+
+  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, s.x)), s));
+
+  u = dfmul_vf_vf2_vf2(t, x);
+
+  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
+  u = vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(visinf_vo_vf(d), vor_vo_vo_vo(visnegzero_vo_vf(d),
+                      vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX3f)))),
+           vcast_vf_f(-0.0), u);
+
+  return u;
+}
+
+EXPORT CONST vfloat xcosf_u1(vfloat d) {
+  vint2 q;
+  vfloat u;
+  vfloat2 s, t, x;
+
+  if (vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f)))) {
+    vfloat dq = vmla_vf_vf_vf_vf(vrint_vf_vf(vmla_vf_vf_vf_vf(d, vcast_vf_f(M_1_PI), vcast_vf_f(-0.5f))),
+         vcast_vf_f(2), vcast_vf_f(1));
+    q = vrint_vi2_vf(dq);
+    s = dfadd2_vf2_vf_vf (d, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_A2f*0.5f)));
+    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_B2f*0.5f)));
+    s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(dq, vcast_vf_f(-PI_C2f*0.5f)));
+  } else {
+    vfloat2 dfq = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(vcast_vf2_f_f(M_1_PI, M_1_PI - (float)M_1_PI), d), vcast_vf_f(-0.5f));
+    vfloat t = vrint_vf_vf(vmul_vf_vf_vf(dfq.x, vcast_vf_f(1.0f / (1 << 16))));
+    dfq.y = vmla_vf_vf_vf_vf(vrint_vf_vf(vadd_vf_vf_vf(vmla_vf_vf_vf_vf(t, vcast_vf_f(-(1 << 16)), dfq.x), dfq.y)),
+           vcast_vf_f(2), vcast_vf_f(1));
+    q = vrint_vi2_vf(dfq.y);
+    dfq.x = vmul_vf_vf_vf(t, vcast_vf_f(1 << 17));
+    dfq = dfnormalize_vf2_vf2(dfq);
+
+    s = dfadd2_vf2_vf_vf2 (d, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_A3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_B3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_C3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_D3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_E3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+  }
+
+  t = s;
+  s = dfsqu_vf2_vf2(s);
+
+  u = vcast_vf_f(2.6083159809786593541503e-06f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.0001981069071916863322258f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833307858556509017944336f));
+
+  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf(vcast_vf_f(-0.166666597127914428710938f), vmul_vf_vf_vf(u, s.x)), s));
+
+  u = dfmul_vf_vf2_vf2(t, x);
+
+  u = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0)), vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(u)));
+
+  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vandnot_vo_vo_vo(visinf_vo_vf(d), vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX3f))),
+              vreinterpret_vm_vf(u)));
+
+  return u;
+}
+
+#ifdef ENABLE_GNUABI
+#define TYPE2_FUNCATR static INLINE CONST
+#define TYPE6_FUNCATR static INLINE CONST
+#define XSINCOSF sincosfk
+#define XSINCOSF_U1 sincosfk_u1
+#define XSINCOSPIF_U05 sincospifk_u05
+#define XSINCOSPIF_U35 sincospifk_u35
+#define XMODFF modffk
+#else
+#define TYPE2_FUNCATR EXPORT CONST
+#define TYPE6_FUNCATR EXPORT
+#define XSINCOSF xsincosf
+#define XSINCOSF_U1 xsincosf_u1
+#define XSINCOSPIF_U05 xsincospif_u05
+#define XSINCOSPIF_U35 xsincospif_u35
+#define XMODFF xmodff
+#endif
+
+TYPE2_FUNCATR vfloat2 XSINCOSF(vfloat d) {
+  vint2 q;
+  vopmask o;
+  vfloat u, s, t, rx, ry;
+  vfloat2 r;
+
+  q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f((float)M_2_PI)));
+
+  s = d;
+
+  u = vcast_vf_vi2(q);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Af*0.5f), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Bf*0.5f), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Cf*0.5f), s);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_Df*0.5f), s);
+
+  t = s;
+
+  s = vmul_vf_vf_vf(s, s);
+
+  u = vcast_vf_f(-0.000195169282960705459117889f);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833215750753879547119141f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.166666537523269653320312f));
+
+  rx = vmla_vf_vf_vf_vf(vmul_vf_vf_vf(u, s), t, t);
+  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
+
+  u = vcast_vf_f(-2.71811842367242206819355e-07f);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(2.47990446951007470488548e-05f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.00138888787478208541870117f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416666641831398010253906f));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.5));
+
+  ry = vmla_vf_vf_vf_vf(s, u, vcast_vf_f(1));
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
+  r.x = vsel_vf_vo_vf_vf(o, rx, ry);
+  r.y = vsel_vf_vo_vf_vf(o, ry, rx);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
+  r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
+  r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
+  r.x = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  o = visinf_vo_vf(d);
+  r.x = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  return r;
+}
+
+TYPE2_FUNCATR vfloat2 XSINCOSF_U1(vfloat d) {
+  vint2 q;
+  vopmask o;
+  vfloat u, v, rx, ry;
+  vfloat2 r, s, t, x;
+
+  if (vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f)))) {
+    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
+    q = vrint_vi2_vf(u);
+    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
+    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
+    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
+  } else {
+    vfloat2 dfq = dfmul_vf2_vf2_vf(vcast_vf2_f_f(2*M_1_PI, 2*M_1_PI - (float)(2*M_1_PI)), d);
+    vfloat t = vrint_vf_vf(vmul_vf_vf_vf(dfq.x, vcast_vf_f(1.0f / (1 << 16))));
+    dfq.y = vrint_vf_vf(vadd_vf_vf_vf(vmla_vf_vf_vf_vf(t, vcast_vf_f(-(1 << 16)), dfq.x), dfq.y));
+    q = vrint_vi2_vf(dfq.y);
+    dfq.x = vmul_vf_vf_vf(t, vcast_vf_f(1 << 16));
+    dfq = dfnormalize_vf2_vf2(dfq);
+
+    s = dfadd2_vf2_vf_vf2 (d, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_A3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_B3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_C3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_D3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_E3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+  }
+
+  t = s;
+
+  s.x = dfsqu_vf_vf2(s);
+
+  u = vcast_vf_f(-0.000195169282960705459117889f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00833215750753879547119141f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.166666537523269653320312f));
+
+  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(s.x, t.x));
+
+  x = dfadd_vf2_vf2_vf(t, u);
+  rx = vadd_vf_vf_vf(x.x, x.y);
+
+  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
+
+  u = vcast_vf_f(-2.71811842367242206819355e-07f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(2.47990446951007470488548e-05f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.00138888787478208541870117f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416666641831398010253906f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-0.5));
+
+  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(s.x, u));
+  ry = vadd_vf_vf_vf(x.x, x.y);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(0));
+  r.x = vsel_vf_vo_vf_vf(o, rx, ry);
+  r.y = vsel_vf_vo_vf_vf(o, ry, rx);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
+  r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(2)), vcast_vi2_i(2));
+  r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX3f));
+  r.x = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  o = visinf_vo_vf(d);
+  r.x = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  return r;
+}
+
+TYPE2_FUNCATR vfloat2 XSINCOSPIF_U05(vfloat d) {
+  vopmask o;
+  vfloat u, s, t, rx, ry;
+  vfloat2 r, x, s2;
+
+  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
+  vint2 q = vtruncate_vi2_vf(u);
+  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
+  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
+
+  t = s;
+  s = vmul_vf_vf_vf(s, s);
+  s2 = dfmul_vf2_vf_vf(t, t);
+
+  //
+
+  u = vcast_vf_f(+0.3093842054e-6);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3657307388e-4));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490393585e-2));
+  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(-0.080745510756969451904, -1.3373665339076936258e-09));
+  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(0.78539818525314331055, -2.1857338617566484855e-08));
+
+  x = dfmul_vf2_vf2_vf(x, t);
+  rx = vadd_vf_vf_vf(x.x, x.y);
+
+  rx = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), rx);
+
+  //
+
+  u = vcast_vf_f(-0.2430611801e-7);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.3590577080e-5));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259917721e-3));
+  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s), vcast_vf2_f_f(0.015854343771934509277, 4.4940051354032242811e-10));
+  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x), vcast_vf2_f_f(-0.30842512845993041992, -9.0728339030733922277e-09));
+
+  x = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x, s2), vcast_vf_f(1));
+  ry = vadd_vf_vf_vf(x.x, x.y);
+
+  //
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
+  r.x = vsel_vf_vo_vf_vf(o, rx, ry);
+  r.y = vsel_vf_vo_vf_vf(o, ry, rx);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
+  r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
+  r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
+  r.x = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  o = visinf_vo_vf(d);
+  r.x = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  return r;
+}
+
+TYPE2_FUNCATR vfloat2 XSINCOSPIF_U35(vfloat d) {
+  vopmask o;
+  vfloat u, s, t, rx, ry;
+  vfloat2 r;
+
+  u = vmul_vf_vf_vf(d, vcast_vf_f(4));
+  vint2 q = vtruncate_vi2_vf(u);
+  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
+  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
+
+  t = s;
+  s = vmul_vf_vf_vf(s, s);
+
+  //
+
+  u = vcast_vf_f(-0.3600925265e-4);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2490088111e-2));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.8074551076e-1));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.7853981853e+0));
+
+  rx = vmul_vf_vf_vf(u, t);
+
+  //
+
+  u = vcast_vf_f(+0.3539815225e-5);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3259574005e-3));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1585431583e-1));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(-0.3084251285e+0));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(1));
+
+  ry = u;
+
+  //
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
+  r.x = vsel_vf_vo_vf_vf(o, rx, ry);
+  r.y = vsel_vf_vo_vf_vf(o, ry, rx);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
+  r.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.x)));
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
+  r.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(r.y)));
+
+  o = vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAXf));
+  r.x = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  o = visinf_vo_vf(d);
+  r.x = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.x)));
+  r.y = vreinterpret_vf_vm(vor_vm_vo32_vm(o, vreinterpret_vm_vf(r.y)));
+
+  return r;
+}
+
+TYPE6_FUNCATR vfloat2 XMODFF(vfloat x) {
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  fr = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1LL << 23)), vcast_vf_f(0), fr);
+
+  vfloat2 ret;
+
+  ret.x = vcopysign_vf_vf_vf(fr, x);
+  ret.y = vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x);
+
+  return ret;
+}
+
+#ifdef ENABLE_GNUABI
+EXPORT void xsincosf(vfloat a, float *ps, float *pc) {
+  vfloat2 r = sincosfk(a);
+  vstoreu_v_p_vf(ps, r.x);
+  vstoreu_v_p_vf(pc, r.y);
+}
+
+EXPORT void xsincosf_u1(vfloat a, float *ps, float *pc) {
+  vfloat2 r = sincosfk_u1(a);
+  vstoreu_v_p_vf(ps, r.x);
+  vstoreu_v_p_vf(pc, r.y);
+}
+
+EXPORT void xsincospif_u05(vfloat a, float *ps, float *pc) {
+  vfloat2 r = sincospifk_u05(a);
+  vstoreu_v_p_vf(ps, r.x);
+  vstoreu_v_p_vf(pc, r.y);
+}
+
+EXPORT void xsincospif_u35(vfloat a, float *ps, float *pc) {
+  vfloat2 r = sincospifk_u35(a);
+  vstoreu_v_p_vf(ps, r.x);
+  vstoreu_v_p_vf(pc, r.y);
+}
+
+EXPORT CONST vfloat xmodff(vfloat a, float *iptr) {
+  vfloat2 r = modffk(a);
+  vstoreu_v_p_vf(iptr, r.y);
+  return r.x;
+}
+#endif // #ifdef ENABLE_GNUABI
+
+EXPORT CONST vfloat xtanf_u1(vfloat d) {
+  vint2 q;
+  vfloat u, v;
+  vfloat2 s, t, x;
+  vopmask o;
+
+  if (vtestallones_i_vo32(vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX2f)))) {
+    u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(2 * M_1_PI)));
+    q = vrint_vi2_vf(u);
+    v = vmla_vf_vf_vf_vf(u, vcast_vf_f(-PI_A2f*0.5f), d);
+    s = dfadd2_vf2_vf_vf(v, vmul_vf_vf_vf(u, vcast_vf_f(-PI_B2f*0.5f)));
+    s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(u, vcast_vf_f(-PI_C2f*0.5f)));
+  } else {
+    vfloat2 dfq = dfmul_vf2_vf2_vf(vcast_vf2_f_f(2*M_1_PI, 2*M_1_PI - (float)(2*M_1_PI)), d);
+    vfloat t = vrint_vf_vf(vmul_vf_vf_vf(dfq.x, vcast_vf_f(1.0f / (1 << 16))));
+    dfq.y = vrint_vf_vf(vadd_vf_vf_vf(vmla_vf_vf_vf_vf(t, vcast_vf_f(-(1 << 16)), dfq.x), dfq.y));
+    q = vrint_vi2_vf(dfq.y);
+    dfq.x = vmul_vf_vf_vf(t, vcast_vf_f(1 << 16));
+    dfq = dfnormalize_vf2_vf2(dfq);
+
+    s = dfadd2_vf2_vf_vf2 (d, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_A3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_B3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_C3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_D3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+    s = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfq, vcast_vf_f(-PI_E3f*0.5f)));
+    s = dfnormalize_vf2_vf2(s);
+  }
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1));
+  vmask n = vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
+  s.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.x), n));
+  s.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(s.y), n));
+
+  t = s;
+  s = dfsqu_vf2_vf2(s);
+  s = dfnormalize_vf2_vf2(s);
+
+  u = vcast_vf_f(0.00446636462584137916564941f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(-8.3920182078145444393158e-05f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0109639242291450500488281f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0212360303848981857299805f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0540687143802642822265625f));
+
+  x = dfadd_vf2_vf_vf(vcast_vf_f(0.133325666189193725585938f), vmul_vf_vf_vf(u, s.x));
+  x = dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(0.33333361148834228515625f), dfmul_vf2_vf2_vf2(s, x)), s));
+  x = dfmul_vf2_vf2_vf2(t, x);
+
+  x = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf2(x), x);
+
+  u = vadd_vf_vf_vf(x.x, x.y);
+
+  u = vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(visinf_vo_vf(d),
+          vor_vo_vo_vo(visnegzero_vo_vf(d),
+                 vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX3f)))),
+           vcast_vf_f(-0.0f), u);
+
+  return u;
+}
+
+EXPORT CONST vfloat xatanf(vfloat d) {
+  vfloat s, t, u;
+  vint2 q;
+
+  q = vsel_vi2_vf_vi2(d, vcast_vi2_i(2));
+  s = vabs_vf_vf(d);
+
+  q = vsel_vi2_vf_vf_vi2_vi2(vcast_vf_f(1.0f), s, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
+  s = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(1.0f), s), vrec_vf_vf(s), s);
+
+  t = vmul_vf_vf_vf(s, s);
+
+  u = vcast_vf_f(0.00282363896258175373077393f);
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);
+
+  t = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(1)), vcast_vi2_i(1)), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), t), t);
+
+  t = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2)), vreinterpret_vm_vf(vcast_vf_f(-0.0f))), vreinterpret_vm_vf(t)));
+
+#ifdef ENABLE_NEON32
+  t = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(1.5874010519681994747517056f), d), t);
+#endif
+
+  return t;
+}
+
+static INLINE CONST vfloat atan2kf(vfloat y, vfloat x) {
+  vfloat s, t, u;
+  vint2 q;
+  vopmask p;
+
+  q = vsel_vi2_vf_vi2(x, vcast_vi2_i(-2));
+  x = vabs_vf_vf(x);
+
+  q = vsel_vi2_vf_vf_vi2_vi2(x, y, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
+  p = vlt_vo_vf_vf(x, y);
+  s = vsel_vf_vo_vf_vf(p, vneg_vf_vf(x), y);
+  t = vmax_vf_vf_vf(x, y);
+
+  s = vdiv_vf_vf_vf(s, t);
+  t = vmul_vf_vf_vf(s, s);
+
+  u = vcast_vf_f(0.00282363896258175373077393f);
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0159569028764963150024414f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.0425049886107444763183594f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.0748900920152664184570312f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.106347933411598205566406f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.142027363181114196777344f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(0.199926957488059997558594f));
+  u = vmla_vf_vf_vf_vf(u, t, vcast_vf_f(-0.333331018686294555664062f));
+
+  t = vmla_vf_vf_vf_vf(s, vmul_vf_vf_vf(t, u), s);
+  t = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f((float)(M_PI/2)), t);
+
+  return t;
+}
+
+static INLINE CONST vfloat visinf2_vf_vf_vf(vfloat d, vfloat m) {
+  return vreinterpret_vf_vm(vand_vm_vo32_vm(visinf_vo_vf(d), vor_vm_vm_vm(vsignbit_vm_vf(d), vreinterpret_vm_vf(m))));
+}
+
+EXPORT CONST vfloat xatan2f(vfloat y, vfloat x) {
+  vfloat r = atan2kf(vabs_vf_vf(y), x);
+
+  r = vmulsign_vf_vf_vf(r, x);
+  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0.0f))), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), x))), r);
+  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f((float)(M_PI/2)), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f((float)(M_PI/4)), x))), r);
+
+  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);
+
+  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));
+  return r;
+}
+
+EXPORT CONST vfloat xasinf(vfloat d) {
+  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
+  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f)));
+  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2)), u;
+
+  u = vcast_vf_f(+0.4197454825e-1);
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
+  u = vmla_vf_vf_vf_vf(u, vmul_vf_vf_vf(x, x2), x);
+
+  vfloat r = vsel_vf_vo_vf_vf(o, u, vmla_vf_vf_vf_vf(u, vcast_vf_f(-2), vcast_vf_f(M_PIf/2)));
+  return vmulsign_vf_vf_vf(r, d);
+}
+
+EXPORT CONST vfloat xacosf(vfloat d) {
+  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
+  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d),
+				vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
+  vfloat x = vsel_vf_vo_vf_vf(o, vabs_vf_vf(d), vsqrt_vf_vf(x2));
+  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf_f(0), x);
+
+  u = vcast_vf_f(+0.4197454825e-1);
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
+  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, x));
+
+  vfloat y = vsub_vf_vf_vf(vcast_vf_f(3.1415926535897932f/2), vadd_vf_vf_vf(vmulsign_vf_vf_vf(x, d), vmulsign_vf_vf_vf(u, d)));
+  x = vadd_vf_vf_vf(x, u);
+  vfloat r = vsel_vf_vo_vf_vf(o, y, vmul_vf_vf_vf(x, vcast_vf_f(2)));
+  return vsel_vf_vo_vf_vf(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))),
+			  dfadd_vf2_vf2_vf(vcast_vf2_f_f(3.1415927410125732422f,-8.7422776573475857731e-08f),
+					   vneg_vf_vf(r)).x, r);
+}
+
+//
+
+static INLINE CONST vfloat2 atan2kf_u1(vfloat2 y, vfloat2 x) {
+  vfloat u;
+  vfloat2 s, t;
+  vint2 q;
+  vopmask p;
+  vmask r;
+
+  q = vsel_vi2_vf_vf_vi2_vi2(x.x, vcast_vf_f(0), vcast_vi2_i(-2), vcast_vi2_i(0));
+  p = vlt_vo_vf_vf(x.x, vcast_vf_f(0));
+  r = vand_vm_vo32_vm(p, vreinterpret_vm_vf(vcast_vf_f(-0.0)));
+  x.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x.x), r));
+  x.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vreinterpret_vm_vf(x.y), r));
+
+  q = vsel_vi2_vf_vf_vi2_vi2(x.x, y.x, vadd_vi2_vi2_vi2(q, vcast_vi2_i(1)), q);
+  p = vlt_vo_vf_vf(x.x, y.x);
+  s = vsel_vf2_vo_vf2_vf2(p, dfneg_vf2_vf2(x), y);
+  t = vsel_vf2_vo_vf2_vf2(p, y, x);
+
+  s = dfdiv_vf2_vf2_vf2(s, t);
+  t = dfsqu_vf2_vf2(s);
+  t = dfnormalize_vf2_vf2(t);
+
+  u = vcast_vf_f(-0.00176397908944636583328247f);
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.0107900900766253471374512f));
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.0309564601629972457885742f));
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.0577365085482597351074219f));
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.0838950723409652709960938f));
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.109463557600975036621094f));
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(-0.142626821994781494140625f));
+  u = vmla_vf_vf_vf_vf(u, t.x, vcast_vf_f(0.199983194470405578613281f));
+
+  t = dfmul_vf2_vf2_vf2(t, dfadd_vf2_vf_vf(vcast_vf_f(-0.333332866430282592773438f), vmul_vf_vf_vf(u, t.x)));
+  t = dfmul_vf2_vf2_vf2(s, dfadd_vf2_vf_vf2(vcast_vf_f(1), t));
+  t = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(vcast_vf2_f_f(1.5707963705062866211f, -4.3711388286737928865e-08f), vcast_vf_vi2(q)), t);
+
+  return t;
+}
+
+EXPORT CONST vfloat xatan2f_u1(vfloat y, vfloat x) {
+  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(2.9387372783541830947e-39f)); // nexttowardf((1.0 / FLT_MAX), 1)
+  x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1 << 24)), x);
+  y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1 << 24)), y);
+
+  vfloat2 d = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(y), vcast_vf_f(0)), vcast_vf2_vf_vf(x, vcast_vf_f(0)));
+  vfloat r = vadd_vf_vf_vf(d.x, d.y);
+
+  r = vmulsign_vf_vf_vf(r, x);
+  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/2), x))), r);
+  r = vsel_vf_vo_vf_vf(visinf_vo_vf(y), vsub_vf_vf_vf(vcast_vf_f(M_PI/2), visinf2_vf_vf_vf(x, vmulsign_vf_vf_vf(vcast_vf_f(M_PI/4), x))), r);
+  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(0.0f)), vreinterpret_vf_vm(vand_vm_vo32_vm(vsignbit_vo_vf(x), vreinterpret_vm_vf(vcast_vf_f((float)M_PI)))), r);
+
+  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(vmulsign_vf_vf_vf(r, y))));
+  return r;
+}
+
+EXPORT CONST vfloat xasinf_u1(vfloat d) {
+  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
+  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
+  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));
+  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);
+
+  u = vcast_vf_f(+0.4197454825e-1);
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
+  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, x.x));
+
+  vfloat2 y = dfsub_vf2_vf2_vf(dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), u);
+
+  vfloat r = vsel_vf_vo_vf_vf(o, vadd_vf_vf_vf(u, x.x),
+             vmul_vf_vf_vf(vadd_vf_vf_vf(y.x, y.y), vcast_vf_f(2)));
+  return vmulsign_vf_vf_vf(r, d);
+}
+
+EXPORT CONST vfloat xacosf_u1(vfloat d) {
+  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(0.5f));
+  vfloat x2 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, d), vmul_vf_vf_vf(vsub_vf_vf_vf(vcast_vf_f(1), vabs_vf_vf(d)), vcast_vf_f(0.5f))), u;
+  vfloat2 x = vsel_vf2_vo_vf2_vf2(o, vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), dfsqrt_vf2_vf(x2));
+  x = vsel_vf2_vo_vf2_vf2(veq_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1.0f)), vcast_vf2_f_f(0, 0), x);
+
+  u = vcast_vf_f(+0.4197454825e-1);
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.2424046025e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.4547423869e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.7495029271e-1));
+  u = vmla_vf_vf_vf_vf(u, x2, vcast_vf_f(+0.1666677296e+0));
+  u = vmul_vf_vf_vf(u, vmul_vf_vf_vf(x2, x.x));
+
+  vfloat2 y = dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f/2, -8.7422776573475857731e-08f/2),
+                                 dfadd_vf2_vf_vf(vmulsign_vf_vf_vf(x.x, d), vmulsign_vf_vf_vf(u, d)));
+  x = dfadd_vf2_vf2_vf(x, u);
+
+  y = vsel_vf2_vo_vf2_vf2(o, y, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
+
+  y = vsel_vf2_vo_vf2_vf2(vandnot_vo_vo_vo(o, vlt_vo_vf_vf(d, vcast_vf_f(0))),
+                          dfsub_vf2_vf2_vf2(vcast_vf2_f_f(3.1415927410125732422f, -8.7422776573475857731e-08f), y), y);
+
+  return vadd_vf_vf_vf(y.x, y.y);
+}
+
+EXPORT CONST vfloat xatanf_u1(vfloat d) {
+  vfloat2 d2 = atan2kf_u1(vcast_vf2_vf_vf(vabs_vf_vf(d), vcast_vf_f(0)), vcast_vf2_f_f(1, 0));
+  vfloat r = vadd_vf_vf_vf(d2.x, d2.y);
+  r = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(1.570796326794896557998982), r);
+  return vmulsign_vf_vf_vf(r, d);
+}
+
+//
+
+EXPORT CONST vfloat xlogf(vfloat d) {
+  vfloat x, x2, t, m;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(1LL << 32) * (float)(1LL << 32))), d);
+  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
+  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
+  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
+#else
+  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
+  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
+  m = vgetmant_vf_vf(d);
+#endif
+
+  x = vdiv_vf_vf_vf(vadd_vf_vf_vf(vcast_vf_f(-1.0f), m), vadd_vf_vf_vf(vcast_vf_f(1.0f), m));
+  x2 = vmul_vf_vf_vf(x, x);
+
+  t = vcast_vf_f(0.2392828464508056640625f);
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.28518211841583251953125f));
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.400005877017974853515625f));
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(0.666666686534881591796875f));
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(2.0f));
+
+#ifndef ENABLE_AVX512F
+  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), vcast_vf_vi2(e)));
+  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(INFINITYf), x);
+  x = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(NANf), x);
+  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), x);
+#else
+  x = vmla_vf_vf_vf_vf(x, t, vmul_vf_vf_vf(vcast_vf_f(0.693147180559945286226764f), e));
+  x = vfixup_vf_vf_vf_vi2_i(x, d, vcast_vi2_i((5 << (5*4))), 0);
+#endif
+
+  return x;
+}
+
+EXPORT CONST vfloat xexpf(vfloat d) {
+  vint2 q = vrint_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(R_LN2f)));
+  vfloat s, u;
+
+  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf), d);
+  s = vmla_vf_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf), s);
+
+  u = vcast_vf_f(0.000198527617612853646278381);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00139304355252534151077271));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.00833336077630519866943359));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.0416664853692054748535156));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.166666671633720397949219));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(0.5));
+
+  u = vadd_vf_vf_vf(vcast_vf_f(1.0f), vmla_vf_vf_vf_vf(vmul_vf_vf_vf(s, s), u, s));
+
+  u = vldexp2_vf_vf_vi2(u, q);
+
+  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));
+  u = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(vcast_vf_f(100), d), vcast_vf_f(INFINITYf), u);
+
+  return u;
+}
+
+#ifdef ENABLE_NEON32
+EXPORT CONST vfloat xsqrtf_u35(vfloat d) {
+  vfloat e = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x20000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x7f000000), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(d), 1))));
+  vfloat m = vreinterpret_vf_vi2(vadd_vi2_vi2_vi2(vcast_vi2_i(0x3f000000), vand_vi2_vi2_vi2(vcast_vi2_i(0x01ffffff), vreinterpret_vi2_vf(d))));
+  float32x4_t x = vrsqrteq_f32(m);
+  x = vmulq_f32(x, vrsqrtsq_f32(m, vmulq_f32(x, x)));
+  float32x4_t u = vmulq_f32(x, m);
+  u = vmlaq_f32(u, vmlsq_f32(m, u, u), vmulq_f32(x, vdupq_n_f32(0.5)));
+  e = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vm_vf(e)));
+  u = vmul_vf_vf_vf(e, u);
+
+  u = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vcast_vf_f(INFINITYf), u);
+  u = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(d), vlt_vo_vf_vf(d, vcast_vf_f(0))), vreinterpret_vm_vf(u)));
+  u = vmulsign_vf_vf_vf(u, d);
+
+  return u;
+}
+#elif defined(ENABLE_VECEXT)
+EXPORT CONST vfloat xsqrtf_u35(vfloat d) {
+  vfloat q = vsqrt_vf_vf(d);
+  q = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), q);
+  return vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(INFINITYf), q);
+}
+#else
+EXPORT CONST vfloat xsqrtf_u35(vfloat d) { return vsqrt_vf_vf(d); }
+#endif
+
+EXPORT CONST vfloat xcbrtf(vfloat d) {
+  vfloat x, y, q = vcast_vf_f(1.0), t;
+  vint2 e, qu, re;
+
+#ifdef ENABLE_AVX512F
+  vfloat s = d;
+#endif
+  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
+  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));
+
+  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));
+  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0f/3.0f)));
+  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));
+
+  q = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf_f(1.2599210498948731647672106f), q);
+  q = vsel_vf_vo_vf_vf(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf_f(1.5874010519681994747517056f), q);
+  q = vldexp2_vf_vf_vi2(q, vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));
+
+  q = vmulsign_vf_vf_vf(q, d);
+  d = vabs_vf_vf(d);
+
+  x = vcast_vf_f(-0.601564466953277587890625f);
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));
+
+  y = vmul_vf_vf_vf(vmul_vf_vf_vf(d, x), x);
+  y = vmul_vf_vf_vf(vsub_vf_vf_vf(y, vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(2.0f / 3.0f), y), vmla_vf_vf_vf_vf(y, x, vcast_vf_f(-1.0f)))), q);
+
+#ifdef ENABLE_AVX512F
+  y = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), s), y);
+  y = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), y);
+#endif
+
+  y = vsel_vf_vo_vf_vf(visnan_vo_vf(d), d, y);
+
+  return y;
+}
+
+EXPORT CONST vfloat xcbrtf_u1(vfloat d) {
+  vfloat x, y, z, t;
+  vfloat2 q2 = vcast_vf2_f_f(1, 0), u, v;
+  vint2 e, qu, re;
+
+#ifdef ENABLE_AVX512F
+  vfloat s = d;
+#endif
+  e = vadd_vi2_vi2_vi2(vilogbk_vi2_vf(vabs_vf_vf(d)), vcast_vi2_i(1));
+  d = vldexp2_vf_vf_vi2(d, vneg_vi2_vi2(e));
+
+  t = vadd_vf_vf_vf(vcast_vf_vi2(e), vcast_vf_f(6144));
+  qu = vtruncate_vi2_vf(vmul_vf_vf_vf(t, vcast_vf_f(1.0/3.0)));
+  re = vtruncate_vi2_vf(vsub_vf_vf_vf(t, vmul_vf_vf_vf(vcast_vf_vi2(qu), vcast_vf_f(3))));
+
+  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(1)), vcast_vf2_f_f(1.2599210739135742188f, -2.4018701694217270415e-08), q2);
+  q2 = vsel_vf2_vo_vf2_vf2(veq_vo_vi2_vi2(re, vcast_vi2_i(2)), vcast_vf2_f_f(1.5874010324478149414f,  1.9520385308169352356e-08), q2);
+
+  q2.x = vmulsign_vf_vf_vf(q2.x, d); q2.y = vmulsign_vf_vf_vf(q2.y, d);
+  d = vabs_vf_vf(d);
+
+  x = vcast_vf_f(-0.601564466953277587890625f);
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.8208892345428466796875f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-5.532182216644287109375f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(5.898262500762939453125f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(-3.8095417022705078125f));
+  x = vmla_vf_vf_vf_vf(x, d, vcast_vf_f(2.2241256237030029296875f));
+
+  y = vmul_vf_vf_vf(x, x); y = vmul_vf_vf_vf(y, y); x = vsub_vf_vf_vf(x, vmul_vf_vf_vf(vmlanp_vf_vf_vf_vf(d, y, x), vcast_vf_f(-1.0 / 3.0)));
+
+  z = x;
+
+  u = dfmul_vf2_vf_vf(x, x);
+  u = dfmul_vf2_vf2_vf2(u, u);
+  u = dfmul_vf2_vf2_vf(u, d);
+  u = dfadd2_vf2_vf2_vf(u, vneg_vf_vf(x));
+  y = vadd_vf_vf_vf(u.x, u.y);
+
+  y = vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(-2.0 / 3.0), y), z);
+  v = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(z, z), y);
+  v = dfmul_vf2_vf2_vf(v, d);
+  v = dfmul_vf2_vf2_vf2(v, q2);
+  z = vldexp2_vf_vf_vi2(vadd_vf_vf_vf(v.x, v.y), vsub_vi2_vi2_vi2(qu, vcast_vi2_i(2048)));
+
+  z = vsel_vf_vo_vf_vf(visinf_vo_vf(d), vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), q2.x), z);
+  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vreinterpret_vf_vm(vsignbit_vm_vf(q2.x)), z);
+
+#ifdef ENABLE_AVX512F
+  z = vsel_vf_vo_vf_vf(visinf_vo_vf(s), vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), s), z);
+  z = vsel_vf_vo_vf_vf(veq_vo_vf_vf(s, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), s), z);
+#endif
+
+  return z;
+}
+
+static INLINE CONST vfloat2 logkf(vfloat d) {
+  vfloat2 x, x2;
+  vfloat t, m;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(1LL << 32) * (float)(1LL << 32))), d);
+  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
+  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
+  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
+#else
+  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
+  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
+  m = vgetmant_vf_vf(d);
+#endif
+
+  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
+  x2 = dfsqu_vf2_vf2(x);
+
+  t = vcast_vf_f(0.240320354700088500976562);
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.285112679004669189453125));
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400007992982864379882812));
+  vfloat2 c = vcast_vf2_f_f(0.66666662693023681640625f, 3.69183861259614332084311e-09f);
+
+#ifndef ENABLE_AVX512F
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
+#else
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
+#endif
+
+  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
+  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(x2, x),
+                                            dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf(x2, t), c)));
+  return s;
+}
+
+EXPORT CONST vfloat xlogf_u1(vfloat d) {
+  vfloat2 x;
+  vfloat t, m, x2;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(1LL << 32) * (float)(1LL << 32))), d);
+  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
+  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
+  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
+#else
+  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0f/0.75f)));
+  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
+  m = vgetmant_vf_vf(d);
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
+#endif
+
+  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
+  x2 = vmul_vf_vf_vf(x.x, x.x);
+
+  t = vcast_vf_f(+0.3027294874e+0f);
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));
+
+  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
+  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, x.x), t));
+
+  vfloat r = vadd_vf_vf_vf(s.x, s.y);
+
+#ifndef ENABLE_AVX512F
+  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(INFINITYf), r);
+  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(NANf), r);
+  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITYf), r);
+#else
+  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
+#endif
+
+  return r;
+}
+
+static INLINE CONST vfloat expkf(vfloat2 d) {
+  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(R_LN2f));
+  vint2 q = vrint_vi2_vf(u);
+  vfloat2 s, t;
+
+  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));
+
+  s = dfnormalize_vf2_vf2(s);
+
+  u = vcast_vf_f(0.00136324646882712841033936f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.00836596917361021041870117f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.0416710823774337768554688f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.166665524244308471679688f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(0.499999850988388061523438f));
+
+  t = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfsqu_vf2_vf2(s), u));
+
+  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
+  u = vadd_vf_vf_vf(t.x, t.y);
+  u = vldexp_vf_vf_vi2(u, q);
+
+  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d.x, vcast_vf_f(-104)), vreinterpret_vm_vf(u)));
+
+  return u;
+}
+
+EXPORT CONST vfloat xpowf(vfloat x, vfloat y) {
+#if 1
+  vopmask yisint = vor_vo_vo_vo(veq_vo_vf_vf(vtruncate_vf_vf(y), y), vgt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
+  vopmask yisodd = vand_vo_vo_vo(vand_vo_vo_vo(veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vtruncate_vi2_vf(y), vcast_vi2_i(1)), vcast_vi2_i(1)), yisint),
+         vlt_vo_vf_vf(vabs_vf_vf(y), vcast_vf_f(1 << 24)));
+
+#ifdef ENABLE_NEON32
+  yisodd = vandnot_vm_vo32_vm(visinf_vo_vf(y), yisodd);
+#endif
+
+  vfloat result = expkf(dfmul_vf2_vf2_vf(logkf(vabs_vf_vf(x)), y));
+
+  result = vsel_vf_vo_vf_vf(visnan_vo_vf(result), vcast_vf_f(INFINITYf), result);
+
+  result = vmul_vf_vf_vf(result,
+       vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, vcast_vf_f(0)),
+            vcast_vf_f(1),
+            vsel_vf_vo_vf_vf(yisint, vsel_vf_vo_vf_vf(yisodd, vcast_vf_f(-1.0f), vcast_vf_f(1)), vcast_vf_f(NANf))));
+
+  vfloat efx = vmulsign_vf_vf_vf(vsub_vf_vf_vf(vabs_vf_vf(x), vcast_vf_f(1)), y);
+
+  result = vsel_vf_vo_vf_vf(visinf_vo_vf(y),
+          vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(efx, vcast_vf_f(0.0f)),
+                  vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(efx, vcast_vf_f(0.0f)),
+                              vcast_vf_f(1.0f),
+                              vcast_vf_f(INFINITYf))))),
+          result);
+
+  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), veq_vo_vf_vf(x, vcast_vf_f(0))),
+          vmul_vf_vf_vf(vsel_vf_vo_vf_vf(yisodd, vsign_vf_vf(x), vcast_vf_f(1)),
+            vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vneg_vf_vf(y), y), vcast_vf_f(0)),
+                    vreinterpret_vm_vf(vcast_vf_f(INFINITYf))))),
+          result);
+
+  result = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(result)));
+
+  result = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(y, vcast_vf_f(0)), veq_vo_vf_vf(x, vcast_vf_f(1))), vcast_vf_f(1), result);
+
+  return result;
+#else
+  return expkf(dfmul_vf2_vf2_vf(logkf(x), y));
+#endif
+}
+
+EXPORT CONST vfloat xpownf(vfloat x, vmask ym) {
+    vint2 y = vcast_vi2_vm(ym);
+    vfloat res = xpowf(x, vcast_vf_vi2(y));
+
+    vint2 is_odd = vand_vi2_vi2_vi2(y, vcast_vi2_i(1));
+    vopmask is_odd_o = vgt_vo_vi2_vi2(is_odd, vcast_vi2_i(0));
+
+    // pown ( -x, odd y) == -res
+    vfloat neg = vcopysign_vf_vf_vf(res, vcast_vf_f(-0.0f));
+
+    res = vsel_vf_vo_vf_vf(
+              vand_vo_vo_vo(
+                vlt_vo_vf_vf(x, vcast_vf_f(0.0f)),
+                is_odd_o),
+              neg,
+              res);
+
+    //pown ( ±0, n ) is ±∞ for odd n < 0.
+    //pown ( ±0, n ) is +∞ for even n < 0.
+    //pown ( ±0, n ) is +0 for even n > 0.
+    //pown ( ±0, n ) is ±0 for odd n > 0.
+
+    vfloat xiszero = vsel_vf_vo_vf_vf(
+                  vgt_vo_vi2_vi2(y, vcast_vi2_i(0)),
+                  vcast_vf_f(0.0f),
+                  vcast_vf_f(INFINITYf));
+
+    vfloat with_sig = vcopysign_vf_vf_vf(xiszero, x);
+
+    xiszero = vsel_vf_vo_vf_vf(is_odd_o, with_sig, xiszero);
+
+    res = vsel_vf_vo_vf_vf(
+              veq_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(0.0f)),
+              xiszero,
+              res);
+
+    // pown ( x, 0 ) is 1 for any x
+    res = vsel_vf_vo_vf_vf(
+              veq_vo_vi2_vi2(y, vcast_vi2_i(0)),
+              vcast_vf_f(1.0f),
+              res);
+
+    return res;
+}
+
+EXPORT CONST vfloat xpowrf(vfloat x, vfloat y) {
+    vfloat res = xpowf(x, y);
+
+    vfloat ax = vabs_vf_vf(x);
+    vfloat ay = vabs_vf_vf(y);
+    vfloat zeroes = vcast_vf_f(0.0f);
+
+    //powr ( ±0, y ) is +0 for y > 0.
+    //powr ( ±0, y ) is +∞ for finite y < 0.
+    //powr ( ±0, -∞) is +∞.
+    vfloat r_Xzero = vsel_vf_vo_vf_vf(
+                       vlt_vo_vf_vf(y, zeroes),
+                       vcast_vf_f(INFINITYf),
+                       zeroes);
+    r_Xzero = vsel_vf_vo_vf_vf(
+                veq_vo_vf_vf(y, vcast_vf_f(-INFINITYf)),
+                vcast_vf_f(INFINITYf),
+                r_Xzero);
+
+    res = vsel_vf_vo_vf_vf(
+            veq_vo_vf_vf(ax, zeroes),
+            r_Xzero,
+            res);
+
+    //powr ( ±0, ±0 ) returns NaN.
+    vfloat r_Yzero = vsel_vf_vo_vf_vf(
+                        veq_vo_vf_vf(ax, zeroes),
+                        vcast_vf_f(NANf),
+                        zeroes);
+    //powr ( x, ±0 ) is 1 for finite x > 0.
+    r_Yzero = vsel_vf_vo_vf_vf(
+                vgt_vo_vf_vf(x, zeroes),
+                vcast_vf_f(1.0f),
+                r_Yzero);
+
+    //powr ( +∞, ±0 ) returns NaN.
+    r_Yzero = vsel_vf_vo_vf_vf(
+                veq_vo_vf_vf(x, vcast_vf_f(INFINITYf)),
+                vcast_vf_f(NANf),
+                r_Yzero);
+
+    res = vsel_vf_vo_vf_vf(
+            veq_vo_vf_vf(ay, zeroes),
+            r_Yzero,
+            res);
+
+    //powr ( +1, y ) is 1 for finite y.
+    //powr ( +1, ±∞ ) returns NaN.
+    vfloat r_Xone = vsel_vf_vo_vf_vf(
+                      veq_vo_vf_vf(ay, vcast_vf_f(INFINITYf)),
+                      vcast_vf_f(NANf),
+                      vcast_vf_f(1.0f));
+
+    res = vsel_vf_vo_vf_vf(
+            veq_vo_vf_vf(x, vcast_vf_f(1.0f)),
+            r_Xone,
+            res);
+
+    //powr ( x, y ) returns NaN for x < 0.
+    res = vsel_vf_vo_vf_vf(
+            vlt_vo_vf_vf(x, zeroes),
+            vcast_vf_f(NANf),
+            res);
+
+    //powr ( NaN, y ) returns the NaN
+    res = vsel_vf_vo_vf_vf(
+            visnan_vo_vf(x),
+            x,
+            res);
+
+    //powr ( x, NaN ) returns the NaN for x >= 0.
+    res = vsel_vf_vo_vf_vf(
+            visnan_vo_vf(y),
+            y,
+            res);
+    return res;
+
+}
+
+
+static INLINE CONST vfloat2 expk2f(vfloat2 d) {
+  vfloat u = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(R_LN2f));
+  vint2 q = vrint_vi2_vf(u);
+  vfloat2 s, t;
+
+  s = dfadd2_vf2_vf2_vf(d, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Uf)));
+  s = dfadd2_vf2_vf2_vf(s, vmul_vf_vf_vf(vcast_vf_vi2(q), vcast_vf_f(-L2Lf)));
+
+  u = vcast_vf_f(+0.1980960224e-3f);
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(+0.1394256484e-2f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(+0.8333456703e-2f));
+  u = vmla_vf_vf_vf_vf(u, s.x, vcast_vf_f(+0.4166637361e-1f));
+
+  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(s, u), vcast_vf_f(+0.166666659414234244790680580464e+0f));
+  t = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(s, t), vcast_vf_f(0.5));
+  t = dfadd2_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(dfsqu_vf2_vf2(s), t));
+
+  t = dfadd_vf2_vf_vf2(vcast_vf_f(1), t);
+
+  t.x = vldexp2_vf_vf_vi2(t.x, q);
+  t.y = vldexp2_vf_vf_vi2(t.y, q);
+
+  t.x = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d.x, vcast_vf_f(-104)), vreinterpret_vm_vf(t.x)));
+  t.y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d.x, vcast_vf_f(-104)), vreinterpret_vm_vf(t.y)));
+
+  return t;
+}
+
+EXPORT CONST vfloat xsinhf(vfloat x) {
+  vfloat y = vabs_vf_vf(x);
+  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
+  d = dfsub_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));
+  y = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5));
+
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)),
+            visnan_vo_vf(y)), vcast_vf_f(INFINITYf), y);
+  y = vmulsign_vf_vf_vf(y, x);
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
+
+  return y;
+}
+
+EXPORT CONST vfloat xcoshf(vfloat x) {
+  vfloat y = vabs_vf_vf(x);
+  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
+  d = dfadd_vf2_vf2_vf2(d, dfrec_vf2_vf2(d));
+  y = vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5));
+
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(89)),
+            visnan_vo_vf(y)), vcast_vf_f(INFINITYf), y);
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
+
+  return y;
+}
+
+EXPORT CONST vfloat xtanhf(vfloat x) {
+  vfloat y = vabs_vf_vf(x);
+  vfloat2 d = expk2f(vcast_vf2_vf_vf(y, vcast_vf_f(0)));
+  vfloat2 e = dfrec_vf2_vf2(d);
+  d = dfdiv_vf2_vf2_vf2(dfadd_vf2_vf2_vf2(d, dfneg_vf2_vf2(e)), dfadd_vf2_vf2_vf2(d, e));
+  y = vadd_vf_vf_vf(d.x, d.y);
+
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(8.664339742f)),
+            visnan_vo_vf(y)), vcast_vf_f(1.0f), y);
+  y = vmulsign_vf_vf_vf(y, x);
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
+
+  return y;
+}
+
+static INLINE CONST vfloat2 logk2f(vfloat2 d) {
+  vfloat2 x, x2, m, s;
+  vfloat t;
+  vint2 e;
+
+#ifndef ENABLE_AVX512F
+  e = vilogbk_vi2_vf(vmul_vf_vf_vf(d.x, vcast_vf_f(1.0f/0.75f)));
+#else
+  e = vrint_vi2_vf(vgetexp_vf_vf(vmul_vf_vf_vf(d.x, vcast_vf_f(1.0f/0.75f))));
+#endif
+  m = dfscale_vf2_vf2_vf(d, vpow2i_vf_vi2(vneg_vi2_vi2(e)));
+
+  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(m, vcast_vf_f(-1)), dfadd2_vf2_vf2_vf(m, vcast_vf_f(1)));
+  x2 = dfsqu_vf2_vf2(x);
+
+  t = vcast_vf_f(0.2392828464508056640625f);
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.28518211841583251953125f));
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.400005877017974853515625f));
+  t = vmla_vf_vf_vf_vf(t, x2.x, vcast_vf_f(0.666666686534881591796875f));
+
+  s = dfmul_vf2_vf2_vf(vcast_vf2_vf_vf(vcast_vf_f(0.69314718246459960938f), vcast_vf_f(-1.904654323148236017e-09f)), vcast_vf_vi2(e));
+  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
+  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf(dfmul_vf2_vf2_vf2(x2, x), t));
+
+  return s;
+}
+
+EXPORT CONST vfloat xasinhf(vfloat x) {
+  vfloat y = vabs_vf_vf(x);
+  vopmask o = vgt_vo_vf_vf(y, vcast_vf_f(1));
+  vfloat2 d;
+
+  d = vsel_vf2_vo_vf2_vf2(o, dfrec_vf2_vf(x), vcast_vf2_vf_vf(y, vcast_vf_f(0)));
+  d = dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(d), vcast_vf_f(1)));
+  d = vsel_vf2_vo_vf2_vf2(o, dfmul_vf2_vf2_vf(d, y), d);
+
+  d = logk2f(dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(d, x)));
+  y = vadd_vf_vf_vf(d.x, d.y);
+
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)),
+            visnan_vo_vf(y)),
+           vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), x), y);
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
+  y = vsel_vf_vo_vf_vf(visnegzero_vo_vf(x), vcast_vf_f(-0.0), y);
+
+  return y;
+}
+
+EXPORT CONST vfloat xacoshf(vfloat x) {
+  vfloat2 d = logk2f(dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf2(dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(1))), dfsqrt_vf2_vf2(dfadd2_vf2_vf_vf(x, vcast_vf_f(-1)))), x));
+  vfloat y = vadd_vf_vf_vf(d.x, d.y);
+
+  y = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vgt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(SQRT_FLT_MAX)),
+            visnan_vo_vf(y)),
+           vcast_vf_f(INFINITYf), y);
+
+  y = vreinterpret_vf_vm(vandnot_vm_vo32_vm(veq_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));
+
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vlt_vo_vf_vf(x, vcast_vf_f(1.0f)), vreinterpret_vm_vf(y)));
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
+
+  return y;
+}
+
+EXPORT CONST vfloat xatanhf(vfloat x) {
+  vfloat y = vabs_vf_vf(x);
+  vfloat2 d = logk2f(dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(1), y), dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(y))));
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(y, vcast_vf_f(1.0)), vreinterpret_vm_vf(vsel_vf_vo_vf_vf(veq_vo_vf_vf(y, vcast_vf_f(1.0)), vcast_vf_f(INFINITYf), vmul_vf_vf_vf(vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(0.5))))));
+
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(y)), vreinterpret_vm_vf(y)));
+  y = vmulsign_vf_vf_vf(y, x);
+  y = vreinterpret_vf_vm(vor_vm_vo32_vm(visnan_vo_vf(x), vreinterpret_vm_vf(y)));
+
+  return y;
+}
+
+EXPORT CONST vfloat xexp2f(vfloat d) {
+  vfloat u = vrint_vf_vf(d), s;
+  vint2 q = vrint_vi2_vf(u);
+
+  s = vsub_vf_vf_vf(d, u);
+
+  u = vcast_vf_f(+0.1535920892e-3);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1339262701e-2));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.9618384764e-2));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5550347269e-1));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2402264476e+0));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.6931471825e+0));
+
+#ifdef ENABLE_FMA_SP
+  u = vfma_vf_vf_vf_vf(u, s, vcast_vf_f(1));
+#else
+  u = dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(u, s))).x;
+#endif
+  
+  u = vldexp2_vf_vf_vi2(u, q);
+
+  u = vsel_vf_vo_vf_vf(vge_vo_vf_vf(d, vcast_vf_f(128)), vcast_vf_f(INFINITY), u);
+  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-150)), vreinterpret_vm_vf(u)));
+
+  return u;
+}
+
+EXPORT CONST vfloat xexp10f(vfloat d) {
+  vfloat u = vrint_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(LOG10_2))), s;
+  vint2 q = vrint_vi2_vf(u);
+
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Uf), d);
+  s = vmla_vf_vf_vf_vf(u, vcast_vf_f(-L10Lf), s);
+
+  u = vcast_vf_f(+0.2064004987e+0);
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.5417877436e+0));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.1171286821e+1));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2034656048e+1));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2650948763e+1));
+  u = vmla_vf_vf_vf_vf(u, s, vcast_vf_f(+0.2302585125e+1));
+
+#ifdef ENABLE_FMA_SP
+  u = vfma_vf_vf_vf_vf(u, s, vcast_vf_f(1));
+#else
+  u = dfnormalize_vf2_vf2(dfadd_vf2_vf_vf2(vcast_vf_f(1), dfmul_vf2_vf_vf(u, s))).x;
+#endif
+  
+  u = vldexp2_vf_vf_vi2(u, q);
+
+  u = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(38.5318394191036238941387f)), vcast_vf_f(INFINITYf), u);
+  u = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vlt_vo_vf_vf(d, vcast_vf_f(-50)), vreinterpret_vm_vf(u)));
+
+  return u;
+}
+
+EXPORT CONST vfloat xexpm1f(vfloat a) {
+  vfloat2 d = dfadd2_vf2_vf2_vf(expk2f(vcast_vf2_vf_vf(a, vcast_vf_f(0))), vcast_vf_f(-1.0));
+  vfloat x = vadd_vf_vf_vf(d.x, d.y);
+  x = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(a, vcast_vf_f(88.72283172607421875f)), vcast_vf_f(INFINITYf), x);
+  x = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(a, vcast_vf_f(-16.635532333438687426013570f)), vcast_vf_f(-1), x);
+  x = vsel_vf_vo_vf_vf(visnegzero_vo_vf(a), vcast_vf_f(-0.0f), x);
+  return x;
+}
+
+EXPORT CONST vfloat xlog10f(vfloat d) {
+  vfloat2 x;
+  vfloat t, m, x2;
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vf_vf(d, vcast_vf_f(FLT_MIN));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f((float)(1LL << 32) * (float)(1LL << 32))), d);
+  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
+  m = vldexp3_vf_vf_vi2(d, vneg_vi2_vi2(e));
+  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
+#else
+  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(d, vcast_vf_f(1.0/0.75)));
+  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
+  m = vgetmant_vf_vf(d);
+#endif
+
+  x = dfdiv_vf2_vf2_vf2(dfadd2_vf2_vf_vf(vcast_vf_f(-1), m), dfadd2_vf2_vf_vf(vcast_vf_f(1), m));
+  x2 = vmul_vf_vf_vf(x.x, x.x);
+
+  t = vcast_vf_f(+0.1314289868e+0);
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f( +0.1735493541e+0));
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f( +0.2895309627e+0));
+  
+#ifndef ENABLE_AVX512F
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.30103001, -1.432098889e-08), vcast_vf_vi2(e));
+#else
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.30103001, -1.432098889e-08), e);
+#endif
+
+  s = dfadd_vf2_vf2_vf2(s, dfmul_vf2_vf2_vf2(x, vcast_vf2_f_f(0.868588984, -2.170757285e-08)));
+  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, x.x), t));
+
+  vfloat r = vadd_vf_vf_vf(s.x, s.y);
+
+#ifndef ENABLE_AVX512F
+  r = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(INFINITY), r);
+  r = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(d, vcast_vf_f(0)), visnan_vo_vf(d)), vcast_vf_f(NAN), r);
+  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(-INFINITY), r);
+#else
+  r = vfixup_vf_vf_vf_vi2_i(r, d, vcast_vi2_i((4 << (2*4)) | (3 << (4*4)) | (5 << (5*4)) | (2 << (6*4))), 0);
+#endif
+  
+  return r;
+}
+
+EXPORT CONST vfloat xlog1pf_fast(vfloat d) {
+  vfloat2 x;
+  vfloat t, m, x2;
+
+  vfloat dp1 = vadd_vf_vf_vf(d, vcast_vf_f(1));
+
+#ifndef ENABLE_AVX512F
+  vopmask o = vlt_vo_vf_vf(dp1, vcast_vf_f(FLT_MIN));
+  dp1 = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(dp1, vcast_vf_f((float)(1LL << 32) * (float)(1LL << 32))), dp1);
+  vint2 e = vilogb2k_vi2_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));
+  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(e));
+  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));
+  e = vsel_vi2_vo_vi2_vi2(o, vsub_vi2_vi2_vi2(e, vcast_vi2_i(64)), e);
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), vcast_vf_vi2(e));
+#else
+  vfloat e = vgetexp_vf_vf(vmul_vf_vf_vf(dp1, vcast_vf_f(1.0f/0.75f)));
+  e = vsel_vf_vo_vf_vf(vispinf_vo_vf(e), vcast_vf_f(128.0f), e);
+  t = vldexp3_vf_vf_vi2(vcast_vf_f(1), vneg_vi2_vi2(vrint_vi2_vf(e)));
+  m = vmla_vf_vf_vf_vf(d, t, vsub_vf_vf_vf(t, vcast_vf_f(1)));
+  vfloat2 s = dfmul_vf2_vf2_vf(vcast_vf2_f_f(0.69314718246459960938f, -1.904654323148236017e-09f), e);
+#endif
+
+  x = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(m, vcast_vf_f(0)), dfadd_vf2_vf_vf(vcast_vf_f(2), m));
+  x2 = vmul_vf_vf_vf(x.x, x.x);
+
+  t = vcast_vf_f(+0.3027294874e+0f);
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.3996108174e+0f));
+  t = vmla_vf_vf_vf_vf(t, x2, vcast_vf_f(+0.6666694880e+0f));
+
+  s = dfadd_vf2_vf2_vf2(s, dfscale_vf2_vf2_vf(x, vcast_vf_f(2)));
+  s = dfadd_vf2_vf2_vf(s, vmul_vf_vf_vf(vmul_vf_vf_vf(x2, x.x), t));
+
+  vfloat r = vadd_vf_vf_vf(s.x, s.y);
+
+  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(d, vcast_vf_f(1e+38)), vcast_vf_f(INFINITYf), r);
+  r = vreinterpret_vf_vm(vor_vm_vo32_vm(vgt_vo_vf_vf(vcast_vf_f(-1), d), vreinterpret_vm_vf(r)));
+  r = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(-1)), vcast_vf_f(-INFINITYf), r);
+  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0f), r);
+
+  return r;
+}
+
+EXPORT CONST vfloat xlog1pf(vfloat a) {
+  vfloat log1_small = xlog1pf_fast(a);
+
+  vfloat cutoff = vcast_vf_f( (float)1.0e23 );
+  if (vall_lte32_i_vf_vf(a, cutoff))
+    return log1_small;
+
+  vopmask gt_cutoff = vgt_vo_vf_vf(a, cutoff);
+  vfloat log1_big = xlogf(a);
+  return vsel_vf_vo_vf_vf(gt_cutoff, log1_big, log1_small);
+}
+
+//
+
+EXPORT CONST vfloat xfabsf(vfloat x) { return vabs_vf_vf(x); }
+
+EXPORT CONST vfloat xcopysignf(vfloat x, vfloat y) { return vcopysign_vf_vf_vf(x, y); }
+
+EXPORT CONST vfloat xfmaxf(vfloat x, vfloat y) {
+#if SLEEF_SINGLE_MINMAXNUM_AVAILABLE
+  return vmaxnum_vf_vf_vf(x, y);
+#else
+  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(x, y), x, y));
+#endif
+}
+
+EXPORT CONST vfloat xfminf(vfloat x, vfloat y) {
+#if SLEEF_SINGLE_MINMAXNUM_AVAILABLE
+  return vminnum_vf_vf_vf(x, y);
+#else
+  return vsel_vf_vo_vf_vf(visnan_vo_vf(y), x, vsel_vf_vo_vf_vf(vgt_vo_vf_vf(y, x), x, y));
+#endif
+}
+
+EXPORT CONST vfloat xfdimf(vfloat x, vfloat y) {
+  vfloat ret = vsub_vf_vf_vf(x, y);
+  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(ret, vcast_vf_f(0)), veq_vo_vf_vf(x, y)), vcast_vf_f(0), ret);
+  return ret;
+}
+
+EXPORT CONST vfloat xtruncf(vfloat x) {
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1LL << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
+}
+
+EXPORT CONST vfloat xfloorf(vfloat x) {
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);
+  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1LL << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
+}
+
+EXPORT CONST vfloat xceilf(vfloat x) {
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  fr = vsel_vf_vo_vf_vf(vle_vo_vf_vf(fr, vcast_vf_f(0)), fr, vsub_vf_vf_vf(fr, vcast_vf_f(1.0f)));
+  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(x), vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1LL << 23))), x, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), x));
+}
+
+EXPORT CONST vfloat xroundf(vfloat d) {
+  vfloat x = vadd_vf_vf_vf(d, vcast_vf_f(0.5f));
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  x = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vle_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(fr, vcast_vf_f(0))), vsub_vf_vf_vf(x, vcast_vf_f(1.0f)), x);
+  fr = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);
+  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0.4999999701976776123f)), vcast_vf_f(0), x);
+  return vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(d), vge_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1LL << 23))), d, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), d));
+}
+
+EXPORT CONST vfloat xrintf(vfloat d) {
+  vfloat x = vadd_vf_vf_vf(d, vcast_vf_f(0.5f));
+  vopmask isodd = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vcast_vi2_i(1), vtruncate_vi2_vf(x)), vcast_vi2_i(1));
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  fr = vsel_vf_vo_vf_vf(vor_vo_vo_vo(vlt_vo_vf_vf(fr, vcast_vf_f(0)), vand_vo_vo_vo(veq_vo_vf_vf(fr, vcast_vf_f(0)), isodd)), vadd_vf_vf_vf(fr, vcast_vf_f(1.0f)), fr);
+  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0.50000005960464477539f)), vcast_vf_f(0), x);
+  vfloat ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visinf_vo_vf(d), vge_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(1LL << 23))), d, vcopysign_vf_vf_vf(vsub_vf_vf_vf(x, fr), d));
+  return ret;
+}
+
+EXPORT CONST vfloat xfmaf(vfloat x, vfloat y, vfloat z) {
+#ifdef ENABLE_FMA_SP
+  return vmla_vf_vf_vf_vf(x, y, z);
+#else
+  vfloat h2 = vadd_vf_vf_vf(vmul_vf_vf_vf(x, y), z), q = vcast_vf_f(1);
+  vopmask o = vlt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e-38f));
+  {
+    const float c0 = 1ULL << 25, c1 = c0 * c0, c2 = c1 * c1;
+    x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(c1)), x);
+    y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(c1)), y);
+    z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(c2)), z);
+    q = vsel_vf_vo_vf_vf(o, vcast_vf_f(1.0f / c2), q);
+  }
+  o = vgt_vo_vf_vf(vabs_vf_vf(h2), vcast_vf_f(1e+38f));
+  {
+    const float c0 = 1ULL << 25, c1 = c0 * c0, c2 = c1 * c1;
+    x = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(x, vcast_vf_f(1.0f / c1)), x);
+    y = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(y, vcast_vf_f(1.0f / c1)), y);
+    z = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(z, vcast_vf_f(1.0f / c2)), z);
+    q = vsel_vf_vo_vf_vf(o, vcast_vf_f(c2), q);
+  }
+  vfloat2 d = dfmul_vf2_vf_vf(x, y);
+  d = dfadd2_vf2_vf2_vf(d, z);
+  vfloat ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), z, vadd_vf_vf_vf(d.x, d.y));
+  o = visinf_vo_vf(z);
+  o = vandnot_vo_vo_vo(visinf_vo_vf(x), o);
+  o = vandnot_vo_vo_vo(visnan_vo_vf(x), o);
+  o = vandnot_vo_vo_vo(visinf_vo_vf(y), o);
+  o = vandnot_vo_vo_vo(visnan_vo_vf(y), o);
+  h2 = vsel_vf_vo_vf_vf(o, z, h2);
+
+  o = vor_vo_vo_vo(visinf_vo_vf(h2), visnan_vo_vf(h2));
+
+  return vsel_vf_vo_vf_vf(o, h2, vmul_vf_vf_vf(ret, q));
+#endif
+}
+
+static INLINE CONST vint2 vcast_vi2_i_i(int i0, int i1) { return vcast_vi2_vm(vcast_vm_i_i(i0, i1)); }
+
+EXPORT CONST vfloat xsqrtf_u05(vfloat d) {
+#if 1
+  return vsqrt_vf_vf(d);
+#else
+  vfloat q;
+  vopmask o;
+
+  d = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(d, vcast_vf_f(0)), vcast_vf_f(NANf), d);
+
+  o = vlt_vo_vf_vf(d, vcast_vf_f(5.2939559203393770e-23f));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1.8889465931478580e+22f)), d);
+  q = vsel_vf_vo_vf_vf(o, vcast_vf_f(7.2759576141834260e-12f*0.5f), vcast_vf_f(0.5f));
+
+  o = vgt_vo_vf_vf(d, vcast_vf_f(1.8446744073709552e+19f));
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(5.4210108624275220e-20f)), d);
+  q = vsel_vf_vo_vf_vf(o, vcast_vf_f(4294967296.0f * 0.5f), q);
+
+  vfloat x = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vcast_vi2_i(0x5f375a86), vsrl_vi2_vi2_i(vreinterpret_vi2_vf(vadd_vf_vf_vf(d, vcast_vf_f(1e-45f))), 1)));
+
+  x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x)));
+  x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x)));
+  x = vmul_vf_vf_vf(x, vsub_vf_vf_vf(vcast_vf_f(1.5f), vmul_vf_vf_vf(vmul_vf_vf_vf(vmul_vf_vf_vf(vcast_vf_f(0.5f), d), x), x)));
+  x = vmul_vf_vf_vf(x, d);
+
+  vfloat2 d2 = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf_vf2(d, dfmul_vf2_vf_vf(x, x)), dfrec_vf2_vf(x));
+
+  x = vmul_vf_vf_vf(vadd_vf_vf_vf(d2.x, d2.y), q);
+
+  x = vsel_vf_vo_vf_vf(vispinf_vo_vf(d), vcast_vf_f(INFINITYf), x);
+  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(d, vcast_vf_f(0)), d, x);
+
+  return x;
+#endif
+}
+
+EXPORT CONST vfloat xhypotf_u05(vfloat x, vfloat y) {
+  x = vabs_vf_vf(x);
+  y = vabs_vf_vf(y);
+  vfloat min = vmin_vf_vf_vf(x, y), n = min;
+  vfloat max = vmax_vf_vf_vf(x, y), d = max;
+
+  vopmask o = vlt_vo_vf_vf(max, vcast_vf_f(FLT_MIN));
+  n = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(n, vcast_vf_f(1ULL << 24)), n);
+  d = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(d, vcast_vf_f(1ULL << 24)), d);
+
+  vfloat2 t = dfdiv_vf2_vf2_vf2(vcast_vf2_vf_vf(n, vcast_vf_f(0)), vcast_vf2_vf_vf(d, vcast_vf_f(0)));
+  t = dfmul_vf2_vf2_vf(dfsqrt_vf2_vf2(dfadd2_vf2_vf2_vf(dfsqu_vf2_vf2(t), vcast_vf_f(1))), max);
+  vfloat ret = vadd_vf_vf_vf(t.x, t.y);
+  ret = vsel_vf_vo_vf_vf(visnan_vo_vf(ret), vcast_vf_f(INFINITYf), ret);
+  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);
+  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(NANf), ret);
+  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(INFINITYf))), vcast_vf_f(INFINITYf), ret);
+
+  return ret;
+}
+
+EXPORT CONST vfloat xhypotf_u35(vfloat x, vfloat y) {
+  x = vabs_vf_vf(x);
+  y = vabs_vf_vf(y);
+  vfloat min = vmin_vf_vf_vf(x, y), n = min;
+  vfloat max = vmax_vf_vf_vf(x, y), d = max;
+
+  vfloat t = vdiv_vf_vf_vf(min, max);
+  vfloat ret = vmul_vf_vf_vf(max, vsqrt_vf_vf(vmla_vf_vf_vf_vf(t, t, vcast_vf_f(1))));
+  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(min, vcast_vf_f(0)), max, ret);
+  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(NANf), ret);
+  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(INFINITYf)), veq_vo_vf_vf(y, vcast_vf_f(INFINITYf))), vcast_vf_f(INFINITYf), ret);
+
+  return ret;
+}
+
+EXPORT CONST vfloat xnextafterf(vfloat x, vfloat y) {
+  x = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vmulsign_vf_vf_vf(vcast_vf_f(0), y), x);
+  vint2 t, xi2 = vreinterpret_vi2_vf(x);
+  vopmask c = vxor_vo_vo_vo(vsignbit_vo_vf(x), vge_vo_vf_vf(y, x));
+
+  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);
+
+  xi2 = vsel_vi2_vo_vi2_vi2(vneq_vo_vf_vf(x, y), vsub_vi2_vi2_vi2(xi2, vcast_vi2_i(1)), xi2);
+
+  xi2 = vsel_vi2_vo_vi2_vi2(c, vsub_vi2_vi2_vi2(vcast_vi2_i(0), vxor_vi2_vi2_vi2(xi2, vcast_vi2_i(1 << 31))), xi2);
+
+  vfloat ret = vreinterpret_vf_vi2(xi2);
+
+  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(ret, vcast_vf_f(0)), vneq_vo_vf_vf(x, vcast_vf_f(0))),
+       vmulsign_vf_vf_vf(vcast_vf_f(0), x), ret);
+
+  ret = vsel_vf_vo_vf_vf(vand_vo_vo_vo(veq_vo_vf_vf(x, vcast_vf_f(0)), veq_vo_vf_vf(y, vcast_vf_f(0))), y, ret);
+
+  ret = vsel_vf_vo_vf_vf(vor_vo_vo_vo(visnan_vo_vf(x), visnan_vo_vf(y)), vcast_vf_f(NANf), ret);
+
+  return ret;
+}
+
+EXPORT CONST vfloat xfrfrexpf(vfloat x) {
+  vfloat j = x;
+  x = vsel_vf_vo_vf_vf(
+        vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(FLT_MIN)),
+        vmul_vf_vf_vf(x, vcast_vf_f((float)(1U << 30))),
+        x);
+
+
+  vmask xm = vreinterpret_vm_vf(x);
+  xm = vand_vm_vm_vm(xm, vcast_vm_i_i(~0x7f800000U, ~0x7f800000U));
+  xm = vor_vm_vm_vm (xm, vcast_vm_i_i( 0x3f000000U,  0x3f000000U));
+
+  vfloat ret = vreinterpret_vf_vm(xm);
+
+  ret = vsel_vf_vo_vf_vf(visinf_vo_vf(x),
+                         vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), x),
+                         ret);
+  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0.0f)), x, ret);
+
+  ret = vsel_vf_vo_vf_vf(visnan_vo_vf(j), j, ret);
+
+  return ret;
+}
+
+EXPORT CONST vmask xexpfrexpf(vfloat x) {
+  vopmask isnan = vor_vo_vo_vo(visinf_vo_vf(x), visnan_vo_vf(x));
+  vfloat mul = vmul_vf_vf_vf(x, vcast_vf_f(0x1p+30));   //(float)(1U << 30)
+  vopmask is_denorm = vlt_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(0x1p-126)); //FLT_MIN
+
+  x = vsel_vf_vo_vf_vf(is_denorm, mul, x);
+  const vint2 zeros = vcast_vi2_i(0);
+  vint2 correct = vsel_vi2_vo_vi2_vi2(is_denorm, vcast_vi2_i(-30), zeros);
+
+  vint2 ret = vreinterpret_vi2_vf(x);
+
+  ret = vsrl_vi2_vi2_i(ret, 23);
+  ret = vand_vi2_vi2_vi2(ret, vcast_vi2_i(0xff));
+  ret = vsub_vi2_vi2_vi2(ret, vcast_vi2_i(0x7e));
+  ret = vadd_vi2_vi2_vi2(ret, correct);
+
+  ret = vsel_vi2_vo_vi2_vi2(
+            veq_vo_vf_vf(x, vreinterpret_vf_vi2(zeros)),
+            zeros,
+            ret);
+
+  ret = vsel_vi2_vo_vi2_vi2(isnan, zeros, ret);
+
+  return vcast_vm_vi2(ret);;
+}
+
+static INLINE CONST vfloat vtoward0f(vfloat x) {
+  vfloat t = vreinterpret_vf_vi2(vsub_vi2_vi2_vi2(vreinterpret_vi2_vf(x), vcast_vi2_i(1)));
+  return vsel_vf_vo_vf_vf(veq_vo_vf_vf(x, vcast_vf_f(0)), vcast_vf_f(0), t);
+}
+
+static INLINE CONST vfloat vptruncf(vfloat x) {
+#ifdef FULL_FP_ROUNDING
+  return vtruncate_vf_vf(x);
+#else
+  vfloat fr = vsub_vf_vf_vf(x, vcast_vf_vi2(vtruncate_vi2_vf(x)));
+  return vsel_vf_vo_vf_vf(vge_vo_vf_vf(vabs_vf_vf(x), vcast_vf_f(1LL << 23)), x, vsub_vf_vf_vf(x, fr));
+#endif
+}
+
+EXPORT CONST vfloat xfmodf(vfloat x, vfloat y) {
+  vfloat nu = vabs_vf_vf(x), de = vabs_vf_vf(y), s = vcast_vf_f(1), q;
+  vopmask o = vlt_vo_vf_vf(de, vcast_vf_f(FLT_MIN));
+  nu = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(nu, vcast_vf_f(1ULL << 25)), nu);
+  de = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(de, vcast_vf_f(1ULL << 25)), de);
+  s  = vsel_vf_vo_vf_vf(o, vmul_vf_vf_vf(s , vcast_vf_f(1.0f / (1ULL << 25))), s);
+  vfloat rde = vtoward0f(vrec_vf_vf(de));
+#ifdef ENABLE_NEON32
+  rde = vtoward0f(rde);
+#endif
+  vfloat2 r = vcast_vf2_vf_vf(nu, vcast_vf_f(0));
+
+  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
+    q = vsel_vf_vo_vf_vf(vand_vo_vo_vo(vgt_vo_vf_vf(vadd_vf_vf_vf(de, de), r.x),
+                                       vge_vo_vf_vf(r.x, de)),
+                         vcast_vf_f(1), vmul_vf_vf_vf(vtoward0f(r.x), rde));
+    r = dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf2(r, dfmul_vf2_vf_vf(vptruncf(q), vneg_vf_vf(de))));
+    if (vtestallones_i_vo32(vlt_vo_vf_vf(r.x, de))) break;
+  }
+
+  vfloat ret = vmul_vf_vf_vf(vadd_vf_vf_vf(r.x, r.y), s);
+  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(vadd_vf_vf_vf(r.x, r.y), de), vcast_vf_f(0), ret);
+
+  ret = vmulsign_vf_vf_vf(ret, x);
+
+  ret = vsel_vf_vo_vf_vf(vlt_vo_vf_vf(nu, de), x, ret);
+  ret = vsel_vf_vo_vf_vf(veq_vo_vf_vf(de, vcast_vf_f(0)), vcast_vf_f(NANf), ret);
+
+  return ret;
+}
+
+//
+
+static INLINE CONST vfloat2 sinpifk(vfloat d) {
+  vopmask o;
+  vfloat u, s, t;
+  vfloat2 x, s2;
+
+  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));
+  vint2 q = vtruncate_vi2_vf(u);
+  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(2));
+
+  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
+  t = s;
+  s = vmul_vf_vf_vf(s, s);
+  s2 = dfmul_vf2_vf_vf(t, t);
+
+  //
+
+  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);
+  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));
+  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));
+  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s),
+      vsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10,
+              -0.080745510756969451904, -1.3373665339076936258e-09));
+  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x),
+       vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09,
+               0.78539818525314331055, -2.1857338617566484855e-08));
+
+  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));
+  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(4)), vcast_vi2_i(4));
+  x.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(x.x)));
+  x.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(x.y)));
+
+  return x;
+}
+
+EXPORT CONST vfloat xsinpif_u05(vfloat d) {
+  vfloat2 x = sinpifk(d);
+  vfloat r = vadd_vf_vf_vf(x.x, x.y);
+
+  r = vsel_vf_vo_vf_vf(visnegzero_vo_vf(d), vcast_vf_f(-0.0), r);
+  r = vreinterpret_vf_vm(vandnot_vm_vo32_vm(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vreinterpret_vm_vf(r)));
+  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));
+
+  return r;
+}
+
+static INLINE CONST vfloat2 cospifk(vfloat d) {
+  vopmask o;
+  vfloat u, s, t;
+  vfloat2 x, s2;
+
+  u = vmul_vf_vf_vf(d, vcast_vf_f(4.0));
+  vint2 q = vtruncate_vi2_vf(u);
+  q = vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vxor_vi2_vi2_vi2(vsrl_vi2_vi2_i(q, 31), vcast_vi2_i(1))), vcast_vi2_i(~1));
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(0));
+
+  s = vsub_vf_vf_vf(u, vcast_vf_vi2(q));
+  t = s;
+  s = vmul_vf_vf_vf(s, s);
+  s2 = dfmul_vf2_vf_vf(t, t);
+
+  //
+
+  u = vsel_vf_vo_f_f(o, -0.2430611801e-7f, +0.3093842054e-6f);
+  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, +0.3590577080e-5f, -0.3657307388e-4f));
+  u = vmla_vf_vf_vf_vf(u, s, vsel_vf_vo_f_f(o, -0.3259917721e-3f, +0.2490393585e-2f));
+  x = dfadd2_vf2_vf_vf2(vmul_vf_vf_vf(u, s),
+      vsel_vf2_vo_f_f_f_f(o, 0.015854343771934509277, 4.4940051354032242811e-10,
+              -0.080745510756969451904, -1.3373665339076936258e-09));
+  x = dfadd2_vf2_vf2_vf2(dfmul_vf2_vf2_vf2(s2, x),
+       vsel_vf2_vo_f_f_f_f(o, -0.30842512845993041992, -9.0728339030733922277e-09,
+               0.78539818525314331055, -2.1857338617566484855e-08));
+
+  x = dfmul_vf2_vf2_vf2(x, vsel_vf2_vo_vf2_vf2(o, s2, vcast_vf2_vf_vf(t, vcast_vf_f(0))));
+  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x);
+
+  o = veq_vo_vi2_vi2(vand_vi2_vi2_vi2(vadd_vi2_vi2_vi2(q, vcast_vi2_i(2)), vcast_vi2_i(4)), vcast_vi2_i(4));
+  x.x = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(x.x)));
+  x.y = vreinterpret_vf_vm(vxor_vm_vm_vm(vand_vm_vo32_vm(o, vreinterpret_vm_vf(vcast_vf_f(-0.0))), vreinterpret_vm_vf(x.y)));
+
+  return x;
+}
+
+EXPORT CONST vfloat xcospif_u05(vfloat d) {
+  vfloat2 x = cospifk(d);
+  vfloat r = vadd_vf_vf_vf(x.x, x.y);
+
+  r = vsel_vf_vo_vf_vf(vgt_vo_vf_vf(vabs_vf_vf(d), vcast_vf_f(TRIGRANGEMAX4f)), vcast_vf_f(1), r);
+  r = vreinterpret_vf_vm(vor_vm_vo32_vm(visinf_vo_vf(d), vreinterpret_vm_vf(r)));
+
+  return r;
+}
+
+typedef struct {
+  vfloat2 a, b;
+} df2;
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+static CONST df2 gammafk(vfloat a) {
+  vfloat2 clc = vcast_vf2_f_f(0, 0), clln = vcast_vf2_f_f(1, 0), clld = vcast_vf2_f_f(1, 0);
+  vfloat2 v = vcast_vf2_f_f(1, 0), x, y, z;
+  vfloat t, u;
+
+  vopmask otiny = vlt_vo_vf_vf(vabs_vf_vf(a), vcast_vf_f(1e-30f)), oref = vlt_vo_vf_vf(a, vcast_vf_f(0.5));
+
+  x = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(0, 0),
+        vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf_vf(vcast_vf_f(1), vneg_vf_vf(a)),
+                vcast_vf2_vf_vf(a, vcast_vf_f(0))));
+
+  vopmask o0 = vand_vo_vo_vo(vle_vo_vf_vf(vcast_vf_f(0.5), x.x), vle_vo_vf_vf(x.x, vcast_vf_f(1.2)));
+  vopmask o2 = vle_vo_vf_vf(vcast_vf_f(2.3), x.x);
+
+  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(1)), x));
+  y = dfnormalize_vf2_vf2(dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(2)), y));
+
+  vopmask o = vand_vo_vo_vo(o2, vle_vo_vf_vf(x.x, vcast_vf_f(7)));
+  clln = vsel_vf2_vo_vf2_vf2(o, y, clln);
+
+  x = vsel_vf2_vo_vf2_vf2(o, dfadd2_vf2_vf2_vf(x, vcast_vf_f(3)), x);
+  t = vsel_vf_vo_vf_vf(o2, vrec_vf_vf(x.x), dfnormalize_vf2_vf2(dfadd2_vf2_vf2_vf(x, vsel_vf_vo_f_f(o0, -1, -2))).x);
+
+  u = vsel_vf_vo_vo_f_f_f(o2, o0, +0.000839498720672087279971000786, +0.9435157776e+0f, +0.1102489550e-3f);
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -5.17179090826059219329394422e-05, +0.8670063615e+0f, +0.8160019934e-4f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000592166437353693882857342347, +0.4826702476e+0f, +0.1528468856e-3f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +6.97281375836585777403743539e-05, -0.8855129778e-1f, -0.2355068718e-3f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.000784039221720066627493314301, +0.1013825238e+0f, +0.4962242092e-3f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.000229472093621399176949318732, -0.1493408978e+0f, -0.1193488017e-2f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, -0.002681327160493827160473958490, +0.1697509140e+0f, +0.2891599433e-2f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.003472222222222222222175164840, -0.2072454542e+0f, -0.7385451812e-2f));
+  u = vmla_vf_vf_vf_vf(u, t, vsel_vf_vo_vo_f_f_f(o2, o0, +0.083333333333333333335592087900, +0.2705872357e+0f, +0.2058077045e-1f));
+
+  y = dfmul_vf2_vf2_vf2(dfadd2_vf2_vf2_vf(x, vcast_vf_f(-0.5)), logk2f(x));
+  y = dfadd2_vf2_vf2_vf2(y, dfneg_vf2_vf2(x));
+  y = dfadd2_vf2_vf2_vf2(y, vcast_vf2_d(0.91893853320467278056)); // 0.5*log(2*M_PI)
+
+  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf (u, t), vsel_vf_vo_f_f(o0, -0.400686534596170958447352690395e+0f, -0.673523028297382446749257758235e-1f));
+  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, +0.822466960142643054450325495997e+0f, +0.322467033928981157743538726901e+0f));
+  z = dfadd2_vf2_vf2_vf(dfmul_vf2_vf2_vf(z, t), vsel_vf_vo_f_f(o0, -0.577215665946766039837398973297e+0f, +0.422784335087484338986941629852e+0f));
+  z = dfmul_vf2_vf2_vf(z, t);
+
+  clc = vsel_vf2_vo_vf2_vf2(o2, y, z);
+
+  clld = vsel_vf2_vo_vf2_vf2(o2, dfadd2_vf2_vf2_vf(dfmul_vf2_vf_vf(u, t), vcast_vf_f(1)), clld);
+
+  y = clln;
+
+  clc = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_d(41.58883083359671856503), // log(2^60)
+          vsel_vf2_vo_vf2_vf2(oref, dfadd2_vf2_vf2_vf2(vcast_vf2_d(1.1447298858494001639), dfneg_vf2_vf2(clc)), clc)); // log(M_PI)
+  clln = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_f_f(1, 0), vsel_vf2_vo_vf2_vf2(oref, clln, clld));
+
+  if (!vtestallones_i_vo32(vnot_vo32_vo32(oref))) {
+    t = vsub_vf_vf_vf(a, vmul_vf_vf_vf(vcast_vf_f(1LL << 12), vcast_vf_vi2(vtruncate_vi2_vf(vmul_vf_vf_vf(a, vcast_vf_f(1.0 / (1LL << 12)))))));
+    x = dfmul_vf2_vf2_vf2(clld, sinpifk(t));
+  }
+
+  clld = vsel_vf2_vo_vf2_vf2(otiny, vcast_vf2_vf_vf(vmul_vf_vf_vf(a, vcast_vf_f((1LL << 30)*(float)(1LL << 30))), vcast_vf_f(0)),
+           vsel_vf2_vo_vf2_vf2(oref, x, y));
+
+  df2 ret = { clc, dfdiv_vf2_vf2_vf2(clln, clld) };
+
+  return ret;
+}
+
+EXPORT CONST vfloat xtgammaf_u1(vfloat a) {
+  df2 d = gammafk(a);
+  vfloat2 y = dfmul_vf2_vf2_vf2(expk2f(d.a), d.b);
+  vfloat r = vadd_vf_vf_vf(y.x, y.y);
+  vopmask o;
+
+  o = vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(-INFINITYf)),
+        vand_vo_vo_vo(vlt_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a))),
+       vand_vo_vo_vo(vand_vo_vo_vo(visnumber_vo_vf(a), vlt_vo_vf_vf(a, vcast_vf_f(0))), visnan_vo_vf(r)));
+  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(NANf), r);
+
+  o = vand_vo_vo_vo(vand_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(INFINITYf)), visnumber_vo_vf(a)),
+          vge_vo_vf_vf(a, vcast_vf_f(-FLT_MIN))),
+        vor_vo_vo_vo(vor_vo_vo_vo(veq_vo_vf_vf(a, vcast_vf_f(0)), vgt_vo_vf_vf(a, vcast_vf_f(36))), visnan_vo_vf(r)));
+  r = vsel_vf_vo_vf_vf(o, vmulsign_vf_vf_vf(vcast_vf_f(INFINITYf), a), r);
+
+  return r;
+}
+
+EXPORT CONST vfloat xlgammaf_u1(vfloat a) {
+  df2 d = gammafk(a);
+  vfloat2 y = dfadd2_vf2_vf2_vf2(d.a, logk2f(dfabs_vf2_vf2(d.b)));
+  vfloat r = vadd_vf_vf_vf(y.x, y.y);
+  vopmask o;
+
+  o = vor_vo_vo_vo(visinf_vo_vf(a),
+       vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a)),
+        vand_vo_vo_vo(visnumber_vo_vf(a), visnan_vo_vf(r))));
+  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(INFINITYf), r);
+
+  return r;
+}
+
+EXPORT CONST vfloat2 xlgamma_rf_u1(vfloat a) {
+  df2 d = gammafk(a);
+  vfloat2 y = dfadd2_vf2_vf2_vf2(d.a, logk2f(dfabs_vf2_vf2(d.b)));
+  vfloat r = vadd_vf_vf_vf(y.x, y.y);
+  vopmask o;
+
+  o = vor_vo_vo_vo(visinf_vo_vf(a),
+       vor_vo_vo_vo(vand_vo_vo_vo(vle_vo_vf_vf(a, vcast_vf_f(0)), visint_vo_vf(a)),
+        vand_vo_vo_vo(visnumber_vo_vf(a), visnan_vo_vf(r))));
+  r = vsel_vf_vo_vf_vf(o, vcast_vf_f(INFINITYf), r);
+
+  vfloat2 ret;
+  ret.x = r;
+  ret.y = vreinterpret_vf_vm(vor_vm_vm_vm(
+                               vand_vm_vm_vm(vreinterpret_vm_vf(d.b.x),
+                                 vreinterpret_vm_vf(vcast_vf_f(-0.0f))),
+                               vreinterpret_vm_vf(vcast_vf_f(1.0f)))
+                            );
+
+  return ret;
+}
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+EXPORT CONST vfloat xerff_u1(vfloat a) {
+  vfloat s = a, t, u;
+  vfloat2 d;
+
+  a = vabs_vf_vf(a);
+  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.1));
+  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.4));
+  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.0));
+  u = vsel_vf_vo_vf_vf(o0, vmul_vf_vf_vf(a, a), a);
+
+  t = vsel_vf_vo_vo_f_f_f(o0, o1, +0.7089292194e-4f, -0.1792667899e-4f, -0.9495757695e-5f);
+  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.7768311189e-3f, +0.3937633010e-3f, +0.2481465926e-3f));
+  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.5159463733e-2f, -0.3949181177e-2f, -0.2918176819e-2f));
+  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, -0.2683781274e-1f, +0.2445474640e-1f, +0.2059706673e-1f));
+  t = vmla_vf_vf_vf_vf(t, u, vsel_vf_vo_vo_f_f_f(o0, o1, +0.1128318012e+0f, -0.1070996150e+0f, -0.9901899844e-1f));
+  d = dfmul_vf2_vf_vf(t, u);
+  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, -0.376125876000657465175213237214e+0, -0.634588905908410389971210809210e+0, -0.643598050547891613081201721633e+0));
+  d = dfmul_vf2_vf2_vf(d, u);
+  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_d_d_d(o0, o1, +0.112837916021059138255978217023e+1, -0.112879855826694507209862753992e+1, -0.112461487742845562801052956293e+1));
+  d = dfmul_vf2_vf2_vf(d, a);
+  d = vsel_vf2_vo_vf2_vf2(o0, d, dfadd_vf2_vf_vf2(vcast_vf_f(1.0), dfneg_vf2_vf2(expk2f(d))));
+  u = vmulsign_vf_vf_vf(vsel_vf_vo_vf_vf(o2, vadd_vf_vf_vf(d.x, d.y), vcast_vf_f(1)), s);
+  u = vsel_vf_vo_vf_vf(visnan_vo_vf(a), vcast_vf_f(NANf), u);
+
+  return u;
+}
+
+/* TODO AArch64: potential optimization by using `vfmad_lane_f64` */
+EXPORT CONST vfloat xerfcf_u15(vfloat a) {
+  vfloat s = a, r = vcast_vf_f(0), t;
+  vfloat2 u, d, x;
+  a = vabs_vf_vf(a);
+  vopmask o0 = vlt_vo_vf_vf(a, vcast_vf_f(1.0));
+  vopmask o1 = vlt_vo_vf_vf(a, vcast_vf_f(2.2));
+  vopmask o2 = vlt_vo_vf_vf(a, vcast_vf_f(4.3));
+  vopmask o3 = vlt_vo_vf_vf(a, vcast_vf_f(10.1));
+
+  u = vsel_vf2_vo_vf2_vf2(o1, vcast_vf2_vf_vf(a, vcast_vf_f(0)), dfdiv_vf2_vf2_vf2(vcast_vf2_f_f(1, 0), vcast_vf2_vf_vf(a, vcast_vf_f(0))));
+
+  t = vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.8638041618e-4f, -0.6236977242e-5f, -0.3869504035e+0f, +0.1115344167e+1f);
+  t = vmla_vf_vf_vf_vf(t, u.x, vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.6000166177e-3f, +0.5749821503e-4f, +0.1288077235e+1f, -0.9454904199e+0f));
+  t = vmla_vf_vf_vf_vf(t, u.x, vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, -0.1665703603e-2f, +0.6002851478e-5f, -0.1816803217e+1f, -0.3667259514e+0f));
+  t = vmla_vf_vf_vf_vf(t, u.x, vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1795156277e-3f, -0.2851036377e-2f, +0.1249150872e+1f, +0.7155663371e+0f));
+  t = vmla_vf_vf_vf_vf(t, u.x, vsel_vf_vo_vo_vo_f_f_f_f(o0, o1, o2, +0.1914106123e-1f, +0.2260518074e-1f, -0.1328857988e+0f, -0.1262947265e-1f));
+
+  d = dfmul_vf2_vf2_vf(u, t);
+  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.102775359343930288081655368891e+0, -0.105247583459338632253369014063e+0, -0.482365310333045318680618892669e+0, -0.498961546254537647970305302739e+0));
+  d = dfmul_vf2_vf2_vf2(d, u);
+  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.636619483208481931303752546439e+0, -0.635609463574589034216723775292e+0, -0.134450203224533979217859332703e-2, -0.471199543422848492080722832666e-4));
+  d = dfmul_vf2_vf2_vf2(d, u);
+  d = dfadd2_vf2_vf2_vf2(d, vsel_vf2_vo_vo_vo_d_d_d_d(o0, o1, o2, -0.112837917790537404939545770596e+1, -0.112855987376668622084547028949e+1, -0.572319781150472949561786101080e+0, -0.572364030327966044425932623525e+0));
+
+  x = dfmul_vf2_vf2_vf(vsel_vf2_vo_vf2_vf2(o1, d, vcast_vf2_vf_vf(vneg_vf_vf(a), vcast_vf_f(0))), a);
+  x = vsel_vf2_vo_vf2_vf2(o1, x, dfadd2_vf2_vf2_vf2(x, d));
+
+  x = expk2f(x);
+  x = vsel_vf2_vo_vf2_vf2(o1, x, dfmul_vf2_vf2_vf2(x, u));
+
+  r = vsel_vf_vo_vf_vf(o3, vadd_vf_vf_vf(x.x, x.y), vcast_vf_f(0));
+  r = vsel_vf_vo_vf_vf(vsignbit_vo_vf(s), vsub_vf_vf_vf(vcast_vf_f(2), r), r);
+  r = vsel_vf_vo_vf_vf(visnan_vo_vf(s), vcast_vf_f(NANf), r);
+  return r;
+}
+
+#ifdef ENABLE_MAIN
+// gcc -DENABLE_MAIN -Wno-attributes -I../common -I../arch -DENABLE_AVX2 -mavx2 -mfma sleefsimdsp.c ../common/common.c -lm
+#include <stdio.h>
+#include <stdlib.h>
+int main(int argc, char **argv) {
+  vfloat vf1 = vcast_vf_f(atof(argv[1]));
+  //vfloat vf2 = vcast_vf_f(atof(argv[2]));
+
+  //vfloat r = xpowf(vf1, vf2);
+  //vfloat r = xsqrtf_u05(vf1);
+  //printf("%g\n", xnextafterf(vf1, vf2)[0]);
+  //printf("%g\n", nextafterf(atof(argv[1]), atof(argv[2])));
+  printf("t = %.20g\n", xlogf_u1(vf1)[0]);
+  printf("c = %.20g\n", logf(atof(argv[1])));
+
+}
+#endif
diff --git a/lib/kernel/sleef/libm/sleefsp.c b/lib/kernel/sleef/libm/sleefsp.c
new file mode 100644
index 0000000..032f715
--- /dev/null
+++ b/lib/kernel/sleef/libm/sleefsp.c
@@ -0,0 +1,2090 @@
+//          Copyright Naoki Shibata 2010 - 2017.
+// Distributed under the Boost Software License, Version 1.0.
+//    (See accompanying file LICENSE.txt or copy at
+//          http://www.boost.org/LICENSE_1_0.txt)
+
+// Always use -ffp-contract=off option to compile SLEEF.
+
+#include <stdint.h>
+#include <math.h>
+#include <limits.h>
+#include <float.h>
+
+#include "misc.h"
+
+// debug prints using fprintf
+#define NDEBUG
+
+#if (defined(_MSC_VER))
+#pragma fp_contract (off)
+#endif
+
+#include "helpers.h"
+
+static INLINE CONST int32_t floatToRawIntBits(float d) {
+  union {
+    float f;
+    int32_t i;
+  } tmp;
+  tmp.f = d;
+  return tmp.i;
+}
+
+static INLINE CONST float intBitsToFloat(int32_t i) {
+  union {
+    float f;
+    int32_t i;
+  } tmp;
+  tmp.i = i;
+  return tmp.f;
+}
+
+static INLINE CONST float fabsfk(float x) {
+  return intBitsToFloat(0x7fffffffL & floatToRawIntBits(x));
+}
+
+static INLINE CONST float mulsignf(float x, float y) {
+  return intBitsToFloat(floatToRawIntBits(x) ^ (floatToRawIntBits(y) & (1 << 31)));
+}
+
+static INLINE CONST double copysignfk(double x, double y) {
+  return intBitsToFloat((floatToRawIntBits(x) & ~(1 << 31)) ^ (floatToRawIntBits(y) & (1 << 31)));
+}
+
+static INLINE CONST float signf(float d) { return mulsignf(1, d); }
+static INLINE CONST float mlaf(float x, float y, float z) { return x * y + z; }
+static INLINE CONST float rintfk(float x) { return x < 0 ? (int)(x - 0.5f) : (int)(x + 0.5f); }
+static INLINE CONST int ceilfk(float x) { return (int)x + (x < 0 ? 0 : 1); }
+static INLINE CONST float fminfk(float x, float y) { return x < y ? x : y; }
+static INLINE CONST float fmaxfk(float x, float y) { return x > y ? x : y; }
+static INLINE CONST int xisintf(float x) { return (x == (int)x); }
+
+static INLINE CONST int xisnanf(float x) { return x != x; }
+static INLINE CONST int xisinff(float x) { return x == INFINITYf || x == -INFINITYf; }
+static INLINE CONST int xisminff(float x) { return x == -INFINITYf; }
+static INLINE CONST int xispinff(float x) { return x == INFINITYf; }
+static INLINE CONST int xisnegzerof(float x) { return floatToRawIntBits(x) == floatToRawIntBits(-0.0); }
+static INLINE CONST int xisnumberf(double x) { return !xisinff(x) && !xisnanf(x); }
+
+static INLINE CONST int ilogbkf(float d) {
+  int m = d < 5.421010862427522E-20f;
+  d = m ? 1.8446744073709552E19f * d : d;
+  int q = (floatToRawIntBits(d) >> 23) & 0xff;
+  q = m ? q - (64 + 0x7f) : q - 0x7f;
+  return q;
+}
+
+// vilogb2kf is similar to ilogbkf, but the argument has to be a
+// normalized FP value.
+static INLINE CONST int ilogb2kf(float d) {
+  return ((floatToRawIntBits(d) >> 23) & 0xff) - 0x7f;
+}
+
+EXPORT CONST int xilogbf(float d) {
+  int e = ilogbkf(fabsfk(d));
+  e = d == 0.0f  ? FP_ILOGB0 : e;
+  e = xisnanf(d) ? FP_ILOGBNAN : e;
+  e = xisinff(d) ? INT_MAX : e;
+  return e;
+}
+
+static INLINE CONST float pow2if(int q) {
+  return intBitsToFloat(((int32_t)(q + 0x7f)) << 23);
+}
+
+static INLINE CONST float ldexpkf(float x, int q) {
+  float u;
+  int m;
+  m = q >> 31;
+  m = (((m + q) >> 6) - m) << 4;
+  q = q - (m << 2);
+  m += 127;
+  m = m <   0 ?   0 : m;
+  m = m > 255 ? 255 : m;
+  u = intBitsToFloat(((int32_t)m) << 23);
+  x = x * u * u * u * u;
+  u = intBitsToFloat(((int32_t)(q + 0x7f)) << 23);
+  return x * u;
+}
+
+static INLINE CONST float ldexp2kf(float d, int e) { // faster than ldexpkf, short reach
+  return d * pow2if(e >> 1) * pow2if(e - (e >> 1));
+}
+
+static INLINE CONST float ldexp3kf(float d, int e) { // very fast, no denormal
+  return intBitsToFloat(floatToRawIntBits(d) + (e << 23));
+}
+
+//
+
+#ifndef NDEBUG
+static int checkfp(float x) {
+  if (xisinff(x) || xisnanf(x)) return 1;
+  return 0;
+}
+#endif
+
+static INLINE CONST float upperf(float d) {
+  return intBitsToFloat(floatToRawIntBits(d) & 0xfffff000);
+}
+
+static INLINE CONST Sleef_float2 df(float h, float l) {
+  Sleef_float2 ret;
+  ret.x = h; ret.y = l;
+  return ret;
+}
+
+static INLINE CONST Sleef_float2 dfx(double d) {
+  Sleef_float2 ret;
+  ret.x = d; ret.y = d - ret.x;
+  return ret;
+}
+
+static INLINE CONST Sleef_float2 dfnormalize_f2_f2(Sleef_float2 t) {
+  Sleef_float2 s;
+
+  s.x = t.x + t.y;
+  s.y = t.x - s.x + t.y;
+
+  return s;
+}
+
+static INLINE CONST Sleef_float2 dfscale_f2_f2_f(Sleef_float2 d, float s) {
+  Sleef_float2 r;
+
+  r.x = d.x * s;
+  r.y = d.y * s;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfneg_f2_f2(Sleef_float2 d) {
+  Sleef_float2 r;
+
+  r.x = -d.x;
+  r.y = -d.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfabs_f2_f2(Sleef_float2 x) {
+  return df(x.x < 0 ? -x.x : x.x, x.x < 0 ? -x.y : x.y);
+}
+
+static INLINE CONST Sleef_float2 dfadd_f2_f_f(float x, float y) {
+  // |x| >= |y|
+
+  Sleef_float2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x) || checkfp(y) || fabsfk(x) >= fabsfk(y))) fprintf(stderr, "[dfadd_f2_f_f : %g, %g]", x, y);
+#endif
+
+  r.x = x + y;
+  r.y = x - r.x + y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd2_f2_f_f(float x, float y) {
+  Sleef_float2 r;
+
+  r.x = x + y;
+  float v = r.x - x;
+  r.y = (x - (r.x - v)) + (y - v);
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd_f2_f2_f(Sleef_float2 x, float y) {
+  // |x| >= |y|
+
+  Sleef_float2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x.x) || checkfp(y) || fabsfk(x.x) >= fabsfk(y))) fprintf(stderr, "[dfadd_f2_f2_f : %g %g]", x.x, y);
+#endif
+
+  r.x = x.x + y;
+  r.y = x.x - r.x + y + x.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd_f2_f_f2(float x, Sleef_float2 y) {
+  // |x| >= |y|
+
+  Sleef_float2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x) || checkfp(y.x) || fabsfk(x) >= fabsfk(y.x))) {
+    fprintf(stderr, "[dfadd_f2_f_f2 : %g %g]\n", x, y.x);
+    fflush(stderr);
+  }
+#endif
+
+  r.x = x + y.x;
+  r.y = x - r.x + y.x + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd2_f2_f2_f(Sleef_float2 x, float y) {
+  // |x| >= |y|
+
+  Sleef_float2 r;
+
+  r.x  = x.x + y;
+  float v = r.x - x.x;
+  r.y = (x.x - (r.x - v)) + (y - v);
+  r.y += x.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd2_f2_f_f2(float x, Sleef_float2 y) {
+  Sleef_float2 r;
+
+  r.x  = x + y.x;
+  float v = r.x - x;
+  r.y = (x - (r.x - v)) + (y.x - v) + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {
+  // |x| >= |y|
+
+  Sleef_float2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, "[dfadd_f2_f2_f2 : %g %g]", x.x, y.x);
+#endif
+
+  r.x = x.x + y.x;
+  r.y = x.x - r.x + y.x + x.y + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfadd2_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {
+  Sleef_float2 r;
+
+  r.x  = x.x + y.x;
+  float v = r.x - x.x;
+  r.y = (x.x - (r.x - v)) + (y.x - v);
+  r.y += x.y + y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfsub_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {
+  // |x| >= |y|
+
+  Sleef_float2 r;
+
+#ifndef NDEBUG
+  if (!(checkfp(x.x) || checkfp(y.x) || fabsfk(x.x) >= fabsfk(y.x))) fprintf(stderr, "[dfsub_f2_f2_f2 : %g %g]", x.x, y.x);
+#endif
+
+  r.x = x.x - y.x;
+  r.y = x.x - r.x - y.x + x.y - y.y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfdiv_f2_f2_f2(Sleef_float2 n, Sleef_float2 d) {
+  float t = 1.0f / d.x;
+  float dh  = upperf(d.x), dl  = d.x - dh;
+  float th  = upperf(t  ), tl  = t   - th;
+  float nhh = upperf(n.x), nhl = n.x - nhh;
+
+  Sleef_float2 q;
+
+  q.x = n.x * t;
+
+  float u = -q.x + nhh * th + nhh * tl + nhl * th + nhl * tl +
+    q.x * (1 - dh * th - dh * tl - dl * th - dl * tl);
+
+  q.y = t * (n.y - q.x * d.y) + u;
+
+  return q;
+}
+
+static INLINE CONST Sleef_float2 dfmul_f2_f_f(float x, float y) {
+  float xh = upperf(x), xl = x - xh;
+  float yh = upperf(y), yl = y - yh;
+  Sleef_float2 r;
+
+  r.x = x * y;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfmul_f2_f2_f(Sleef_float2 x, float y) {
+  float xh = upperf(x.x), xl = x.x - xh;
+  float yh = upperf(y  ), yl = y   - yh;
+  Sleef_float2 r;
+
+  r.x = x.x * y;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.y * y;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 dfmul_f2_f2_f2(Sleef_float2 x, Sleef_float2 y) {
+  float xh = upperf(x.x), xl = x.x - xh;
+  float yh = upperf(y.x), yl = y.x - yh;
+  Sleef_float2 r;
+
+  r.x = x.x * y.x;
+  r.y = xh * yh - r.x + xl * yh + xh * yl + xl * yl + x.x * y.y + x.y * y.x;
+
+  return r;
+}
+
+static INLINE CONST float dfmul_f_f2_f2(Sleef_float2 x, Sleef_float2 y) {
+  float xh = upperf(x.x), xl = x.x - xh;
+  float yh = upperf(y.x), yl = y.x - yh;
+
+  return x.y * yh + xh * y.y + xl * yl + xh * yl + xl * yh + xh * yh;
+}
+
+static INLINE CONST Sleef_float2 dfsqu_f2_f2(Sleef_float2 x) {
+  float xh = upperf(x.x), xl = x.x - xh;
+  Sleef_float2 r;
+
+  r.x = x.x * x.x;
+  r.y = xh * xh - r.x + (xh + xh) * xl + xl * xl + x.x * (x.y + x.y);
+
+  return r;
+}
+
+static INLINE CONST float dfsqu_f_f2(Sleef_float2 x) {
+  float xh = upperf(x.x), xl = x.x - xh;
+
+  return xh * x.y + xh * x.y + xl * xl + (xh * xl + xh * xl) + xh * xh;
+}
+
+static INLINE CONST Sleef_float2 dfrec_f2_f(float d) {
+  float t = 1.0f / d;
+  float dh = upperf(d), dl = d - dh;
+  float th = upperf(t), tl = t - th;
+  Sleef_float2 q;
+
+  q.x = t;
+  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl);
+
+  return q;
+}
+
+static INLINE CONST Sleef_float2 dfrec_f2_f2(Sleef_float2 d) {
+  float t = 1.0f / d.x;
+  float dh = upperf(d.x), dl = d.x - dh;
+  float th = upperf(t  ), tl = t   - th;
+  Sleef_float2 q;
+
+  q.x = t;
+  q.y = t * (1 - dh * th - dh * tl - dl * th - dl * tl - d.y * t);
+
+  return q;
+}
+
+static INLINE CONST Sleef_float2 dfsqrt_f2_f2(Sleef_float2 d) {
+  float t = sqrtf(d.x + d.y);
+  return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f2_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5f);
+}
+
+static INLINE CONST Sleef_float2 dfsqrt_f2_f(float d) {
+  float t = sqrtf(d);
+  return dfscale_f2_f2_f(dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(t, t)), dfrec_f2_f(t)), 0.5);
+}
+
+//
+
+EXPORT CONST float xsinf(float d) {
+  int q;
+  float u, s, t = d;
+
+  q = (int)rintfk(d * (float)M_1_PI);
+
+  d = mlaf(q, -PI_Af, d);
+  d = mlaf(q, -PI_Bf, d);
+  d = mlaf(q, -PI_Cf, d);
+  d = mlaf(q, -PI_Df, d);
+
+  s = d * d;
+
+  if (floatToRawIntBits(d) == floatToRawIntBits(-0.0f)) s = -0.0f;
+  if ((q & 1) != 0) d = -d;
+
+  u = 2.6083159809786593541503e-06f;
+  u = mlaf(u, s, -0.0001981069071916863322258f);
+  u = mlaf(u, s, 0.00833307858556509017944336f);
+  u = mlaf(u, s, -0.166666597127914428710938f);
+
+  u = mlaf(s, u * d, d);
+
+  if (xisnegzerof(t) || fabsfk(t) > TRIGRANGEMAXf) u = -0.0f;
+  if (xisinff(t)) u = NANf;
+
+  return u;
+}
+
+EXPORT CONST float xsinf_u1(float d) {
+  int q;
+  float u;
+  Sleef_float2 s, t, x;
+
+  if (fabsfk(d) < TRIGRANGEMAX2f) {
+    q = (int)rintfk(d * (float)M_1_PI);
+    u = mlaf(q, -PI_A2f, d);
+    s = dfadd2_f2_f_f(u, q * (-PI_B2f));
+    s = dfadd_f2_f2_f(s, q * (-PI_C2f));
+  } else {
+    Sleef_float2 dfq = dfmul_f2_f2_f(df(M_1_PI, M_1_PI - (float)M_1_PI), d);
+    float t = rintfk(dfq.x * (1.0f / (1 << 16)));
+    dfq.y = rintfk(dfq.x - t * (1 << 16) + dfq.y);
+    q = (int)dfq.y;
+    dfq.x = t * (1 << 16);
+    dfq = dfnormalize_f2_f2(dfq);
+
+    s = dfadd2_f2_f_f2 (d, dfmul_f2_f2_f(dfq, -PI_A3f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_B3f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_C3f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_D3f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_E3f));
+    s = dfnormalize_f2_f2(s);
+  }
+
+  t = s;
+  s = dfsqu_f2_f2(s);
+
+  u = 2.6083159809786593541503e-06f;
+  u = mlaf(u, s.x, -0.0001981069071916863322258f);
+  u = mlaf(u, s.x, 0.00833307858556509017944336f);
+
+  x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s));
+
+  u = dfmul_f_f2_f2(t, x);
+
+  if ((q & 1) != 0) u = -u;
+  if (!xisinff(d) && (xisnegzerof(d) || fabsfk(d) > TRIGRANGEMAX3f)) u = -0.0f;
+
+  return u;
+}
+
+EXPORT CONST float xcosf(float d) {
+  int q;
+  float u, s, t = d;
+
+  q = 1 + 2*(int)rintfk(d * (float)M_1_PI - 0.5f);
+
+  d = mlaf(q, -PI_Af*0.5f, d);
+  d = mlaf(q, -PI_Bf*0.5f, d);
+  d = mlaf(q, -PI_Cf*0.5f, d);
+  d = mlaf(q, -PI_Df*0.5f, d);
+
+  s = d * d;
+
+  if ((q & 2) == 0) d = -d;
+
+  u = 2.6083159809786593541503e-06f;
+  u = mlaf(u, s, -0.0001981069071916863322258f);
+  u = mlaf(u, s, 0.00833307858556509017944336f);
+  u = mlaf(u, s, -0.166666597127914428710938f);
+
+  u = mlaf(s, u * d, d);
+
+  if (fabsfk(t) > TRIGRANGEMAXf) u = 1.0f;
+  if (xisinff(t)) u = NANf;
+
+  return u;
+}
+
+EXPORT CONST float xcosf_u1(float d) {
+  float u;
+  Sleef_float2 s, t, x;
+  int q;
+
+  d = fabsfk(d);
+
+  if (d < TRIGRANGEMAX2f) {
+    float dq = mlaf(rintfk(d * (float)M_1_PI - 0.5f), 2, 1);
+    q = (int)dq;
+    s = dfadd2_f2_f_f (d, dq * (-PI_A2f*0.5f));
+    s = dfadd2_f2_f2_f(s, dq * (-PI_B2f*0.5f));
+    s = dfadd2_f2_f2_f(s, dq * (-PI_C2f*0.5f));
+  } else {
+    Sleef_float2 dfq = dfadd2_f2_f2_f(dfmul_f2_f2_f(df(M_1_PI, M_1_PI - (float)M_1_PI), d), -0.5f);
+    float t = rintfk(dfq.x * (1.0f / (1 << 16)));
+    dfq.y = rintfk(dfq.x - t * (1 << 16) + dfq.y) * 2 + 1;
+    q = (int)dfq.y;
+    dfq.x = t * (1 << 17);
+    dfq = dfnormalize_f2_f2(dfq);
+
+    s = dfadd2_f2_f_f2 (d, dfmul_f2_f2_f(dfq, -PI_A3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_B3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_C3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_D3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_E3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+  }
+
+  t = s;
+  s = dfsqu_f2_f2(s);
+
+  u = 2.6083159809786593541503e-06f;
+  u = mlaf(u, s.x, -0.0001981069071916863322258f);
+  u = mlaf(u, s.x, 0.00833307858556509017944336f);
+
+  x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f(-0.166666597127914428710938f, u * s.x), s));
+
+  u = dfmul_f_f2_f2(t, x);
+
+  if ((((int)q) & 2) == 0) u = -u;
+  if (!xisinff(d) && d > TRIGRANGEMAX3f) u = 1.0f;
+  return u;
+}
+
+EXPORT CONST Sleef_float2 xsincosf(float d) {
+  int q;
+  float u, s, t;
+  Sleef_float2 r;
+
+  q = (int)rintfk(d * ((float)(2 * M_1_PI)));
+
+  s = d;
+
+  s = mlaf(q, -PI_Af*0.5f, s);
+  s = mlaf(q, -PI_Bf*0.5f, s);
+  s = mlaf(q, -PI_Cf*0.5f, s);
+  s = mlaf(q, -PI_Df*0.5f, s);
+
+  t = s;
+
+  s = s * s;
+
+  u = -0.000195169282960705459117889f;
+  u = mlaf(u, s, 0.00833215750753879547119141f);
+  u = mlaf(u, s, -0.166666537523269653320312f);
+  u = u * s * t;
+
+  r.x = t + u;
+
+  if (xisnegzerof(d)) r.x = -0.0f;
+
+  u = -2.71811842367242206819355e-07f;
+  u = mlaf(u, s, 2.47990446951007470488548e-05f);
+  u = mlaf(u, s, -0.00138888787478208541870117f);
+  u = mlaf(u, s, 0.0416666641831398010253906f);
+  u = mlaf(u, s, -0.5f);
+
+  r.y = u * s + 1;
+
+  if ((q & 1) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((q & 2) != 0) { r.x = -r.x; }
+  if (((q+1) & 2) != 0) { r.y = -r.y; }
+
+  if (fabsfk(d) > TRIGRANGEMAXf) { r.x = 0; r.y = 1; }
+  if (xisinff(d)) { r.x = r.y = NANf; }
+
+  return r;
+}
+
+EXPORT CONST Sleef_float2 xsincosf_u1(float d) {
+  int q;
+  float u;
+  Sleef_float2 r, s, t, x;
+
+  if (fabsfk(d) < TRIGRANGEMAX2f) {
+    q = (int)rintfk(d * (float)(2 * M_1_PI));
+    u = mlaf(q, -PI_A2f*0.5f, d);
+    s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f));
+    s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f));
+  } else {
+    Sleef_float2 dfq = dfmul_f2_f2_f(df((2 * M_1_PI), (2 * M_1_PI) - (float)(2 * M_1_PI)), d);
+    float t = rintfk(dfq.x * (1.0f / (1 << 16)));
+    dfq.y = rintfk(dfq.x - t * (1 << 16) + dfq.y);
+    q = (int)dfq.y;
+    dfq.x = t * (1 << 16);
+    dfq = dfnormalize_f2_f2(dfq);
+
+    s = dfadd2_f2_f_f2 (d, dfmul_f2_f2_f(dfq, -PI_A3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_B3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_C3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_D3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_E3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+  }
+
+  t = s;
+  s.x = dfsqu_f_f2(s);
+
+  u = -0.000195169282960705459117889f;
+  u = mlaf(u, s.x, 0.00833215750753879547119141f);
+  u = mlaf(u, s.x, -0.166666537523269653320312f);
+
+  u *= s.x * t.x;
+
+  x = dfadd_f2_f2_f(t, u);
+  r.x = x.x + x.y;
+  if (xisnegzerof(d)) r.x = -0.0f;
+
+  u = -2.71811842367242206819355e-07f;
+  u = mlaf(u, s.x, 2.47990446951007470488548e-05f);
+  u = mlaf(u, s.x, -0.00138888787478208541870117f);
+  u = mlaf(u, s.x, 0.0416666641831398010253906f);
+  u = mlaf(u, s.x, -0.5f);
+
+  x = dfadd_f2_f_f2(1, dfmul_f2_f_f(s.x, u));
+  r.y = x.x + x.y;
+
+  if ((q & 1) != 0) { u = r.y; r.y = r.x; r.x = u; }
+  if ((q & 2) != 0) { r.x = -r.x; }
+  if (((q+1) & 2) != 0) { r.y = -r.y; }
+
+  if (fabsfk(d) > TRIGRANGEMAX3f) { r.x = 0; r.y = 1; }
+  if (xisinff(d)) { r.x = r.y = NAN; }
+
+  return r;
+}
+
+EXPORT CONST Sleef_float2 xsincospif_u05(float d) {
+  float u, s, t;
+  Sleef_float2 r, x, s2;
+
+  u = d * 4;
+  int q = ceilfk(u) & ~(int)1;
+
+  s = u - (float)q;
+  t = s;
+  s = s * s;
+  s2 = dfmul_f2_f_f(t, t);
+
+  //
+
+  u = +0.3093842054e-6;
+  u = mlaf(u, s, -0.3657307388e-4);
+  u = mlaf(u, s, +0.2490393585e-2);
+  x = dfadd2_f2_f_f2(u * s, df(-0.080745510756969451904, -1.3373665339076936258e-09));
+  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(0.78539818525314331055, -2.1857338617566484855e-08));
+
+  x = dfmul_f2_f2_f(x, t);
+  r.x = x.x + x.y;
+  if (xisnegzerof(d)) r.x = -0.0f;
+
+  u = -0.2430611801e-7;
+  u = mlaf(u, s, +0.3590577080e-5);
+  u = mlaf(u, s, -0.3259917721e-3);
+  x = dfadd2_f2_f_f2(u * s, df(0.015854343771934509277, 4.4940051354032242811e-10));
+  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), df(-0.30842512845993041992, -9.0728339030733922277e-09));
+
+  x = dfadd2_f2_f2_f(dfmul_f2_f2_f2(x, s2), 1);
+  r.y = x.x + x.y;
+
+  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((q & 4) != 0) { r.x = -r.x; }
+  if (((q+2) & 4) != 0) { r.y = -r.y; }
+
+  if (fabsfk(d) > TRIGRANGEMAXf/4) { r.x = 0; r.y = 1; }
+  if (xisinff(d)) { r.x = r.y = NANf; }
+
+  return r;
+}
+
+EXPORT CONST Sleef_float2 xsincospif_u35(float d) {
+  float u, s, t;
+  Sleef_float2 r;
+
+  u = d * 4;
+  int q = ceilfk(u) & ~(int)1;
+
+  s = u - (float)q;
+  t = s;
+  s = s * s;
+
+  //
+
+  u = -0.3600925265e-4;
+  u = mlaf(u, s, +0.2490088111e-2);
+  u = mlaf(u, s, -0.8074551076e-1);
+  u = mlaf(u, s, +0.7853981853e+0);
+
+  r.x = u * t;
+
+  u = +0.3539815225e-5;
+  u = mlaf(u, s, -0.3259574005e-3);
+  u = mlaf(u, s, +0.1585431583e-1);
+  u = mlaf(u, s, -0.3084251285e+0);
+  u = mlaf(u, s, 1);
+
+  r.y = u;
+
+  if ((q & 2) != 0) { s = r.y; r.y = r.x; r.x = s; }
+  if ((q & 4) != 0) { r.x = -r.x; }
+  if (((q+2) & 4) != 0) { r.y = -r.y; }
+
+  if (fabsfk(d) > TRIGRANGEMAXf/4) { r.x = 0; r.y = 1; }
+  if (xisinff(d)) { r.x = r.y = NANf; }
+
+  return r;
+}
+
+EXPORT CONST float xtanf(float d) {
+  int q;
+  float u, s, x;
+
+  q = (int)rintfk(d * (float)(2 * M_1_PI));
+
+  x = d;
+
+  x = mlaf(q, -PI_Af*0.5f, x);
+  x = mlaf(q, -PI_Bf*0.5f, x);
+  x = mlaf(q, -PI_Cf*0.5f, x);
+  x = mlaf(q, -PI_Df*0.5f, x);
+
+  s = x * x;
+
+  if ((q & 1) != 0) x = -x;
+
+  u = 0.00927245803177356719970703f;
+  u = mlaf(u, s, 0.00331984995864331722259521f);
+  u = mlaf(u, s, 0.0242998078465461730957031f);
+  u = mlaf(u, s, 0.0534495301544666290283203f);
+  u = mlaf(u, s, 0.133383005857467651367188f);
+  u = mlaf(u, s, 0.333331853151321411132812f);
+
+  u = mlaf(s, u * x, x);
+
+  if ((q & 1) != 0) u = 1.0f / u;
+
+  if (xisinff(d)) u = NANf;
+
+  return u;
+}
+
+EXPORT CONST float xtanf_u1(float d) {
+  int q;
+  float u;
+  Sleef_float2 s, t, x;
+
+  if (fabsfk(d) < TRIGRANGEMAX2f) {
+    q = (int)rintfk(d * (float)(2 * M_1_PI));
+    u = mlaf(q, -PI_A2f*0.5f, d);
+    s = dfadd2_f2_f_f(u, q * (-PI_B2f*0.5f));
+    s = dfadd_f2_f2_f(s, q * (-PI_C2f*0.5f));
+  } else {
+    Sleef_float2 dfq = dfmul_f2_f2_f(df((2 * M_1_PI), (2 * M_1_PI) - (float)(2 * M_1_PI)), d);
+    float t = rintfk(dfq.x * (1.0f / (1 << 16)));
+    dfq.y = rintfk(dfq.x - t * (1 << 16) + dfq.y);
+    q = (int)dfq.y;
+    dfq.x = t * (1 << 16);
+    dfq = dfnormalize_f2_f2(dfq);
+
+    s = dfadd2_f2_f_f2 (d, dfmul_f2_f2_f(dfq, -PI_A3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_B3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_C3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_D3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+    s = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f(dfq, -PI_E3f*0.5f));
+    s = dfnormalize_f2_f2(s);
+  }
+
+  if ((q & 1) != 0) s = dfneg_f2_f2(s);
+
+  t = s;
+  s = dfsqu_f2_f2(s);
+  s = dfnormalize_f2_f2(s);
+
+  u = 0.00446636462584137916564941f;
+  u = mlaf(u, s.x, -8.3920182078145444393158e-05f);
+  u = mlaf(u, s.x, 0.0109639242291450500488281f);
+  u = mlaf(u, s.x, 0.0212360303848981857299805f);
+  u = mlaf(u, s.x, 0.0540687143802642822265625f);
+
+  x = dfadd_f2_f_f(0.133325666189193725585938f, u * s.x);
+  x = dfadd_f2_f_f2(1, dfmul_f2_f2_f2(dfadd_f2_f_f2(0.33333361148834228515625f, dfmul_f2_f2_f2(s, x)), s));
+  x = dfmul_f2_f2_f2(t, x);
+
+  if ((q & 1) != 0) x = dfrec_f2_f2(x);
+
+  u = x.x + x.y;
+
+  if (!xisinff(d) && (xisnegzerof(d) || fabsfk(d) > TRIGRANGEMAX3f)) u = -0.0f;
+
+  return u;
+}
+
+EXPORT CONST float xatanf(float s) {
+  float t, u;
+  int q = 0;
+
+  if (signf(s) == -1) { s = -s; q = 2; }
+  if (s > 1) { s = 1.0f / s; q |= 1; }
+
+  t = s * s;
+
+  u = 0.00282363896258175373077393f;
+  u = mlaf(u, t, -0.0159569028764963150024414f);
+  u = mlaf(u, t, 0.0425049886107444763183594f);
+  u = mlaf(u, t, -0.0748900920152664184570312f);
+  u = mlaf(u, t, 0.106347933411598205566406f);
+  u = mlaf(u, t, -0.142027363181114196777344f);
+  u = mlaf(u, t, 0.199926957488059997558594f);
+  u = mlaf(u, t, -0.333331018686294555664062f);
+
+  t = s + s * (t * u);
+
+  if ((q & 1) != 0) t = 1.570796326794896557998982f - t;
+  if ((q & 2) != 0) t = -t;
+
+  return t;
+}
+
+static INLINE CONST float atan2kf(float y, float x) {
+  float s, t, u;
+  int q = 0;
+
+  if (x < 0) { x = -x; q = -2; }
+  if (y > x) { t = x; x = y; y = -t; q += 1; }
+
+  s = y / x;
+  t = s * s;
+
+  u = 0.00282363896258175373077393f;
+  u = mlaf(u, t, -0.0159569028764963150024414f);
+  u = mlaf(u, t, 0.0425049886107444763183594f);
+  u = mlaf(u, t, -0.0748900920152664184570312f);
+  u = mlaf(u, t, 0.106347933411598205566406f);
+  u = mlaf(u, t, -0.142027363181114196777344f);
+  u = mlaf(u, t, 0.199926957488059997558594f);
+  u = mlaf(u, t, -0.333331018686294555664062f);
+
+  t = u * t * s + s;
+  t = q * (float)(M_PI/2) + t;
+
+  return t;
+}
+
+EXPORT CONST float xatan2f(float y, float x) {
+  float r = atan2kf(fabsfk(y), x);
+
+  r = mulsignf(r, x);
+  if (xisinff(x) || x == 0) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI  /2)) : 0);
+  if (xisinff(y)          ) r = M_PIf/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0);
+  if (              y == 0) r = (signf(x) == -1 ? M_PIf : 0);
+
+  return xisnanf(x) || xisnanf(y) ? NANf : mulsignf(r, y);
+}
+
+EXPORT CONST float xasinf(float d) {
+  int o = fabsfk(d) < 0.5f;
+  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), x = o ? fabsfk(d) : sqrtf(x2), u;
+
+  u = +0.4197454825e-1;
+  u = mlaf(u, x2, +0.2424046025e-1);
+  u = mlaf(u, x2, +0.4547423869e-1);
+  u = mlaf(u, x2, +0.7495029271e-1);
+  u = mlaf(u, x2, +0.1666677296e+0);
+  u = mlaf(u, x * x2, x);
+
+  float r = o ? u : (M_PIf/2 - 2*u);
+  r = mulsignf(r, d);
+
+  return r;
+}
+
+EXPORT CONST float xacosf(float d) {
+  int o = fabsfk(d) < 0.5f;
+  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;
+  float x = o ? fabsfk(d) : sqrtf(x2);
+  x = fabsfk(d) == 1.0 ? 0 : x;
+
+  u = +0.4197454825e-1;
+  u = mlaf(u, x2, +0.2424046025e-1);
+  u = mlaf(u, x2, +0.4547423869e-1);
+  u = mlaf(u, x2, +0.7495029271e-1);
+  u = mlaf(u, x2, +0.1666677296e+0);
+
+  u *= x * x2;
+
+  float y = 3.1415926535897932f/2 - (mulsignf(x, d) + mulsignf(u, d));
+  x += u;
+  float r = o ? y : (x*2);
+  if (!o && d < 0) r = dfadd_f2_f2_f(df(3.1415927410125732422f,-8.7422776573475857731e-08f), -r).x;
+
+  return r;
+}
+
+static Sleef_float2 atan2kf_u1(Sleef_float2 y, Sleef_float2 x) {
+  float u;
+  Sleef_float2 s, t;
+  int q = 0;
+
+  if (x.x < 0) { x.x = -x.x; x.y = -x.y; q = -2; }
+  if (y.x > x.x) { t = x; x = y; y.x = -t.x; y.y = -t.y; q += 1; }
+
+  s = dfdiv_f2_f2_f2(y, x);
+  t = dfsqu_f2_f2(s);
+  t = dfnormalize_f2_f2(t);
+
+  u = -0.00176397908944636583328247f;
+  u = mlaf(u, t.x, 0.0107900900766253471374512f);
+  u = mlaf(u, t.x, -0.0309564601629972457885742f);
+  u = mlaf(u, t.x, 0.0577365085482597351074219f);
+  u = mlaf(u, t.x, -0.0838950723409652709960938f);
+  u = mlaf(u, t.x, 0.109463557600975036621094f);
+  u = mlaf(u, t.x, -0.142626821994781494140625f);
+  u = mlaf(u, t.x, 0.199983194470405578613281f);
+
+  t = dfmul_f2_f2_f2(t, dfadd_f2_f_f(-0.333332866430282592773438f, u * t.x));
+  t = dfmul_f2_f2_f2(s, dfadd_f2_f_f2(1, t));
+  t = dfadd2_f2_f2_f2(dfmul_f2_f2_f(df(1.5707963705062866211f, -4.3711388286737928865e-08f), q), t);
+
+  return t;
+}
+
+EXPORT CONST float xatan2f_u1(float y, float x) {
+  if (fabsfk(x) < 2.9387372783541830947e-39f) { y *= (1ULL << 24); x *= (1ULL << 24); } // nexttowardf((1.0 / FLT_MAX), 1)
+  Sleef_float2 d = atan2kf_u1(df(fabsfk(y), 0), df(x, 0));
+  float r = d.x + d.y;
+
+  r = mulsignf(r, x);
+  if (xisinff(x) || x == 0) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI  /2)) : 0.0f);
+  if (xisinff(y)          ) r = (float)M_PI/2 - (xisinff(x) ? (signf(x) * (float)(M_PI*1/4)) : 0.0f);
+  if (              y == 0) r = (signf(x) == -1 ? (float)M_PI : 0.0f);
+
+  return xisnanf(x) || xisnanf(y) ? NANf : mulsignf(r, y);
+}
+
+EXPORT CONST float xasinf_u1(float d) {
+  int o = fabsfk(d) < 0.5f;
+  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;
+  Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2);
+  x = fabsfk(d) == 1.0f ? df(0, 0) : x;
+
+  u = +0.4197454825e-1;
+  u = mlaf(u, x2, +0.2424046025e-1);
+  u = mlaf(u, x2, +0.4547423869e-1);
+  u = mlaf(u, x2, +0.7495029271e-1);
+  u = mlaf(u, x2, +0.1666677296e+0);
+  u *= x2 * x.x;
+
+  Sleef_float2 y = dfadd_f2_f2_f(dfsub_f2_f2_f2(df(3.1415927410125732422f/4,-8.7422776573475857731e-08f/4), x), -u);
+  float r = o ? (u + x.x) : ((y.x + y.y)*2);
+  r = mulsignf(r, d);
+
+  return r;
+}
+
+EXPORT CONST float xacosf_u1(float d) {
+  int o = fabsfk(d) < 0.5f;
+  float x2 = o ? (d*d) : ((1-fabsfk(d))*0.5f), u;
+  Sleef_float2 x = o ? df(fabsfk(d), 0) : dfsqrt_f2_f(x2);
+  x = fabs(d) == 1.0 ? df(0, 0) : x;
+
+  u = +0.4197454825e-1;
+  u = mlaf(u, x2, +0.2424046025e-1);
+  u = mlaf(u, x2, +0.4547423869e-1);
+  u = mlaf(u, x2, +0.7495029271e-1);
+  u = mlaf(u, x2, +0.1666677296e+0);
+
+  u = u * x.x * x2;
+
+  Sleef_float2 y = dfsub_f2_f2_f2(df(3.1415927410125732422f/2,-8.7422776573475857731e-08f/2),
+                                  dfadd_f2_f_f(mulsignf(x.x, d), mulsignf(u, d)));
+  x = dfadd_f2_f2_f(x, u);
+  y = o ? y : dfscale_f2_f2_f(x, 2);
+  if (!o && d < 0) y = dfsub_f2_f2_f2(df(3.1415927410125732422f,-8.7422776573475857731e-08f), y);
+
+  return y.x + y.y;
+}
+
+EXPORT CONST float xatanf_u1(float d) {
+  Sleef_float2 d2 = atan2kf_u1(df(fabsfk(d), 0.0f), df(1.0f, 0.0f));
+  float r = d2.x + d2.y;
+  if (xisinff(d)) r = 1.570796326794896557998982f;
+  return mulsignf(r, d);
+}
+
+EXPORT CONST float xlogf(float d) {
+  float x, x2, t, m;
+  int e;
+
+  int o = d < FLT_MIN;
+  if (o) d *= (float)(1LL << 32) * (float)(1LL << 32);
+
+  e = ilogb2kf(d * (1.0f/0.75f));
+  m = ldexp3kf(d, -e);
+
+  if (o) e -= 64;
+
+  x = (m-1.0f) / (m+1.0f);
+  x2 = x * x;
+
+  t = 0.2392828464508056640625f;
+  t = mlaf(t, x2, 0.28518211841583251953125f);
+  t = mlaf(t, x2, 0.400005877017974853515625f);
+  t = mlaf(t, x2, 0.666666686534881591796875f);
+  t = mlaf(t, x2, 2.0f);
+
+  x = x * t + 0.693147180559945286226764f * e;
+
+  if (xisinff(d)) x = INFINITYf;
+  if (d < 0 || xisnanf(d)) x = NANf;
+  if (d == 0) x = -INFINITYf;
+
+  return x;
+}
+
+EXPORT CONST float xexpf(float d) {
+  int q = (int)rintfk(d * R_LN2f);
+  float s, u;
+
+  s = mlaf(q, -L2Uf, d);
+  s = mlaf(q, -L2Lf, s);
+
+  u = 0.000198527617612853646278381;
+  u = mlaf(u, s, 0.00139304355252534151077271);
+  u = mlaf(u, s, 0.00833336077630519866943359);
+  u = mlaf(u, s, 0.0416664853692054748535156);
+  u = mlaf(u, s, 0.166666671633720397949219);
+  u = mlaf(u, s, 0.5);
+
+  u = s * s * u + s + 1.0f;
+  u = ldexp2kf(u, q);
+
+  if (d < -104) u = 0;
+  if (d >  104) u = INFINITYf;
+
+  return u;
+}
+
+static INLINE CONST float expkf(Sleef_float2 d) {
+  int q = (int)rintfk((d.x + d.y) * R_LN2f);
+  Sleef_float2 s, t;
+  float u;
+
+  s = dfadd2_f2_f2_f(d, q * -L2Uf);
+  s = dfadd2_f2_f2_f(s, q * -L2Lf);
+
+  s = dfnormalize_f2_f2(s);
+
+  u = 0.00136324646882712841033936f;
+  u = mlaf(u, s.x, 0.00836596917361021041870117f);
+  u = mlaf(u, s.x, 0.0416710823774337768554688f);
+  u = mlaf(u, s.x, 0.166665524244308471679688f);
+  u = mlaf(u, s.x, 0.499999850988388061523438f);
+
+  t = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfsqu_f2_f2(s), u));
+
+  t = dfadd_f2_f_f2(1, t);
+
+  u = ldexpkf(t.x + t.y, q);
+
+  if (d.x < -104) u = 0;
+
+  return u;
+}
+
+static INLINE CONST Sleef_float2 logkf(float d) {
+  Sleef_float2 x, x2, s;
+  float m, t;
+  int e;
+
+  int o = d < FLT_MIN;
+  if (o) d *= (float)(1LL << 32) * (float)(1LL << 32);
+
+  e = ilogb2kf(d * (1.0f/0.75f));
+  m = ldexp3kf(d, -e);
+
+  if (o) e -= 64;
+
+  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));
+  x2 = dfsqu_f2_f2(x);
+
+  t = 0.240320354700088500976562;
+  t = mlaf(t, x2.x, 0.285112679004669189453125);
+  t = mlaf(t, x2.x, 0.400007992982864379882812);
+  Sleef_float2 c = df(0.66666662693023681640625f, 3.69183861259614332084311e-09f);
+
+  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e);
+  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));
+  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(dfmul_f2_f2_f2(x2, x),
+                                      dfadd2_f2_f2_f2(dfmul_f2_f2_f(x2, t), c)));
+  return s;
+}
+
+EXPORT CONST float xlogf_u1(float d) {
+  Sleef_float2 x, s;
+  float m, t, x2;
+  int e;
+
+  int o = d < FLT_MIN;
+  if (o) d *= (float)(1LL << 32) * (float)(1LL << 32);
+
+  e = ilogb2kf(d * (1.0f/0.75f));
+  m = ldexp3kf(d, -e);
+
+  if (o) e -= 64;
+
+  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));
+  x2 = x.x * x.x;
+
+  t = +0.3027294874e+0f;
+  t = mlaf(t, x2, +0.3996108174e+0f);
+  t = mlaf(t, x2, +0.6666694880e+0f);
+
+  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e);
+  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));
+  s = dfadd_f2_f2_f(s, x2 * x.x * t);
+
+  float r = s.x + s.y;
+
+  if (xisinff(d)) r = INFINITYf;
+  if (d < 0 || xisnanf(d)) r = NANf;
+  if (d == 0) r = -INFINITYf;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 expk2f(Sleef_float2 d) {
+  int q = (int)rintfk((d.x + d.y) * R_LN2f);
+  Sleef_float2 s, t;
+  float u;
+
+  s = dfadd2_f2_f2_f(d, q * -L2Uf);
+  s = dfadd2_f2_f2_f(s, q * -L2Lf);
+
+  u = +0.1980960224e-3f;
+  u = mlaf(u, s.x, +0.1394256484e-2f);
+  u = mlaf(u, s.x, +0.8333456703e-2f);
+  u = mlaf(u, s.x, +0.4166637361e-1f);
+
+  t = dfadd2_f2_f2_f(dfmul_f2_f2_f(s, u), +0.166666659414234244790680580464e+0f);
+  t = dfadd2_f2_f2_f(dfmul_f2_f2_f2(s, t), 0.5);
+  t = dfadd2_f2_f2_f2(s, dfmul_f2_f2_f2(dfsqu_f2_f2(s), t));
+
+  t = dfadd2_f2_f_f2(1, t);
+
+  t.x = ldexp2kf(t.x, q);
+  t.y = ldexp2kf(t.y, q);
+
+  return d.x < -104 ? df(0, 0) : t;
+}
+
+EXPORT CONST float xpowf(float x, float y) {
+  int yisint = (y == (int)y) || (fabsfk(y) >= (float)(1LL << 24));
+  int yisodd = (1 & (int)y) != 0 && yisint && fabsfk(y) < (float)(1LL << 24);
+
+  float result = expkf(dfmul_f2_f2_f(logkf(fabsfk(x)), y));
+
+  result = xisnanf(result) ? INFINITYf : result;
+  result *=  (x >= 0 ? 1 : (!yisint ? NANf : (yisodd ? -1 : 1)));
+
+  float efx = mulsignf(fabsfk(x) - 1, y);
+  if (xisinff(y)) result = efx < 0 ? 0.0f : (efx == 0 ? 1.0f : INFINITYf);
+  if (xisinff(x) || x == 0) result = (yisodd ? signf(x) : 1) * ((x == 0 ? -y : y) < 0 ? 0 : INFINITYf);
+  if (xisnanf(x) || xisnanf(y)) result = NANf;
+  if (y == 0 || x == 1) result = 1;
+
+  return result;
+}
+
+EXPORT CONST float xpownf(float x, int y) {
+  return xpowf(x, (float)y);
+}
+
+EXPORT CONST float xpowrf(float x, float y) {
+  if (x < 0.0f)
+    return NAN;
+  if (isnan(y))
+    return y;
+  return xpowf(x, y);
+}
+
+
+EXPORT CONST float xsinhf(float x) {
+  float y = fabsfk(x);
+  Sleef_float2 d = expk2f(df(y, 0));
+  d = dfsub_f2_f2_f2(d, dfrec_f2_f2(d));
+  y = (d.x + d.y) * 0.5f;
+
+  y = fabsfk(x) > 89 ? INFINITYf : y;
+  y = xisnanf(y) ? INFINITYf : y;
+  y = mulsignf(y, x);
+  y = xisnanf(x) ? NANf : y;
+
+  return y;
+}
+
+EXPORT CONST float xcoshf(float x) {
+  float y = fabsfk(x);
+  Sleef_float2 d = expk2f(df(y, 0));
+  d = dfadd_f2_f2_f2(d, dfrec_f2_f2(d));
+  y = (d.x + d.y) * 0.5f;
+
+  y = fabsfk(x) > 89 ? INFINITYf : y;
+  y = xisnanf(y) ? INFINITYf : y;
+  y = xisnanf(x) ? NANf : y;
+
+  return y;
+}
+
+EXPORT CONST float xtanhf(float x) {
+  float y = fabsfk(x);
+  Sleef_float2 d = expk2f(df(y, 0));
+  Sleef_float2 e = dfrec_f2_f2(d);
+  d = dfdiv_f2_f2_f2(dfsub_f2_f2_f2(d, e), dfadd_f2_f2_f2(d, e));
+  y = d.x + d.y;
+
+  y = fabsfk(x) > 18.714973875f ? 1.0f : y;
+  y = xisnanf(y) ? 1.0f : y;
+  y = mulsignf(y, x);
+  y = xisnanf(x) ? NANf : y;
+
+  return y;
+}
+
+static INLINE CONST Sleef_float2 logk2f(Sleef_float2 d) {
+  Sleef_float2 x, x2, m, s;
+  float t;
+  int e;
+
+  e = ilogbkf(d.x * (1.0f/0.75f));
+  m = dfscale_f2_f2_f(d, pow2if(-e));
+
+  x = dfdiv_f2_f2_f2(dfadd2_f2_f2_f(m, -1), dfadd2_f2_f2_f(m, 1));
+  x2 = dfsqu_f2_f2(x);
+
+  t = 0.2392828464508056640625f;
+  t = mlaf(t, x2.x, 0.28518211841583251953125f);
+  t = mlaf(t, x2.x, 0.400005877017974853515625f);
+  t = mlaf(t, x2.x, 0.666666686534881591796875f);
+
+  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), e);
+  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));
+  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f(dfmul_f2_f2_f2(x2, x), t));
+
+  return s;
+}
+
+EXPORT CONST float xasinhf(float x) {
+  float y = fabsfk(x);
+  Sleef_float2 d;
+
+  d = y > 1 ? dfrec_f2_f(x) : df(y, 0);
+  d = dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(d), 1));
+  d = y > 1 ? dfmul_f2_f2_f(d, y) : d;
+
+  d = logk2f(dfnormalize_f2_f2(dfadd_f2_f2_f(d, x)));
+  y = d.x + d.y;
+
+  y = (fabsfk(x) > SQRT_FLT_MAX || xisnanf(y)) ? mulsignf(INFINITYf, x) : y;
+  y = xisnanf(x) ? NANf : y;
+  y = xisnegzerof(x) ? -0.0f : y;
+
+  return y;
+}
+
+EXPORT CONST float xacoshf(float x) {
+  Sleef_float2 d = logk2f(dfadd2_f2_f2_f(dfmul_f2_f2_f2(dfsqrt_f2_f2(dfadd2_f2_f_f(x, 1)), dfsqrt_f2_f2(dfadd2_f2_f_f(x, -1))), x));
+  float y = d.x + d.y;
+
+  y = (x > SQRT_FLT_MAX || xisnanf(y)) ? INFINITYf : y;
+  y = x == 1.0f ? 0.0f : y;
+  y = x < 1.0f ? NANf : y;
+  y = xisnanf(x) ? NANf : y;
+
+  return y;
+}
+
+EXPORT CONST float xatanhf(float x) {
+  float y = fabsfk(x);
+  Sleef_float2 d = logk2f(dfdiv_f2_f2_f2(dfadd2_f2_f_f(1, y), dfadd2_f2_f_f(1, -y)));
+  y = y > 1.0f ? NANf : (y == 1.0f ? INFINITYf : (d.x + d.y) * 0.5f);
+
+  y = xisinff(x) || xisnanf(y) ? NANf : y;
+  y = mulsignf(y, x);
+  y = xisnanf(x) ? NANf : y;
+
+  return y;
+}
+
+EXPORT CONST float xexp2f(float d) {
+  int q = (int)rintfk(d);
+  float s, u;
+
+  s = d - q;
+
+  u = +0.1535920892e-3;
+  u = mlaf(u, s, +0.1339262701e-2);
+  u = mlaf(u, s, +0.9618384764e-2);
+  u = mlaf(u, s, +0.5550347269e-1);
+  u = mlaf(u, s, +0.2402264476e+0);
+  u = mlaf(u, s, +0.6931471825e+0);
+  u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f_f(u, s))).x;
+
+  u = ldexp2kf(u, q);
+
+  if (d >= 128) u = INFINITYf;
+  if (d < -150) u = 0;
+  
+  return u;
+}
+
+EXPORT CONST float xexp10f(float d) {
+  int q = (int)rintfk(d * (float)LOG10_2);
+  float s, u;
+  
+  s = mlaf(q, -L10Uf, d);
+  s = mlaf(q, -L10Lf, s);
+  
+  u = +0.2064004987e+0;
+  u = mlaf(u, s, +0.5417877436e+0);
+  u = mlaf(u, s, +0.1171286821e+1);
+  u = mlaf(u, s, +0.2034656048e+1);
+  u = mlaf(u, s, +0.2650948763e+1);
+  u = mlaf(u, s, +0.2302585125e+1);
+  u = dfnormalize_f2_f2(dfadd_f2_f_f2(1, dfmul_f2_f_f(u, s))).x;
+
+  u = ldexp2kf(u, q);
+
+  if (d > 38.5318394191036238941387f) u = INFINITYf; // log10(FLT_MAX)
+  if (d < -50) u = 0;
+  
+  return u;
+}
+
+EXPORT CONST float xexpm1f(float a) {
+  Sleef_float2 d = dfadd2_f2_f2_f(expk2f(df(a, 0)), -1.0f);
+  float x = d.x + d.y;
+  if (a > 88.72283172607421875f) x = INFINITYf;
+  if (a < -16.635532333438687426013570f) x = -1;
+  if (xisnegzerof(a)) x = -0.0f;
+  return x;
+}
+
+EXPORT CONST float xlog10f(float d) {
+  Sleef_float2 x, s;
+  float m, t, x2;
+  int e;
+
+  int o = d < FLT_MIN;
+  if (o) d *= (float)(1LL << 32) * (float)(1LL << 32);
+      
+  e = ilogb2kf(d * (1.0f/0.75f));
+  m = ldexp3kf(d, -e);
+
+  if (o) e -= 64;
+
+  x = dfdiv_f2_f2_f2(dfadd2_f2_f_f(-1, m), dfadd2_f2_f_f(1, m));
+  x2 = x.x * x.x;
+
+  t = +0.1314289868e+0;
+  t = mlaf(t, x2, +0.1735493541e+0);
+  t = mlaf(t, x2, +0.2895309627e+0);
+    
+  s = dfmul_f2_f2_f(df(0.30103001, -1.432098889e-08), (float)e);
+  s = dfadd_f2_f2_f2(s, dfmul_f2_f2_f2(x, df(0.868588984, -2.170757285e-08)));
+  s = dfadd_f2_f2_f(s, x2 * x.x * t);
+
+  float r = s.x + s.y;
+  
+  if (xisinff(d)) r = INFINITYf;
+  if (d < 0 || xisnanf(d)) r = NANf;
+  if (d == 0) r = -INFINITYf;
+
+  return r;
+}
+
+static INLINE CONST float xlog1pf_fast(float d) {
+  Sleef_float2 x, s;
+  float m, t, x2;
+  int e;
+
+  float dp1 = d + 1;
+
+  int o = dp1 < FLT_MIN;
+  if (o) dp1 *= (float)(1LL << 32) * (float)(1LL << 32);
+
+  e = ilogb2kf(dp1 * (1.0f/0.75f));
+
+  t = ldexp3kf(1, -e);
+  m = mlaf(d, t, t-1);
+
+  if (o) e -= 64;
+
+  x = dfdiv_f2_f2_f2(df(m, 0), dfadd_f2_f_f(2, m));
+  x2 = x.x * x.x;
+
+  t = +0.3027294874e+0f;
+  t = mlaf(t, x2, +0.3996108174e+0f);
+  t = mlaf(t, x2, +0.6666694880e+0f);
+
+  s = dfmul_f2_f2_f(df(0.69314718246459960938f, -1.904654323148236017e-09f), (float)e);
+  s = dfadd_f2_f2_f2(s, dfscale_f2_f2_f(x, 2));
+  s = dfadd_f2_f2_f(s, x2 * x.x * t);
+
+  float r = s.x + s.y;
+
+  if (d == INFINITYf) r = INFINITYf;
+  if (d < -1) r = NANf;
+  if (d == -1) r = -INFINITYf;
+  if (xisnegzerof(d)) r = -0.0f;
+  if (xisnanf(d)) r = d;
+
+  return r;
+}
+
+EXPORT CONST float xlog1pf(float a) {
+  if (a > 0x1.0p+125)
+    return xlogf(a);
+  else
+    return xlog1pf_fast(a);
+}
+
+//
+
+EXPORT CONST float xcbrtf(float d) {
+  float x, y, q = 1.0f;
+  int e, r;
+
+  e = ilogbkf(fabsfk(d))+1;
+  d = ldexp2kf(d, -e);
+  r = (e + 6144) % 3;
+  q = (r == 1) ? 1.2599210498948731647672106f : q;
+  q = (r == 2) ? 1.5874010519681994747517056f : q;
+  q = ldexp2kf(q, (e + 6144) / 3 - 2048);
+
+  q = mulsignf(q, d);
+  d = fabsfk(d);
+
+  x = -0.601564466953277587890625f;
+  x = mlaf(x, d, 2.8208892345428466796875f);
+  x = mlaf(x, d, -5.532182216644287109375f);
+  x = mlaf(x, d, 5.898262500762939453125f);
+  x = mlaf(x, d, -3.8095417022705078125f);
+  x = mlaf(x, d, 2.2241256237030029296875f);
+
+  y = d * x * x;
+  y = (y - (2.0f / 3.0f) * y * (y * x - 1.0f)) * q;
+
+  return y;
+}
+
+EXPORT CONST float xcbrtf_u1(float d) {
+  float x, y, z;
+  Sleef_float2 q2 = df(1, 0), u, v;
+  int e, r;
+
+  e = ilogbkf(fabsfk(d))+1;
+  d = ldexp2kf(d, -e);
+  r = (e + 6144) % 3;
+  q2 = (r == 1) ? df(1.2599210739135742188, -2.4018701694217270415e-08) : q2;
+  q2 = (r == 2) ? df(1.5874010324478149414,  1.9520385308169352356e-08) : q2;
+
+  q2.x = mulsignf(q2.x, d); q2.y = mulsignf(q2.y, d);
+  d = fabsfk(d);
+
+  x = -0.601564466953277587890625f;
+  x = mlaf(x, d, 2.8208892345428466796875f);
+  x = mlaf(x, d, -5.532182216644287109375f);
+  x = mlaf(x, d, 5.898262500762939453125f);
+  x = mlaf(x, d, -3.8095417022705078125f);
+  x = mlaf(x, d, 2.2241256237030029296875f);
+
+  y = x * x; y = y * y; x -= (d * y - x) * (1.0 / 3.0f);
+
+  z = x;
+
+  u = dfmul_f2_f_f(x, x);
+  u = dfmul_f2_f2_f2(u, u);
+  u = dfmul_f2_f2_f(u, d);
+  u = dfadd2_f2_f2_f(u, -x);
+  y = u.x + u.y;
+
+  y = -2.0 / 3.0 * y * z;
+  v = dfadd2_f2_f2_f(dfmul_f2_f_f(z, z), y);
+  v = dfmul_f2_f2_f(v, d);
+  v = dfmul_f2_f2_f2(v, q2);
+  z = ldexp2kf(v.x + v.y, (e + 6144) / 3 - 2048);
+
+  if (xisinff(d)) { z = mulsignf(INFINITYf, q2.x); }
+  if (d == 0) { z = mulsignf(0, q2.x); }
+
+  return z;
+}
+
+//
+
+EXPORT CONST float xfabsf(float x) { return fabsfk(x); }
+
+EXPORT CONST float xcopysignf(float x, float y) { return copysignfk(x, y); }
+
+EXPORT CONST float xfmaxf(float x, float y) {
+  return y != y ? x : (x > y ? x : y);
+}
+
+EXPORT CONST float xfminf(float x, float y) {
+  return y != y ? x : (x < y ? x : y);
+}
+
+EXPORT CONST float xfdimf(float x, float y) {
+  float ret = x - y;
+  if (ret < 0 || x == y) ret = 0;
+  return ret;
+}
+
+EXPORT CONST float xtruncf(float x) {
+  float fr = x - (int32_t)x;
+  return (xisinff(x) || fabsfk(x) >= (float)(1LL << 23)) ? x : copysignfk(x - fr, x);
+}
+
+EXPORT CONST float xfloorf(float x) {
+  float fr = x - (int32_t)x;
+  fr = fr < 0 ? fr+1.0f : fr;
+  return (xisinff(x) || fabsfk(x) >= (float)(1LL << 23)) ? x : copysignfk(x - fr, x);
+}
+
+EXPORT CONST float xceilf(float x) {
+  float fr = x - (int32_t)x;
+  fr = fr <= 0 ? fr : fr-1.0f;
+  return (xisinff(x) || fabsfk(x) >= (float)(1LL << 23)) ? x : copysignfk(x - fr, x);
+}
+
+EXPORT CONST float xroundf(float d) {
+  float x = d + 0.5f;
+  float fr = x - (int32_t)x;
+  if (fr == 0 && x <= 0) x--;
+  fr = fr < 0 ? fr+1.0f : fr;
+  x = d == 0.4999999701976776123f ? 0 : x;  // nextafterf(0.5, 0)
+  return (xisinff(d) || fabsfk(d) >= (float)(1LL << 23)) ? d : copysignfk(x - fr, d);
+}
+
+EXPORT CONST float xrintf(float d) {
+  float x = d + 0.5f;
+  int32_t isodd = (1 & (int32_t)x) != 0;
+  float fr = x - (int32_t)x;
+  fr = (fr < 0 || (fr == 0 && isodd)) ? fr+1.0f : fr;
+  x = d == 0.50000005960464477539f ? 0 : x;  // nextafterf(0.5, 1)
+  return (xisinff(d) || fabsfk(d) >= (float)(1LL << 23)) ? d : copysignfk(x - fr, d);
+}
+
+EXPORT CONST Sleef_float2 xmodff(float x) {
+  float fr = x - (int32_t)x;
+  fr = fabsfk(x) > (float)(1LL << 23) ? 0 : fr;
+  Sleef_float2 ret = { copysignfk(fr, x), copysignfk(x - fr, x) };
+  return ret;
+}
+
+EXPORT CONST float xldexpf(float x, int exp) {
+  if (exp >  300) exp =  300;
+  if (exp < -300) exp = -300;
+
+  int e0 = exp >> 2;
+  if (exp < 0) e0++;
+  if (-50 < exp && exp < 50) e0 = 0;
+  int e1 = exp - (e0 << 2);
+
+  float p = pow2if(e0);
+  float ret = x * pow2if(e1) * p * p * p * p;
+
+  return ret;
+}
+
+EXPORT CONST float xnextafterf(float x, float y) {
+  union {
+    float f;
+    int32_t i;
+  } cx;
+
+  cx.f = x == 0 ? mulsignf(0, y) : x;
+  int c = (cx.i < 0) == (y < x);
+  if (c) cx.i = -(cx.i ^ (1 << 31));
+
+  if (x != y) cx.i--;
+
+  if (c) cx.i = -(cx.i ^ (1 << 31));
+
+  if (cx.f == 0 && x != 0) cx.f = mulsignf(0, x);
+  if (x == 0 && y == 0) cx.f = y;
+  if (xisnanf(x) || xisnanf(y)) cx.f = NANf;
+
+  return cx.f;
+}
+
+EXPORT CONST float xfrfrexpf(float x) {
+  union {
+    float f;
+    int32_t u;
+  } cx;
+
+  if (xisnanf(x)) return x;
+
+  if (fabsfk(x) < FLT_MIN) x *= (1 << 30);
+
+  cx.f = x;
+  cx.u &= ~0x7f800000U;
+  cx.u |=  0x3f000000U;
+
+  if (xisinff(x)) cx.f = mulsignf(INFINITYf, x);
+  if (x == 0) cx.f = x;
+
+  return cx.f;
+}
+
+EXPORT CONST int xexpfrexpf(float x) {
+  union {
+    float f;
+    uint32_t u;
+  } cx;
+
+  int ret = 0;
+
+  if (fabsfk(x) < FLT_MIN) { x *= (1 << 30); ret = -30; }
+
+  cx.f = x;
+  ret += (int32_t)(((cx.u >> 23) & 0xff)) - 0x7e;
+
+  if (x == 0 || xisnanf(x) || xisinff(x)) ret = 0;
+
+  return ret;
+}
+
+EXPORT CONST float xhypotf_u05(float x, float y) {
+  x = fabsfk(x);
+  y = fabsfk(y);
+  float min = fminfk(x, y), n = min;
+  float max = fmaxfk(x, y), d = max;
+
+  if (max < FLT_MIN) { n *= 1ULL << 24; d *= 1ULL << 24; }
+  Sleef_float2 t = dfdiv_f2_f2_f2(df(n, 0), df(d, 0));
+  t = dfmul_f2_f2_f(dfsqrt_f2_f2(dfadd2_f2_f2_f(dfsqu_f2_f2(t), 1)), max);
+  float ret = t.x + t.y;
+  if (xisnanf(ret)) ret = INFINITYf;
+  if (min == 0) ret = max;
+  if (xisnanf(x) || xisnanf(y)) ret = NANf;
+  if (x == INFINITYf || y == INFINITYf) ret = INFINITYf;
+  return ret;
+}
+
+EXPORT CONST float xhypotf_u35(float x, float y) {
+  x = fabsfk(x);
+  y = fabsfk(y);
+  float min = fminfk(x, y);
+  float max = fmaxfk(x, y);
+
+  float t = min / max;
+  float ret = max * sqrtf(1 + t*t);
+  if (min == 0) ret = max;
+  if (xisnanf(x) || xisnanf(y)) ret = NANf;
+  if (x == INFINITYf || y == INFINITYf) ret = INFINITYf;
+  return ret;
+}
+
+static INLINE CONST float toward0f(float d) {
+  return d == 0 ? 0 : intBitsToFloat(floatToRawIntBits(d)-1);
+}
+
+static INLINE CONST float ptruncf(float x) {
+  return fabsfk(x) >= (float)(1LL << 23) ? x : (x - (x - (int32_t)x));
+}
+
+EXPORT CONST float xfmodf(float x, float y) {
+  float nu = fabsfk(x), de = fabsfk(y), s = 1, q;
+  if (de < FLT_MIN) { nu *= 1ULL << 25; de *= 1ULL << 25; s = 1.0f / (1ULL << 25); }
+  Sleef_float2 r = df(nu, 0);
+  float rde = toward0f(1.0f / de);
+
+  for(int i=0;i<8;i++) { // ceil(log2(FLT_MAX) / 22)+1
+    q = (de+de > r.x && r.x >= de) ? 1.0f : (toward0f(r.x) * rde);
+    r = dfnormalize_f2_f2(dfadd2_f2_f2_f2(r, dfmul_f2_f_f(ptruncf(q), -de)));
+    if (r.x < de) break;
+  }
+
+  float ret = (r.x + r.y) * s;
+  if (r.x + r.y == de) ret = 0;
+  ret = mulsignf(ret, x);
+  if (nu < de) ret = x;
+  if (de == 0) ret = NANf;
+
+  return ret;
+}
+
+EXPORT CONST float xsqrtf_u05(float d) {
+#if __has_builtin(__builtin_sqrtf)
+  return __builtin_sqrtf(d);
+#else
+#warning Using software SQRT
+  float q = 0.5f;
+
+  d = d < 0 ? NANf : d;
+
+  if (d < 5.2939559203393770e-23f) {
+    d *= 1.8889465931478580e+22f;
+    q = 7.2759576141834260e-12f * 0.5f;
+  }
+
+  if (d > 1.8446744073709552e+19f) {
+    d *= 5.4210108624275220e-20f;
+    q = 4294967296.0f * 0.5f;
+  }
+
+  // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+  float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45f) >> 1));
+
+  x = x * (1.5f - 0.5f * d * x * x);
+  x = x * (1.5f - 0.5f * d * x * x);
+  x = x * (1.5f - 0.5f * d * x * x) * d;
+
+  Sleef_float2 d2 = dfmul_f2_f2_f2(dfadd2_f2_f_f2(d, dfmul_f2_f_f(x, x)), dfrec_f2_f(x));
+
+  float ret = (d2.x + d2.y) * q;
+
+  ret = d == INFINITYf ? INFINITYf : ret;
+  ret = d == 0 ? d : ret;
+
+  return ret;
+#endif
+}
+
+EXPORT CONST float xsqrtf_u35(float d) {
+  float q = 1.0f;
+
+  d = d < 0 ? NANf : d;
+
+  if (d < 5.2939559203393770e-23f) {
+    d *= 1.8889465931478580e+22f;
+    q = 7.2759576141834260e-12f;
+  }
+
+  if (d > 1.8446744073709552e+19f) {
+    d *= 5.4210108624275220e-20f;
+    q = 4294967296.0f;
+  }
+
+  // http://en.wikipedia.org/wiki/Fast_inverse_square_root
+  float x = intBitsToFloat(0x5f375a86 - (floatToRawIntBits(d + 1e-45) >> 1));
+
+  x = x * (1.5f - 0.5f * d * x * x);
+  x = x * (1.5f - 0.5f * d * x * x);
+  x = x * (1.5f - 0.5f * d * x * x);
+  x = x * (1.5f - 0.5f * d * x * x);
+
+  return d == INFINITYf ? INFINITYf : (x * d * q);
+}
+
+EXPORT CONST float xfmaf(float x, float y, float z) {
+#if __has_builtin(__builtin_fmaf)
+  return __builtin_fmaf(x, y, z);
+#else
+#warning Using software FMA
+  float h2 = x * y + z, q = 1;
+  if (fabsfk(h2) < 1e-38f) {
+    const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1;
+    x *= c1;
+    y *= c1;
+    z *= c2;
+    q = 1.0f / c2;
+  }
+  if (fabsfk(h2) > 1e+38f) {
+    const float c0 = 1 << 25, c1 = c0 * c0, c2 = c1 * c1;
+    x *= 1.0 / c1;
+    y *= 1.0 / c1;
+    z *= 1.0 / c2;
+    q = c2;
+  }
+  Sleef_float2 d = dfmul_f2_f_f(x, y);
+  d = dfadd2_f2_f2_f(d, z);
+  float ret = (x == 0 || y == 0) ? z : (d.x + d.y);
+  if (xisinff(z) && !xisinff(x) && !xisnanf(x) && !xisinff(y) && !xisnanf(y)) h2 = z;
+  return (xisinff(h2) || xisnanf(h2)) ? h2 : ret*q;
+#endif
+}
+
+//
+
+static INLINE CONST Sleef_float2 sinpifk(float d) {
+  float u, s, t;
+  Sleef_float2 x, s2;
+
+  u = d * 4;
+  int q = ceilfk(u) & ~1;
+  int o = (q & 2) != 0;
+
+  s = u - (float)q;
+  t = s;
+  s = s * s;
+  s2 = dfmul_f2_f_f(t, t);
+
+  //
+
+  u = o ? -0.2430611801e-7f : +0.3093842054e-6f;
+  u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f);
+  u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f);
+  x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) :
+         df(-0.080745510756969451904, -1.3373665339076936258e-09));
+  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) :
+          df(0.78539818525314331055, -2.1857338617566484855e-08));
+
+  x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0));
+  x = o ? dfadd2_f2_f2_f(x, 1) : x;
+
+  //
+
+  if ((q & 4) != 0) { x.x = -x.x; x.y = -x.y; }
+
+  return x;
+}
+
+EXPORT CONST float xsinpif_u05(float d) {
+  Sleef_float2 x = sinpifk(d);
+  float r = x.x + x.y;
+
+  if (xisnegzerof(d)) r = -0.0;
+  if (fabsfk(d) > TRIGRANGEMAX4f) r = 0;
+  if (xisinff(d)) r = NANf;
+
+  return r;
+}
+
+static INLINE CONST Sleef_float2 cospifk(float d) {
+  float u, s, t;
+  Sleef_float2 x, s2;
+
+  u = d * 4;
+  int q = ceilfk(u) & ~1;
+  int o = (q & 2) == 0;
+
+  s = u - (float)q;
+  t = s;
+  s = s * s;
+  s2 = dfmul_f2_f_f(t, t);
+
+  //
+
+  u = o ? -0.2430611801e-7f : +0.3093842054e-6f;
+  u = mlaf(u, s, o ? +0.3590577080e-5f : -0.3657307388e-4f);
+  u = mlaf(u, s, o ? -0.3259917721e-3f : +0.2490393585e-2f);
+  x = dfadd2_f2_f_f2(u * s, o ? df(0.015854343771934509277, 4.4940051354032242811e-10) :
+         df(-0.080745510756969451904, -1.3373665339076936258e-09));
+  x = dfadd2_f2_f2_f2(dfmul_f2_f2_f2(s2, x), o ? df(-0.30842512845993041992, -9.0728339030733922277e-09) :
+          df(0.78539818525314331055, -2.1857338617566484855e-08));
+
+  x = dfmul_f2_f2_f2(x, o ? s2 : df(t, 0));
+  x = o ? dfadd2_f2_f2_f(x, 1) : x;
+
+  //
+
+  if (((q+2) & 4) != 0) { x.x = -x.x; x.y = -x.y; }
+
+  return x;
+}
+
+EXPORT CONST float xcospif_u05(float d) {
+  Sleef_float2 x = cospifk(d);
+  float r = x.x + x.y;
+
+  if (fabsfk(d) > TRIGRANGEMAX4f) r = 1;
+  if (xisinff(d)) r = NANf;
+
+  return r;
+}
+
+typedef struct {
+  Sleef_float2 a, b;
+} df2;
+
+static CONST df2 gammafk(float a) {
+  Sleef_float2 clc = df(0, 0), clln = df(1, 0), clld = df(1, 0), v = df(1, 0), x, y, z;
+  float t, u;
+
+  int otiny = fabsfk(a) < 1e-30f, oref = a < 0.5f;
+
+  x = otiny ? df(0, 0) : (oref ? dfadd2_f2_f_f(1, -a) : df(a, 0));
+
+  int o0 = (0.5f <= x.x && x.x <= 1.2), o2 = 2.3 < x.x;
+
+  y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 1), x));
+  y = dfnormalize_f2_f2(dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, 2), y));
+
+  clln = (o2 && x.x <= 7) ? y : clln;
+
+  x = (o2 && x.x <= 7) ? dfadd2_f2_f2_f(x, 3) : x;
+  t = o2 ? (1.0 / x.x) : dfnormalize_f2_f2(dfadd2_f2_f2_f(x, o0 ? -1 : -2)).x;
+
+  u = o2 ? +0.000839498720672087279971000786 : (o0 ? +0.9435157776e+0f : +0.1102489550e-3f);
+  u = mlaf(u, t, o2 ? -5.17179090826059219329394422e-05 : (o0 ? +0.8670063615e+0f : +0.8160019934e-4f));
+  u = mlaf(u, t, o2 ? -0.000592166437353693882857342347 : (o0 ? +0.4826702476e+0f : +0.1528468856e-3f));
+  u = mlaf(u, t, o2 ? +6.97281375836585777403743539e-05 : (o0 ? -0.8855129778e-1f : -0.2355068718e-3f));
+  u = mlaf(u, t, o2 ? +0.000784039221720066627493314301 : (o0 ? +0.1013825238e+0f : +0.4962242092e-3f));
+  u = mlaf(u, t, o2 ? -0.000229472093621399176949318732 : (o0 ? -0.1493408978e+0f : -0.1193488017e-2f));
+  u = mlaf(u, t, o2 ? -0.002681327160493827160473958490 : (o0 ? +0.1697509140e+0f : +0.2891599433e-2f));
+  u = mlaf(u, t, o2 ? +0.003472222222222222222175164840 : (o0 ? -0.2072454542e+0f : -0.7385451812e-2f));
+  u = mlaf(u, t, o2 ? +0.083333333333333333335592087900 : (o0 ? +0.2705872357e+0f : +0.2058077045e-1f));
+
+  y = dfmul_f2_f2_f2(dfadd2_f2_f2_f(x, -0.5), logk2f(x));
+  y = dfadd2_f2_f2_f2(y, dfneg_f2_f2(x));
+  y = dfadd2_f2_f2_f2(y, dfx(0.91893853320467278056)); // 0.5*log(2*M_PI)
+
+  z = dfadd2_f2_f2_f(dfmul_f2_f_f (u, t), o0 ? -0.400686534596170958447352690395e+0f : -0.673523028297382446749257758235e-1f);
+  z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? +0.822466960142643054450325495997e+0f : +0.322467033928981157743538726901e+0f);
+  z = dfadd2_f2_f2_f(dfmul_f2_f2_f(z, t), o0 ? -0.577215665946766039837398973297e+0f : +0.422784335087484338986941629852e+0f);
+  z = dfmul_f2_f2_f(z, t);
+
+  clc = o2 ? y : z;
+
+  clld = o2 ? dfadd2_f2_f2_f(dfmul_f2_f_f(u, t), 1) : clld;
+
+  y = clln;
+
+  clc = otiny ? dfx(41.58883083359671856503) : // log(2^60)
+    (oref ? dfadd2_f2_f2_f2(dfx(1.1447298858494001639), dfneg_f2_f2(clc)) : clc); // log(M_PI)
+  clln = otiny ? df(1, 0) : (oref ? clln : clld);
+
+  if (oref) x = dfmul_f2_f2_f2(clld, sinpifk(a - (float)(1LL << 12) * (int32_t)(a * (1.0 / (1LL << 12)))));
+
+  clld = otiny ? df(a*((1LL << 30)*(float)(1LL << 30)), 0) : (oref ? x : y);
+
+  df2 ret = { clc, dfdiv_f2_f2_f2(clln, clld) };
+
+  return ret;
+}
+
+EXPORT CONST float xtgammaf_u1(float a) {
+  df2 d = gammafk(a);
+  Sleef_float2 y = dfmul_f2_f2_f2(expk2f(d.a), d.b);
+  float r = y.x + y.y;
+  r = (a == -INFINITYf || (a < 0 && xisintf(a)) || (xisnumberf(a) && a < 0 && xisnanf(r))) ? NANf : r;
+  r = ((a == INFINITYf || xisnumberf(a)) && a >= -FLT_MIN && (a == 0 || a > 36 || xisnanf(r))) ? mulsignf(INFINITYf, a) : r;
+  return r;
+}
+
+EXPORT CONST float xlgammaf_u1(float a) {
+  df2 d = gammafk(a);
+  Sleef_float2 y = dfadd2_f2_f2_f2(d.a, logk2f(dfabs_f2_f2(d.b)));
+  float r = y.x + y.y;
+  r = (xisinff(a) || (a <= 0 && xisintf(a)) || (xisnumberf(a) && xisnanf(r))) ? INFINITYf : r;
+  return r;
+}
+
+EXPORT CONST Sleef_float2 xlgamma_rf_u1(float a) {
+  df2 d = gammafk(a);
+  Sleef_float2 y = dfadd2_f2_f2_f2(d.a, logk2f(dfabs_f2_f2(d.b)));
+  float r = y.x + y.y;
+  r = (xisinff(a) || (a <= 0 && xisintf(a)) || (xisnumberf(a) && xisnanf(r))) ? INFINITYf : r;
+  Sleef_float2 ret;
+  ret.x = r;
+  ret.y = intBitsToFloat((floatToRawIntBits(d.b.x) & (1 << 31)) | (0x3f800000));
+  return ret;
+}
+
+EXPORT CONST float xerff_u1(float a) {
+  float s = a, t, u;
+  Sleef_float2 d;
+
+  a = fabsfk(a);
+  int o0 = a < 1.1f, o1 = a < 2.4f, o2 = a < 4.0f;
+  u = o0 ? (a*a) : a;
+
+  t = o0 ? +0.7089292194e-4f : o1 ? -0.1792667899e-4f : -0.9495757695e-5f;
+  t = mlaf(t, u, o0 ? -0.7768311189e-3f : o1 ? +0.3937633010e-3f : +0.2481465926e-3f);
+  t = mlaf(t, u, o0 ? +0.5159463733e-2f : o1 ? -0.3949181177e-2f : -0.2918176819e-2f);
+  t = mlaf(t, u, o0 ? -0.2683781274e-1f : o1 ? +0.2445474640e-1f : +0.2059706673e-1f);
+  t = mlaf(t, u, o0 ? +0.1128318012e+0f : o1 ? -0.1070996150e+0f : -0.9901899844e-1f);
+  d = dfmul_f2_f_f(t, u);
+  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.376125876000657465175213237214e+0) :
+          o1 ? dfx(-0.634588905908410389971210809210e+0) :
+          dfx(-0.643598050547891613081201721633e+0));
+  d = dfmul_f2_f2_f(d, u);
+  d = dfadd2_f2_f2_f2(d, o0 ? dfx(+0.112837916021059138255978217023e+1) :
+          o1 ? dfx(-0.112879855826694507209862753992e+1) :
+          dfx(-0.112461487742845562801052956293e+1));
+  d = dfmul_f2_f2_f(d, a);
+  d = o0 ? d : dfadd_f2_f_f2(1.0, dfneg_f2_f2(expk2f(d)));
+  u = mulsignf(o2 ? (d.x + d.y) : 1, s);
+  u = xisnanf(a) ? NANf : u;
+  return u;
+}
+
+EXPORT CONST float xerfcf_u15(float a) {
+  float s = a, r = 0, t;
+  Sleef_float2 u, d, x;
+  a = fabsfk(a);
+  int o0 = a < 1.0f, o1 = a < 2.2f, o2 = a < 4.3f, o3 = a < 10.1f;
+  u = o1 ? df(a, 0) : dfdiv_f2_f2_f2(df(1, 0), df(a, 0));
+
+  t = o0 ? -0.8638041618e-4f : o1 ? -0.6236977242e-5f : o2 ? -0.3869504035e+0f : +0.1115344167e+1f;
+  t = mlaf(t, u.x, o0 ? +0.6000166177e-3f : o1 ? +0.5749821503e-4f : o2 ? +0.1288077235e+1f : -0.9454904199e+0f);
+  t = mlaf(t, u.x, o0 ? -0.1665703603e-2f : o1 ? +0.6002851478e-5f : o2 ? -0.1816803217e+1f : -0.3667259514e+0f);
+  t = mlaf(t, u.x, o0 ? +0.1795156277e-3f : o1 ? -0.2851036377e-2f : o2 ? +0.1249150872e+1f : +0.7155663371e+0f);
+  t = mlaf(t, u.x, o0 ? +0.1914106123e-1f : o1 ? +0.2260518074e-1f : o2 ? -0.1328857988e+0f : -0.1262947265e-1f);
+
+  d = dfmul_f2_f2_f(u, t);
+  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.102775359343930288081655368891e+0) :
+          o1 ? dfx(-0.105247583459338632253369014063e+0) :
+          o2 ? dfx(-0.482365310333045318680618892669e+0) :
+          dfx(-0.498961546254537647970305302739e+0));
+  d = dfmul_f2_f2_f2(d, u);
+  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.636619483208481931303752546439e+0) :
+          o1 ? dfx(-0.635609463574589034216723775292e+0) :
+          o2 ? dfx(-0.134450203224533979217859332703e-2) :
+          dfx(-0.471199543422848492080722832666e-4));
+  d = dfmul_f2_f2_f2(d, u);
+  d = dfadd2_f2_f2_f2(d, o0 ? dfx(-0.112837917790537404939545770596e+1) :
+          o1 ? dfx(-0.112855987376668622084547028949e+1) :
+          o2 ? dfx(-0.572319781150472949561786101080e+0) :
+          dfx(-0.572364030327966044425932623525e+0));
+
+  x = dfmul_f2_f2_f(o1 ? d : df(-a, 0), a);
+  x = o1 ? x : dfadd2_f2_f2_f2(x, d);
+
+  x = expk2f(x);
+  x = o1 ? x : dfmul_f2_f2_f2(x, u);
+
+  r = o3 ? (x.x + x.y) : 0;
+  if (s < 0) r = 2 - r;
+  r = xisnanf(s) ? NANf : r;
+  return r;
+}
+
+//
+
+#ifdef ENABLE_MAIN
+// gcc -w -DENABLE_MAIN -I../common sleefsp.c -lm
+#include <stdlib.h>
+int main(int argc, char **argv) {
+  float d1 = atof(argv[1]);
+  //float d2 = atof(argv[2]);
+  //float d3 = atof(argv[3]);
+  //printf("%.20g, %.20g\n", (double)d1, (double)d2);
+  //float i2 = atoi(argv[2]);
+  //float c = xatan2f_u1(d1, d2);
+  //printf("round %.20g\n", (double)d1);
+  printf("test    = %.20g\n", (double)xsqrtf_u05(d1));
+  //printf("correct = %.20g\n", (double)roundf(d1));
+  //printf("rint %.20g\n", (double)d1);
+  //printf("test    = %.20g\n", (double)xrintf(d1));
+  //printf("correct = %.20g\n", (double)rintf(d1));
+  //Sleef_float2 r = xsincospif_u35(d);
+  //printf("%g, %g\n", (double)r.x, (double)r.y);
+}
+#endif
diff --git a/lib/kernel/sleef/test.c b/lib/kernel/sleef/test.c
new file mode 100644
index 0000000..8e8bc0f
--- /dev/null
+++ b/lib/kernel/sleef/test.c
@@ -0,0 +1,58 @@
+
+/************************/
+#if defined(PURE_C)
+
+  #define CONFIG 1
+
+#elif defined(VEC128)
+
+  #ifdef __ARM_NEON
+    #define CONFIG 1
+
+  #elif defined(__AVX2__)
+    #define CONFIG 1
+
+  #elif defined(__SSE4_1__)
+    #define CONFIG 4
+
+  #elif defined(__SSE3__)
+    #define CONFIG 3
+
+  #elif defined(__SSE2__)
+    #define CONFIG 2
+
+  #else
+    #error 128bit vectors unavailable
+  #endif
+
+#elif defined(VEC256)
+
+  #if defined(__AVX2__)
+    #define CONFIG 1
+
+  #elif defined(__FMA4__)
+    #define CONFIG 4
+
+  #elif defined(__AVX__)
+    #define CONFIG 1
+
+  #else
+    #error 256bit vectors unavailable
+  #endif
+
+#elif defined(VEC512)
+
+  #ifdef __AVX512F__
+    #define CONFIG 1
+  #else
+    #error 512bit vectors unavailable
+  #endif
+
+#else
+#error Please specify valid vector size with -DVECxxx
+#endif
+
+
+int main() {
+  return 0;
+}
diff --git a/lib/kernel/tce/CMakeLists.txt b/lib/kernel/tce/CMakeLists.txt
index 91d2c01..7005888 100644
--- a/lib/kernel/tce/CMakeLists.txt
+++ b/lib/kernel/tce/CMakeLists.txt
@@ -49,15 +49,17 @@ endif()
 separate_arguments(TCE_TARGET_LLC_FLAGS)
 set(LLC_FLAGS ${TCE_TARGET_LLC_FLAGS})
 
+set(KERNEL_CL_FLAGS "-D__OPENCL_C_VERSION__=${TCE_DEVICE_CL_VERSION}" "-Xclang" "-cl-std=CL${TCE_DEVICE_CL_STD}")
+
 if(NOT LLVM_OLDER_THAN_4_0)
-set(KERNEL_CL_FLAGS "-Xclang" "-cl-ext=all,-cl_khr_fp64")
+list(APPEND KERNEL_CL_FLAGS "-Xclang" "-cl-ext=all,-cl_khr_fp64")
 endif()
 
 # TODO LLC_flags is used by kernel.bc target, but ld is unused
 #LD_FLAGS    = @TARGET_LD_FLAGS@
 
 #KERNEL_TARGET = tce (WRONG)
-make_kernel_bc(KERNEL_BC "tce-tut-llvm" "tta" ${SOURCES_WITHOUT_VML})
+make_kernel_bc(KERNEL_BC "tce-tut-llvm" "tta" 0 0 0 ${SOURCES_WITHOUT_VML})
 
 # just debug
 message(STATUS "TCE Kernel BC: ${KERNEL_BC}")
diff --git a/lib/kernel/templates.h b/lib/kernel/templates.h
index 83b5c2a..81cab51 100644
--- a/lib/kernel/templates.h
+++ b/lib/kernel/templates.h
@@ -91,7 +91,7 @@
   }
 #define DEFINE_BUILTIN_V_VV(NAME)                       \
   __IF_FP16(                                            \
-  half __attribute__ ((overloadable))                   \
+  half _CL_OVERLOADABLE _CL_READNONE                   \
   NAME(half a, half b)                                  \
   {                                                     \
     /* use float builtin */                             \
@@ -102,7 +102,7 @@
   IMPLEMENT_BUILTIN_V_VV(NAME, half4   , lo, hi)        \
   IMPLEMENT_BUILTIN_V_VV(NAME, half8   , lo, hi)        \
   IMPLEMENT_BUILTIN_V_VV(NAME, half16  , lo, hi))       \
-  float __attribute__ ((overloadable))                  \
+  float _CL_OVERLOADABLE _CL_READNONE                  \
   NAME(float a, float b)                                \
   {                                                     \
     return __builtin_##NAME##f(a, b);                   \
@@ -113,7 +113,7 @@
   IMPLEMENT_BUILTIN_V_VV(NAME, float8  , lo, hi)        \
   IMPLEMENT_BUILTIN_V_VV(NAME, float16 , lo, hi)        \
   __IF_FP64(                                            \
-  double __attribute__ ((overloadable))                 \
+  double _CL_OVERLOADABLE _CL_READNONE                 \
   NAME(double a, double b)                              \
   {                                                     \
     return __builtin_##NAME(a, b);                      \
@@ -554,7 +554,7 @@
   IMPLEMENT_EXPR_V_VV(NAME, EXPR, double16, double, long16 , long ))
 
 #define IMPLEMENT_EXPR_V_VVV(NAME, EXPR, VTYPE, STYPE, JTYPE, SJTYPE)   \
-  VTYPE __attribute__ ((overloadable))                                  \
+  VTYPE _CL_OVERLOADABLE _CL_READNONE                                  \
   NAME(VTYPE a, VTYPE b, VTYPE c)                                       \
   {                                                                     \
     typedef VTYPE vtype;                                                \
@@ -934,78 +934,120 @@
   IMPLEMENT_EXPR_V_VJ(NAME, EXPR, double8 , double, int8 , int)         \
   IMPLEMENT_EXPR_V_VJ(NAME, EXPR, double16, double, int16, int))
 
-#define IMPLEMENT_EXPR_V_VI(NAME, EXPR, VTYPE, STYPE, ITYPE)    \
-  VTYPE __attribute__ ((overloadable))                          \
-  NAME(VTYPE a, ITYPE b)                                        \
-  {                                                             \
-    typedef VTYPE vtype;                                        \
-    typedef STYPE stype;                                        \
-    typedef ITYPE itype;                                        \
-    return EXPR;                                                \
+#define IMPLEMENT_EXPR_V_VI(NAME, EXPR, VTYPE, STYPE, ITYPE, JTYPE) \
+  VTYPE __attribute__ ((overloadable))                              \
+  NAME(VTYPE a, ITYPE b)                                            \
+  {                                                                 \
+    typedef VTYPE vtype;                                            \
+    typedef STYPE stype;                                            \
+    typedef ITYPE itype;                                            \
+    typedef JTYPE jtype;                                            \
+    return EXPR;                                                    \
   }
 // All V_VS cases are excluded
 #define DEFINE_EXPR_V_VI(NAME, EXPR)                            \
   __IF_FP16(                                                    \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half2   , half  , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half3   , half  , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half4   , half  , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half8   , half  , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half16  , half  , int))       \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float2  , float , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float3  , float , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float4  , float , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float8  , float , int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float16 , float , int)        \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half2   , half  , int, int2 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half3   , half  , int, int3 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half4   , half  , int, int4 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half8   , half  , int, int8 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, half16  , half  , int, int16))\
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float2  , float , int, int2 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float3  , float , int, int3 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float4  , float , int, int4 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float8  , float , int, int8 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, float16 , float , int, int16) \
+  __IF_FP64(                                             \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double2 , double, int, int2 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double3 , double, int, int3 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double4 , double, int, int4 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double8 , double, int, int8 ) \
+  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double16, double, int, int16))
+
+#define IMPLEMENT_EXPR_V_VPV(NAME, EXPR, VTYPE, STYPE, ITYPE)                 \
+  VTYPE __attribute__ ((overloadable)) NAME (VTYPE a, __global VTYPE *b)      \
+  {                                                                           \
+    typedef VTYPE vtype;                                                      \
+    typedef STYPE stype;                                                      \
+    typedef ITYPE itype;                                                      \
+    return EXPR;                                                              \
+  }                                                                           \
+  VTYPE __attribute__ ((overloadable)) NAME (VTYPE a, __local VTYPE *b)       \
+  {                                                                           \
+    typedef VTYPE vtype;                                                      \
+    typedef STYPE stype;                                                      \
+    typedef ITYPE itype;                                                      \
+    return EXPR;                                                              \
+  }                                                                           \
+  VTYPE __attribute__ ((overloadable)) NAME (VTYPE a, __private VTYPE *b)     \
+  {                                                                           \
+    typedef VTYPE vtype;                                                      \
+    typedef STYPE stype;                                                      \
+    typedef ITYPE itype;                                                      \
+    return EXPR;                                                              \
+  }
+#define DEFINE_EXPR_V_VPV(NAME, EXPR)                           \
+  __IF_FP16(                                                    \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half    , half  , short)     \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half2   , half  , short2)    \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half3   , half  , short3)    \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half4   , half  , short4)    \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half8   , half  , short8)    \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half16  , half  , short16))  \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float   , float , int)       \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float2  , float , int2)      \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float3  , float , int3)      \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float4  , float , int4)      \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float8  , float , int8)      \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float16 , float , int16)     \
   __IF_FP64(                                                    \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double2 , double, int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double3 , double, int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double4 , double, int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double8 , double, int)        \
-  IMPLEMENT_EXPR_V_VI(NAME, EXPR, double16, double, int))
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double  , double, long)      \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double2 , double, long2)     \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double3 , double, long3)     \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double4 , double, long4)     \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double8 , double, long8)     \
+  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double16, double, long16))
 
-#define IMPLEMENT_EXPR_V_VPV(NAME, EXPR, VTYPE, STYPE)  \
+#define IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, VTYPE, STYPE, ITYPE)  \
   VTYPE __attribute__ ((overloadable))                  \
-  NAME(VTYPE a, __global VTYPE *b)                      \
+  NAME(VTYPE a, __global ITYPE *b)                      \
   {                                                     \
     typedef VTYPE vtype;                                \
     typedef STYPE stype;                                \
+    typedef ITYPE itype;                                \
     return EXPR;                                        \
   }                                                     \
   VTYPE __attribute__ ((overloadable))                  \
-  NAME(VTYPE a, __local VTYPE *b)                       \
+  NAME(VTYPE a, __local ITYPE *b)                       \
   {                                                     \
     typedef VTYPE vtype;                                \
     typedef STYPE stype;                                \
+    typedef ITYPE itype;                                \
     return EXPR;                                        \
   }                                                     \
   VTYPE __attribute__ ((overloadable))                  \
-  NAME(VTYPE a, __private VTYPE *b)                     \
+  NAME(VTYPE a, __private ITYPE *b)                     \
   {                                                     \
     typedef VTYPE vtype;                                \
     typedef STYPE stype;                                \
+    typedef ITYPE itype;                                \
     return EXPR;                                        \
   }
-#define DEFINE_EXPR_V_VPV(NAME, EXPR)                   \
-  __IF_FP16(                                            \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half    , half  )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half2   , half  )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half3   , half  )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half4   , half  )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half8   , half  )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, half16  , half  ))   \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float   , float )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float2  , float )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float3  , float )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float4  , float )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float8  , float )    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, float16 , float )    \
+#define DEFINE_EXPR_V_VIPV(NAME, EXPR)                   \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, float   , float , int)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, float2  , float , int2)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, float3  , float , int3)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, float4  , float , int4)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, float8  , float , int8)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, float16 , float , int16)    \
   __IF_FP64(                                            \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double  , double)    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double2 , double)    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double3 , double)    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double4 , double)    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double8 , double)    \
-  IMPLEMENT_EXPR_V_VPV(NAME, EXPR, double16, double))
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, double  , double, int)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, double2 , double, int2)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, double3 , double, int3)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, double4 , double, int4)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, double8 , double, int8)    \
+  IMPLEMENT_EXPR_V_VIPV(NAME, EXPR, double16, double, int16))
+
 
 #define IMPLEMENT_EXPR_V_SV(NAME, EXPR, VTYPE, STYPE, JTYPE, SJTYPE)    \
   VTYPE __attribute__ ((overloadable))                                  \
@@ -1046,12 +1088,27 @@
     return EXPR;                                        \
   }
 #define DEFINE_EXPR_F_F(NAME, EXPR)                     \
+  __IF_FP16(                                            \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, half   , half )        \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, half2  , half )        \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, half3  , half )        \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, half4  , half )        \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, half8  , half )        \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, half16 , half ))       \
   IMPLEMENT_EXPR_F_F(NAME, EXPR, float   , float )      \
   IMPLEMENT_EXPR_F_F(NAME, EXPR, float2  , float )      \
   IMPLEMENT_EXPR_F_F(NAME, EXPR, float3  , float )      \
   IMPLEMENT_EXPR_F_F(NAME, EXPR, float4  , float )      \
   IMPLEMENT_EXPR_F_F(NAME, EXPR, float8  , float )      \
-  IMPLEMENT_EXPR_F_F(NAME, EXPR, float16 , float )
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, float16 , float )      \
+  __IF_FP64(                                            \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, double   , double )    \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, double2  , double )    \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, double3  , double )    \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, double4  , double )    \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, double8  , double )    \
+  IMPLEMENT_EXPR_F_F(NAME, EXPR, double16 , double ))
+
 
 #define IMPLEMENT_EXPR_F_FF(NAME, EXPR, VTYPE, STYPE, JTYPE)    \
   VTYPE __attribute__ ((overloadable))                          \
@@ -1062,13 +1119,28 @@
     typedef JTYPE jtype;                                        \
     return EXPR;                                                \
   }
+
 #define DEFINE_EXPR_F_FF(NAME, EXPR)                            \
+  __IF_FP16(                                                    \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, half   , half , short)        \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, half2  , half , short2)       \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, half3  , half , short3)       \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, half4  , half , short4)       \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, half8  , half , short8)       \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, half16 , half , short16))     \
   IMPLEMENT_EXPR_F_FF(NAME, EXPR, float   , float , int   )     \
   IMPLEMENT_EXPR_F_FF(NAME, EXPR, float2  , float , int2  )     \
   IMPLEMENT_EXPR_F_FF(NAME, EXPR, float3  , float , int3  )     \
   IMPLEMENT_EXPR_F_FF(NAME, EXPR, float4  , float , int4  )     \
   IMPLEMENT_EXPR_F_FF(NAME, EXPR, float8  , float , int8  )     \
-  IMPLEMENT_EXPR_F_FF(NAME, EXPR, float16 , float , int16 )
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, float16 , float , int16 )     \
+  __IF_FP64(                                                    \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, double   , double , long)     \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, double2  , double , long2)    \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, double3  , double , long3)    \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, double4  , double , long4)    \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, double8  , double , long8)    \
+  IMPLEMENT_EXPR_F_FF(NAME, EXPR, double16 , double , long16))
 
 
 
@@ -1436,7 +1508,7 @@
   IMPLEMENT_EXPR_G_GG(NAME, EXPR, ulong16 , ulong , ulong16 , ulong ))
 
 #define IMPLEMENT_EXPR_G_GGG(NAME, EXPR, GTYPE, SGTYPE, UGTYPE, SUGTYPE) \
-  GTYPE __attribute__ ((overloadable))                                  \
+  GTYPE _CL_OVERLOADABLE _CL_READNONE                                  \
   NAME(GTYPE a, GTYPE b, GTYPE c)                                       \
   {                                                                     \
     typedef GTYPE gtype;                                                \
@@ -1550,7 +1622,7 @@
   IMPLEMENT_EXPR_G_GS(NAME, EXPR, ulong16 , ulong , ulong16 , ulong ))
 
 #define IMPLEMENT_EXPR_G_GSS(NAME, EXPR, GTYPE, SGTYPE, UGTYPE, SUGTYPE) \
-  GTYPE __attribute__ ((overloadable))                                  \
+  GTYPE _CL_OVERLOADABLE _CL_READNONE                                  \
   NAME(GTYPE a, SGTYPE b, SGTYPE c)                                     \
   {                                                                     \
     typedef GTYPE gtype;                                                \
@@ -1763,3 +1835,37 @@
     if (get_local_id(0) == 0 &&                 \
         get_local_id(1) == 0 &&                 \
         get_local_id(2) == 0)
+
+#ifndef _CL_DECLARE_FUNC_V_V
+#define _CL_DECLARE_FUNC_V_V(NAME)              \
+  float    _CL_OVERLOADABLE NAME(float   );     \
+  float2   _CL_OVERLOADABLE NAME(float2  );     \
+  float3   _CL_OVERLOADABLE NAME(float3  );     \
+  float4   _CL_OVERLOADABLE NAME(float4  );     \
+  float8   _CL_OVERLOADABLE NAME(float8  );     \
+  float16  _CL_OVERLOADABLE NAME(float16 );     \
+  __IF_FP64(                                    \
+  double   _CL_OVERLOADABLE NAME(double  );     \
+  double2  _CL_OVERLOADABLE NAME(double2 );     \
+  double3  _CL_OVERLOADABLE NAME(double3 );     \
+  double4  _CL_OVERLOADABLE NAME(double4 );     \
+  double8  _CL_OVERLOADABLE NAME(double8 );     \
+  double16 _CL_OVERLOADABLE NAME(double16);)
+#endif
+
+#ifndef _CL_DECLARE_FUNC_K_V
+#define _CL_DECLARE_FUNC_K_V(NAME)              \
+  int   _CL_OVERLOADABLE NAME(float   );        \
+  int2  _CL_OVERLOADABLE NAME(float2  );        \
+  int3  _CL_OVERLOADABLE NAME(float3  );        \
+  int4  _CL_OVERLOADABLE NAME(float4  );        \
+  int8  _CL_OVERLOADABLE NAME(float8  );        \
+  int16 _CL_OVERLOADABLE NAME(float16 );        \
+  __IF_FP64(                                    \
+  long   _CL_OVERLOADABLE NAME(double  );       \
+  long2  _CL_OVERLOADABLE NAME(double2 );       \
+  long3  _CL_OVERLOADABLE NAME(double3 );       \
+  long4  _CL_OVERLOADABLE NAME(double4 );       \
+  long8  _CL_OVERLOADABLE NAME(double8 );       \
+  long16 _CL_OVERLOADABLE NAME(double16);)
+#endif
diff --git a/lib/kernel/vecmathlib-pocl/generate-files.py b/lib/kernel/vecmathlib-pocl/generate-files.py
index e7e5c5f..7ddbe55 100755
--- a/lib/kernel/vecmathlib-pocl/generate-files.py
+++ b/lib/kernel/vecmathlib-pocl/generate-files.py
@@ -748,9 +748,6 @@ def output_directfunc(func):
     else:
         spaces = [""]
     for basetype in ["half", "float", "double"]:
-        if ((name.startswith("half_") or name.startswith("native_")) and
-            basetype!="float"):
-            continue
         if basetype=="half":
             decl("#ifdef cl_khr_fp16")
             out("")
diff --git a/lib/kernel/vecmathlib-pocl/half_cos.cl b/lib/kernel/vecmathlib-pocl/half_cos.cl
index 23d7501..91db59f 100644
--- a/lib/kernel/vecmathlib-pocl/half_cos.cl
+++ b/lib/kernel/vecmathlib-pocl/half_cos.cl
@@ -27,6 +27,190 @@
 
 // half_cos: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_cos: VF=half
+// Implement half_cos directly
+__attribute__((__overloadable__))
+half _cl_half_cos(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=half2
+// Implement half_cos directly
+__attribute__((__overloadable__))
+half2 _cl_half_cos(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=half3
+// Implement half_cos directly
+__attribute__((__overloadable__))
+half3 _cl_half_cos(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=half4
+// Implement half_cos directly
+__attribute__((__overloadable__))
+half4 _cl_half_cos(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=half8
+// Implement half_cos directly
+__attribute__((__overloadable__))
+half8 _cl_half_cos(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=half16
+// Implement half_cos directly
+__attribute__((__overloadable__))
+half16 _cl_half_cos(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_cos: VF=float
 // Implement half_cos directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_cos(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_cos: VF=double
+// Implement half_cos directly
+__attribute__((__overloadable__))
+double _cl_half_cos(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=double2
+// Implement half_cos directly
+__attribute__((__overloadable__))
+double2 _cl_half_cos(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=double3
+// Implement half_cos directly
+__attribute__((__overloadable__))
+double3 _cl_half_cos(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=double4
+// Implement half_cos directly
+__attribute__((__overloadable__))
+double4 _cl_half_cos(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=double8
+// Implement half_cos directly
+__attribute__((__overloadable__))
+double8 _cl_half_cos(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_cos: VF=double16
+// Implement half_cos directly
+__attribute__((__overloadable__))
+double16 _cl_half_cos(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_divide.cl b/lib/kernel/vecmathlib-pocl/half_divide.cl
index d547d12..50aeae0 100644
--- a/lib/kernel/vecmathlib-pocl/half_divide.cl
+++ b/lib/kernel/vecmathlib-pocl/half_divide.cl
@@ -27,6 +27,190 @@
 
 // half_divide: ['VF', 'VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_divide: VF=half
+// Implement half_divide directly
+__attribute__((__overloadable__))
+half _cl_half_divide(half x0, half x1)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=half2
+// Implement half_divide directly
+__attribute__((__overloadable__))
+half2 _cl_half_divide(half2 x0, half2 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=half3
+// Implement half_divide directly
+__attribute__((__overloadable__))
+half3 _cl_half_divide(half3 x0, half3 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=half4
+// Implement half_divide directly
+__attribute__((__overloadable__))
+half4 _cl_half_divide(half4 x0, half4 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=half8
+// Implement half_divide directly
+__attribute__((__overloadable__))
+half8 _cl_half_divide(half8 x0, half8 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=half16
+// Implement half_divide directly
+__attribute__((__overloadable__))
+half16 _cl_half_divide(half16 x0, half16 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_divide: VF=float
 // Implement half_divide directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_divide(float16 x0, float16 x1)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_divide: VF=double
+// Implement half_divide directly
+__attribute__((__overloadable__))
+double _cl_half_divide(double x0, double x1)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=double2
+// Implement half_divide directly
+__attribute__((__overloadable__))
+double2 _cl_half_divide(double2 x0, double2 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=double3
+// Implement half_divide directly
+__attribute__((__overloadable__))
+double3 _cl_half_divide(double3 x0, double3 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=double4
+// Implement half_divide directly
+__attribute__((__overloadable__))
+double4 _cl_half_divide(double4 x0, double4 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=double8
+// Implement half_divide directly
+__attribute__((__overloadable__))
+double8 _cl_half_divide(double8 x0, double8 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_divide: VF=double16
+// Implement half_divide directly
+__attribute__((__overloadable__))
+double16 _cl_half_divide(double16 x0, double16 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_exp.cl b/lib/kernel/vecmathlib-pocl/half_exp.cl
index 3e7078a..83e2940 100644
--- a/lib/kernel/vecmathlib-pocl/half_exp.cl
+++ b/lib/kernel/vecmathlib-pocl/half_exp.cl
@@ -27,6 +27,190 @@
 
 // half_exp: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_exp: VF=half
+// Implement half_exp directly
+__attribute__((__overloadable__))
+half _cl_half_exp(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=half2
+// Implement half_exp directly
+__attribute__((__overloadable__))
+half2 _cl_half_exp(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=half3
+// Implement half_exp directly
+__attribute__((__overloadable__))
+half3 _cl_half_exp(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=half4
+// Implement half_exp directly
+__attribute__((__overloadable__))
+half4 _cl_half_exp(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=half8
+// Implement half_exp directly
+__attribute__((__overloadable__))
+half8 _cl_half_exp(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=half16
+// Implement half_exp directly
+__attribute__((__overloadable__))
+half16 _cl_half_exp(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_exp: VF=float
 // Implement half_exp directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_exp(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_exp: VF=double
+// Implement half_exp directly
+__attribute__((__overloadable__))
+double _cl_half_exp(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=double2
+// Implement half_exp directly
+__attribute__((__overloadable__))
+double2 _cl_half_exp(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=double3
+// Implement half_exp directly
+__attribute__((__overloadable__))
+double3 _cl_half_exp(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=double4
+// Implement half_exp directly
+__attribute__((__overloadable__))
+double4 _cl_half_exp(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=double8
+// Implement half_exp directly
+__attribute__((__overloadable__))
+double8 _cl_half_exp(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp: VF=double16
+// Implement half_exp directly
+__attribute__((__overloadable__))
+double16 _cl_half_exp(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_exp10.cl b/lib/kernel/vecmathlib-pocl/half_exp10.cl
index 050f694..c12be63 100644
--- a/lib/kernel/vecmathlib-pocl/half_exp10.cl
+++ b/lib/kernel/vecmathlib-pocl/half_exp10.cl
@@ -27,6 +27,190 @@
 
 // half_exp10: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_exp10: VF=half
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+half _cl_half_exp10(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=half2
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+half2 _cl_half_exp10(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=half3
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+half3 _cl_half_exp10(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=half4
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+half4 _cl_half_exp10(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=half8
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+half8 _cl_half_exp10(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=half16
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+half16 _cl_half_exp10(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_exp10: VF=float
 // Implement half_exp10 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_exp10(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_exp10: VF=double
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+double _cl_half_exp10(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=double2
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+double2 _cl_half_exp10(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=double3
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+double3 _cl_half_exp10(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=double4
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+double4 _cl_half_exp10(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=double8
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+double8 _cl_half_exp10(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp10: VF=double16
+// Implement half_exp10 directly
+__attribute__((__overloadable__))
+double16 _cl_half_exp10(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_exp2.cl b/lib/kernel/vecmathlib-pocl/half_exp2.cl
index af5cde6..e5c0e82 100644
--- a/lib/kernel/vecmathlib-pocl/half_exp2.cl
+++ b/lib/kernel/vecmathlib-pocl/half_exp2.cl
@@ -27,6 +27,190 @@
 
 // half_exp2: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_exp2: VF=half
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+half _cl_half_exp2(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=half2
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+half2 _cl_half_exp2(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=half3
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+half3 _cl_half_exp2(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=half4
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+half4 _cl_half_exp2(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=half8
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+half8 _cl_half_exp2(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=half16
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+half16 _cl_half_exp2(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_exp2: VF=float
 // Implement half_exp2 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_exp2(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_exp2: VF=double
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+double _cl_half_exp2(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=double2
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+double2 _cl_half_exp2(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=double3
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+double3 _cl_half_exp2(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=double4
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+double4 _cl_half_exp2(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=double8
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+double8 _cl_half_exp2(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_exp2: VF=double16
+// Implement half_exp2 directly
+__attribute__((__overloadable__))
+double16 _cl_half_exp2(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_log.cl b/lib/kernel/vecmathlib-pocl/half_log.cl
index 3a56eb0..95eabbf 100644
--- a/lib/kernel/vecmathlib-pocl/half_log.cl
+++ b/lib/kernel/vecmathlib-pocl/half_log.cl
@@ -27,6 +27,190 @@
 
 // half_log: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_log: VF=half
+// Implement half_log directly
+__attribute__((__overloadable__))
+half _cl_half_log(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=half2
+// Implement half_log directly
+__attribute__((__overloadable__))
+half2 _cl_half_log(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=half3
+// Implement half_log directly
+__attribute__((__overloadable__))
+half3 _cl_half_log(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=half4
+// Implement half_log directly
+__attribute__((__overloadable__))
+half4 _cl_half_log(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=half8
+// Implement half_log directly
+__attribute__((__overloadable__))
+half8 _cl_half_log(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=half16
+// Implement half_log directly
+__attribute__((__overloadable__))
+half16 _cl_half_log(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_log: VF=float
 // Implement half_log directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_log(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_log: VF=double
+// Implement half_log directly
+__attribute__((__overloadable__))
+double _cl_half_log(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=double2
+// Implement half_log directly
+__attribute__((__overloadable__))
+double2 _cl_half_log(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=double3
+// Implement half_log directly
+__attribute__((__overloadable__))
+double3 _cl_half_log(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=double4
+// Implement half_log directly
+__attribute__((__overloadable__))
+double4 _cl_half_log(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=double8
+// Implement half_log directly
+__attribute__((__overloadable__))
+double8 _cl_half_log(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log: VF=double16
+// Implement half_log directly
+__attribute__((__overloadable__))
+double16 _cl_half_log(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_log10.cl b/lib/kernel/vecmathlib-pocl/half_log10.cl
index 08ade4a..1f8a6a6 100644
--- a/lib/kernel/vecmathlib-pocl/half_log10.cl
+++ b/lib/kernel/vecmathlib-pocl/half_log10.cl
@@ -27,6 +27,190 @@
 
 // half_log10: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_log10: VF=half
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+half _cl_half_log10(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=half2
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+half2 _cl_half_log10(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=half3
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+half3 _cl_half_log10(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=half4
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+half4 _cl_half_log10(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=half8
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+half8 _cl_half_log10(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=half16
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+half16 _cl_half_log10(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_log10: VF=float
 // Implement half_log10 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_log10(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_log10: VF=double
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+double _cl_half_log10(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=double2
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+double2 _cl_half_log10(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=double3
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+double3 _cl_half_log10(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=double4
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+double4 _cl_half_log10(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=double8
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+double8 _cl_half_log10(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log10: VF=double16
+// Implement half_log10 directly
+__attribute__((__overloadable__))
+double16 _cl_half_log10(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_log2.cl b/lib/kernel/vecmathlib-pocl/half_log2.cl
index 7169a76..448f864 100644
--- a/lib/kernel/vecmathlib-pocl/half_log2.cl
+++ b/lib/kernel/vecmathlib-pocl/half_log2.cl
@@ -27,6 +27,190 @@
 
 // half_log2: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_log2: VF=half
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+half _cl_half_log2(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=half2
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+half2 _cl_half_log2(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=half3
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+half3 _cl_half_log2(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=half4
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+half4 _cl_half_log2(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=half8
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+half8 _cl_half_log2(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=half16
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+half16 _cl_half_log2(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_log2: VF=float
 // Implement half_log2 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_log2(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_log2: VF=double
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+double _cl_half_log2(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=double2
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+double2 _cl_half_log2(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=double3
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+double3 _cl_half_log2(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=double4
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+double4 _cl_half_log2(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=double8
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+double8 _cl_half_log2(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_log2: VF=double16
+// Implement half_log2 directly
+__attribute__((__overloadable__))
+double16 _cl_half_log2(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_powr.cl b/lib/kernel/vecmathlib-pocl/half_powr.cl
index 84599b5..b2989ca 100644
--- a/lib/kernel/vecmathlib-pocl/half_powr.cl
+++ b/lib/kernel/vecmathlib-pocl/half_powr.cl
@@ -27,6 +27,190 @@
 
 // half_powr: ['VF', 'VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_powr: VF=half
+// Implement half_powr directly
+__attribute__((__overloadable__))
+half _cl_half_powr(half x0, half x1)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=half2
+// Implement half_powr directly
+__attribute__((__overloadable__))
+half2 _cl_half_powr(half2 x0, half2 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=half3
+// Implement half_powr directly
+__attribute__((__overloadable__))
+half3 _cl_half_powr(half3 x0, half3 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=half4
+// Implement half_powr directly
+__attribute__((__overloadable__))
+half4 _cl_half_powr(half4 x0, half4 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=half8
+// Implement half_powr directly
+__attribute__((__overloadable__))
+half8 _cl_half_powr(half8 x0, half8 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=half16
+// Implement half_powr directly
+__attribute__((__overloadable__))
+half16 _cl_half_powr(half16 x0, half16 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_powr: VF=float
 // Implement half_powr directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_powr(float16 x0, float16 x1)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_powr: VF=double
+// Implement half_powr directly
+__attribute__((__overloadable__))
+double _cl_half_powr(double x0, double x1)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=double2
+// Implement half_powr directly
+__attribute__((__overloadable__))
+double2 _cl_half_powr(double2 x0, double2 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=double3
+// Implement half_powr directly
+__attribute__((__overloadable__))
+double3 _cl_half_powr(double3 x0, double3 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=double4
+// Implement half_powr directly
+__attribute__((__overloadable__))
+double4 _cl_half_powr(double4 x0, double4 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=double8
+// Implement half_powr directly
+__attribute__((__overloadable__))
+double8 _cl_half_powr(double8 x0, double8 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_powr: VF=double16
+// Implement half_powr directly
+__attribute__((__overloadable__))
+double16 _cl_half_powr(double16 x0, double16 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_recip.cl b/lib/kernel/vecmathlib-pocl/half_recip.cl
index 5f84a50..5af7db1 100644
--- a/lib/kernel/vecmathlib-pocl/half_recip.cl
+++ b/lib/kernel/vecmathlib-pocl/half_recip.cl
@@ -27,6 +27,190 @@
 
 // half_recip: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_recip: VF=half
+// Implement half_recip directly
+__attribute__((__overloadable__))
+half _cl_half_recip(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=half2
+// Implement half_recip directly
+__attribute__((__overloadable__))
+half2 _cl_half_recip(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=half3
+// Implement half_recip directly
+__attribute__((__overloadable__))
+half3 _cl_half_recip(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=half4
+// Implement half_recip directly
+__attribute__((__overloadable__))
+half4 _cl_half_recip(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=half8
+// Implement half_recip directly
+__attribute__((__overloadable__))
+half8 _cl_half_recip(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=half16
+// Implement half_recip directly
+__attribute__((__overloadable__))
+half16 _cl_half_recip(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_recip: VF=float
 // Implement half_recip directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_recip(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_recip: VF=double
+// Implement half_recip directly
+__attribute__((__overloadable__))
+double _cl_half_recip(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=double2
+// Implement half_recip directly
+__attribute__((__overloadable__))
+double2 _cl_half_recip(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=double3
+// Implement half_recip directly
+__attribute__((__overloadable__))
+double3 _cl_half_recip(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=double4
+// Implement half_recip directly
+__attribute__((__overloadable__))
+double4 _cl_half_recip(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=double8
+// Implement half_recip directly
+__attribute__((__overloadable__))
+double8 _cl_half_recip(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_recip: VF=double16
+// Implement half_recip directly
+__attribute__((__overloadable__))
+double16 _cl_half_recip(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_rsqrt.cl b/lib/kernel/vecmathlib-pocl/half_rsqrt.cl
index 52cf525..e297b60 100644
--- a/lib/kernel/vecmathlib-pocl/half_rsqrt.cl
+++ b/lib/kernel/vecmathlib-pocl/half_rsqrt.cl
@@ -27,6 +27,190 @@
 
 // half_rsqrt: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_rsqrt: VF=half
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+half _cl_half_rsqrt(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=half2
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+half2 _cl_half_rsqrt(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=half3
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+half3 _cl_half_rsqrt(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=half4
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+half4 _cl_half_rsqrt(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=half8
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+half8 _cl_half_rsqrt(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=half16
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+half16 _cl_half_rsqrt(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_rsqrt: VF=float
 // Implement half_rsqrt directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_rsqrt(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_rsqrt: VF=double
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+double _cl_half_rsqrt(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=double2
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+double2 _cl_half_rsqrt(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=double3
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+double3 _cl_half_rsqrt(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=double4
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+double4 _cl_half_rsqrt(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=double8
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+double8 _cl_half_rsqrt(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_rsqrt: VF=double16
+// Implement half_rsqrt directly
+__attribute__((__overloadable__))
+double16 _cl_half_rsqrt(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_sin.cl b/lib/kernel/vecmathlib-pocl/half_sin.cl
index b7000a2..5ef01af 100644
--- a/lib/kernel/vecmathlib-pocl/half_sin.cl
+++ b/lib/kernel/vecmathlib-pocl/half_sin.cl
@@ -27,6 +27,190 @@
 
 // half_sin: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_sin: VF=half
+// Implement half_sin directly
+__attribute__((__overloadable__))
+half _cl_half_sin(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=half2
+// Implement half_sin directly
+__attribute__((__overloadable__))
+half2 _cl_half_sin(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=half3
+// Implement half_sin directly
+__attribute__((__overloadable__))
+half3 _cl_half_sin(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=half4
+// Implement half_sin directly
+__attribute__((__overloadable__))
+half4 _cl_half_sin(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=half8
+// Implement half_sin directly
+__attribute__((__overloadable__))
+half8 _cl_half_sin(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=half16
+// Implement half_sin directly
+__attribute__((__overloadable__))
+half16 _cl_half_sin(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_sin: VF=float
 // Implement half_sin directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_sin(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_sin: VF=double
+// Implement half_sin directly
+__attribute__((__overloadable__))
+double _cl_half_sin(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=double2
+// Implement half_sin directly
+__attribute__((__overloadable__))
+double2 _cl_half_sin(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=double3
+// Implement half_sin directly
+__attribute__((__overloadable__))
+double3 _cl_half_sin(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=double4
+// Implement half_sin directly
+__attribute__((__overloadable__))
+double4 _cl_half_sin(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=double8
+// Implement half_sin directly
+__attribute__((__overloadable__))
+double8 _cl_half_sin(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sin: VF=double16
+// Implement half_sin directly
+__attribute__((__overloadable__))
+double16 _cl_half_sin(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_sqrt.cl b/lib/kernel/vecmathlib-pocl/half_sqrt.cl
index 362bfb2..a9fef6a 100644
--- a/lib/kernel/vecmathlib-pocl/half_sqrt.cl
+++ b/lib/kernel/vecmathlib-pocl/half_sqrt.cl
@@ -27,6 +27,190 @@
 
 // half_sqrt: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_sqrt: VF=half
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+half _cl_half_sqrt(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=half2
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+half2 _cl_half_sqrt(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=half3
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+half3 _cl_half_sqrt(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=half4
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+half4 _cl_half_sqrt(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=half8
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+half8 _cl_half_sqrt(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=half16
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+half16 _cl_half_sqrt(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_sqrt: VF=float
 // Implement half_sqrt directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_sqrt(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_sqrt: VF=double
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+double _cl_half_sqrt(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=double2
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+double2 _cl_half_sqrt(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=double3
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+double3 _cl_half_sqrt(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=double4
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+double4 _cl_half_sqrt(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=double8
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+double8 _cl_half_sqrt(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_sqrt: VF=double16
+// Implement half_sqrt directly
+__attribute__((__overloadable__))
+double16 _cl_half_sqrt(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/half_tan.cl b/lib/kernel/vecmathlib-pocl/half_tan.cl
index 3213410..82d4e76 100644
--- a/lib/kernel/vecmathlib-pocl/half_tan.cl
+++ b/lib/kernel/vecmathlib-pocl/half_tan.cl
@@ -27,6 +27,190 @@
 
 // half_tan: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// half_tan: VF=half
+// Implement half_tan directly
+__attribute__((__overloadable__))
+half _cl_half_tan(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=half2
+// Implement half_tan directly
+__attribute__((__overloadable__))
+half2 _cl_half_tan(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=half3
+// Implement half_tan directly
+__attribute__((__overloadable__))
+half3 _cl_half_tan(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=half4
+// Implement half_tan directly
+__attribute__((__overloadable__))
+half4 _cl_half_tan(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=half8
+// Implement half_tan directly
+__attribute__((__overloadable__))
+half8 _cl_half_tan(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=half16
+// Implement half_tan directly
+__attribute__((__overloadable__))
+half16 _cl_half_tan(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // half_tan: VF=float
 // Implement half_tan directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_half_tan(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// half_tan: VF=double
+// Implement half_tan directly
+__attribute__((__overloadable__))
+double _cl_half_tan(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=double2
+// Implement half_tan directly
+__attribute__((__overloadable__))
+double2 _cl_half_tan(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=double3
+// Implement half_tan directly
+__attribute__((__overloadable__))
+double3 _cl_half_tan(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=double4
+// Implement half_tan directly
+__attribute__((__overloadable__))
+double4 _cl_half_tan(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=double8
+// Implement half_tan directly
+__attribute__((__overloadable__))
+double8 _cl_half_tan(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// half_tan: VF=double16
+// Implement half_tan directly
+__attribute__((__overloadable__))
+double16 _cl_half_tan(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/kernel-vecmathlib.h b/lib/kernel/vecmathlib-pocl/kernel-vecmathlib.h
index b37c4da..c7aa072 100644
--- a/lib/kernel/vecmathlib-pocl/kernel-vecmathlib.h
+++ b/lib/kernel/vecmathlib-pocl/kernel-vecmathlib.h
@@ -3557,282 +3557,730 @@ __attribute__((__overloadable__)) double16 _cl_tanpi(double16 x0);
 // half_cos: ['VF'] -> VF
 #undef half_cos
 #define half_cos _cl_half_cos
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_cos(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_cos(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_cos(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_cos(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_cos(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_cos(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_cos(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_cos(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_cos(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_cos(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_cos(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_cos(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_cos(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_cos(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_cos(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_cos(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_cos(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_cos(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_divide: ['VF', 'VF'] -> VF
 #undef half_divide
 #define half_divide _cl_half_divide
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_divide(half x0, half x1);
+__attribute__((__overloadable__)) half2 _cl_half_divide(half2 x0, half2 x1);
+__attribute__((__overloadable__)) half3 _cl_half_divide(half3 x0, half3 x1);
+__attribute__((__overloadable__)) half4 _cl_half_divide(half4 x0, half4 x1);
+__attribute__((__overloadable__)) half8 _cl_half_divide(half8 x0, half8 x1);
+__attribute__((__overloadable__)) half16 _cl_half_divide(half16 x0, half16 x1);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_divide(float x0, float x1);
 __attribute__((__overloadable__)) float2 _cl_half_divide(float2 x0, float2 x1);
 __attribute__((__overloadable__)) float3 _cl_half_divide(float3 x0, float3 x1);
 __attribute__((__overloadable__)) float4 _cl_half_divide(float4 x0, float4 x1);
 __attribute__((__overloadable__)) float8 _cl_half_divide(float8 x0, float8 x1);
 __attribute__((__overloadable__)) float16 _cl_half_divide(float16 x0, float16 x1);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_divide(double x0, double x1);
+__attribute__((__overloadable__)) double2 _cl_half_divide(double2 x0, double2 x1);
+__attribute__((__overloadable__)) double3 _cl_half_divide(double3 x0, double3 x1);
+__attribute__((__overloadable__)) double4 _cl_half_divide(double4 x0, double4 x1);
+__attribute__((__overloadable__)) double8 _cl_half_divide(double8 x0, double8 x1);
+__attribute__((__overloadable__)) double16 _cl_half_divide(double16 x0, double16 x1);
+#endif // #ifdef cl_khr_fp64
 
 // half_exp: ['VF'] -> VF
 #undef half_exp
 #define half_exp _cl_half_exp
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_exp(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_exp(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_exp(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_exp(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_exp(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_exp(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_exp(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_exp(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_exp(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_exp(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_exp(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_exp(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_exp(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_exp(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_exp(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_exp(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_exp(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_exp(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_exp2: ['VF'] -> VF
 #undef half_exp2
 #define half_exp2 _cl_half_exp2
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_exp2(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_exp2(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_exp2(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_exp2(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_exp2(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_exp2(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_exp2(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_exp2(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_exp2(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_exp2(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_exp2(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_exp2(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_exp2(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_exp2(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_exp2(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_exp2(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_exp2(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_exp2(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_exp10: ['VF'] -> VF
 #undef half_exp10
 #define half_exp10 _cl_half_exp10
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_exp10(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_exp10(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_exp10(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_exp10(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_exp10(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_exp10(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_exp10(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_exp10(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_exp10(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_exp10(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_exp10(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_exp10(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_exp10(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_exp10(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_exp10(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_exp10(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_exp10(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_exp10(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_log: ['VF'] -> VF
 #undef half_log
 #define half_log _cl_half_log
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_log(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_log(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_log(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_log(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_log(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_log(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_log(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_log(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_log(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_log(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_log(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_log(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_log(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_log(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_log(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_log(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_log(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_log(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_log2: ['VF'] -> VF
 #undef half_log2
 #define half_log2 _cl_half_log2
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_log2(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_log2(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_log2(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_log2(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_log2(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_log2(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_log2(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_log2(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_log2(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_log2(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_log2(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_log2(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_log2(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_log2(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_log2(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_log2(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_log2(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_log2(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_log10: ['VF'] -> VF
 #undef half_log10
 #define half_log10 _cl_half_log10
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_log10(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_log10(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_log10(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_log10(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_log10(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_log10(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_log10(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_log10(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_log10(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_log10(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_log10(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_log10(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_log10(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_log10(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_log10(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_log10(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_log10(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_log10(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_powr: ['VF', 'VF'] -> VF
 #undef half_powr
 #define half_powr _cl_half_powr
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_powr(half x0, half x1);
+__attribute__((__overloadable__)) half2 _cl_half_powr(half2 x0, half2 x1);
+__attribute__((__overloadable__)) half3 _cl_half_powr(half3 x0, half3 x1);
+__attribute__((__overloadable__)) half4 _cl_half_powr(half4 x0, half4 x1);
+__attribute__((__overloadable__)) half8 _cl_half_powr(half8 x0, half8 x1);
+__attribute__((__overloadable__)) half16 _cl_half_powr(half16 x0, half16 x1);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_powr(float x0, float x1);
 __attribute__((__overloadable__)) float2 _cl_half_powr(float2 x0, float2 x1);
 __attribute__((__overloadable__)) float3 _cl_half_powr(float3 x0, float3 x1);
 __attribute__((__overloadable__)) float4 _cl_half_powr(float4 x0, float4 x1);
 __attribute__((__overloadable__)) float8 _cl_half_powr(float8 x0, float8 x1);
 __attribute__((__overloadable__)) float16 _cl_half_powr(float16 x0, float16 x1);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_powr(double x0, double x1);
+__attribute__((__overloadable__)) double2 _cl_half_powr(double2 x0, double2 x1);
+__attribute__((__overloadable__)) double3 _cl_half_powr(double3 x0, double3 x1);
+__attribute__((__overloadable__)) double4 _cl_half_powr(double4 x0, double4 x1);
+__attribute__((__overloadable__)) double8 _cl_half_powr(double8 x0, double8 x1);
+__attribute__((__overloadable__)) double16 _cl_half_powr(double16 x0, double16 x1);
+#endif // #ifdef cl_khr_fp64
 
 // half_recip: ['VF'] -> VF
 #undef half_recip
 #define half_recip _cl_half_recip
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_recip(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_recip(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_recip(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_recip(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_recip(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_recip(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_recip(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_recip(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_recip(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_recip(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_recip(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_recip(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_recip(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_recip(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_recip(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_recip(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_recip(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_recip(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_rsqrt: ['VF'] -> VF
 #undef half_rsqrt
 #define half_rsqrt _cl_half_rsqrt
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_rsqrt(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_rsqrt(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_rsqrt(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_rsqrt(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_rsqrt(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_rsqrt(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_rsqrt(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_rsqrt(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_rsqrt(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_rsqrt(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_rsqrt(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_rsqrt(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_rsqrt(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_rsqrt(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_rsqrt(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_rsqrt(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_rsqrt(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_rsqrt(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_sin: ['VF'] -> VF
 #undef half_sin
 #define half_sin _cl_half_sin
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_sin(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_sin(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_sin(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_sin(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_sin(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_sin(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_sin(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_sin(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_sin(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_sin(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_sin(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_sin(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_sin(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_sin(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_sin(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_sin(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_sin(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_sin(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_sqrt: ['VF'] -> VF
 #undef half_sqrt
 #define half_sqrt _cl_half_sqrt
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_sqrt(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_sqrt(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_sqrt(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_sqrt(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_sqrt(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_sqrt(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_sqrt(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_sqrt(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_sqrt(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_sqrt(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_sqrt(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_sqrt(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_sqrt(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_sqrt(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_sqrt(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_sqrt(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_sqrt(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_sqrt(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // half_tan: ['VF'] -> VF
 #undef half_tan
 #define half_tan _cl_half_tan
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_half_tan(half x0);
+__attribute__((__overloadable__)) half2 _cl_half_tan(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_half_tan(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_half_tan(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_half_tan(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_half_tan(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_half_tan(float x0);
 __attribute__((__overloadable__)) float2 _cl_half_tan(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_half_tan(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_half_tan(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_half_tan(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_half_tan(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_half_tan(double x0);
+__attribute__((__overloadable__)) double2 _cl_half_tan(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_half_tan(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_half_tan(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_half_tan(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_half_tan(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_cos: ['VF'] -> VF
 #undef native_cos
 #define native_cos _cl_native_cos
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_cos(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_cos(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_cos(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_cos(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_cos(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_cos(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_cos(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_cos(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_cos(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_cos(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_cos(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_cos(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_cos(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_cos(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_cos(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_cos(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_cos(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_cos(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_divide: ['VF', 'VF'] -> VF
 #undef native_divide
 #define native_divide _cl_native_divide
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_divide(half x0, half x1);
+__attribute__((__overloadable__)) half2 _cl_native_divide(half2 x0, half2 x1);
+__attribute__((__overloadable__)) half3 _cl_native_divide(half3 x0, half3 x1);
+__attribute__((__overloadable__)) half4 _cl_native_divide(half4 x0, half4 x1);
+__attribute__((__overloadable__)) half8 _cl_native_divide(half8 x0, half8 x1);
+__attribute__((__overloadable__)) half16 _cl_native_divide(half16 x0, half16 x1);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_divide(float x0, float x1);
 __attribute__((__overloadable__)) float2 _cl_native_divide(float2 x0, float2 x1);
 __attribute__((__overloadable__)) float3 _cl_native_divide(float3 x0, float3 x1);
 __attribute__((__overloadable__)) float4 _cl_native_divide(float4 x0, float4 x1);
 __attribute__((__overloadable__)) float8 _cl_native_divide(float8 x0, float8 x1);
 __attribute__((__overloadable__)) float16 _cl_native_divide(float16 x0, float16 x1);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_divide(double x0, double x1);
+__attribute__((__overloadable__)) double2 _cl_native_divide(double2 x0, double2 x1);
+__attribute__((__overloadable__)) double3 _cl_native_divide(double3 x0, double3 x1);
+__attribute__((__overloadable__)) double4 _cl_native_divide(double4 x0, double4 x1);
+__attribute__((__overloadable__)) double8 _cl_native_divide(double8 x0, double8 x1);
+__attribute__((__overloadable__)) double16 _cl_native_divide(double16 x0, double16 x1);
+#endif // #ifdef cl_khr_fp64
 
 // native_exp: ['VF'] -> VF
 #undef native_exp
 #define native_exp _cl_native_exp
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_exp(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_exp(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_exp(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_exp(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_exp(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_exp(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_exp(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_exp(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_exp(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_exp(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_exp(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_exp(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_exp(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_exp(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_exp(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_exp(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_exp(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_exp(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_exp2: ['VF'] -> VF
 #undef native_exp2
 #define native_exp2 _cl_native_exp2
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_exp2(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_exp2(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_exp2(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_exp2(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_exp2(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_exp2(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_exp2(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_exp2(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_exp2(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_exp2(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_exp2(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_exp2(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_exp2(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_exp2(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_exp2(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_exp2(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_exp2(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_exp2(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_exp10: ['VF'] -> VF
 #undef native_exp10
 #define native_exp10 _cl_native_exp10
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_exp10(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_exp10(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_exp10(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_exp10(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_exp10(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_exp10(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_exp10(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_exp10(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_exp10(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_exp10(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_exp10(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_exp10(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_exp10(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_exp10(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_exp10(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_exp10(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_exp10(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_exp10(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_log: ['VF'] -> VF
 #undef native_log
 #define native_log _cl_native_log
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_log(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_log(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_log(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_log(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_log(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_log(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_log(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_log(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_log(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_log(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_log(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_log(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_log(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_log(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_log(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_log(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_log(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_log(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_log2: ['VF'] -> VF
 #undef native_log2
 #define native_log2 _cl_native_log2
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_log2(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_log2(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_log2(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_log2(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_log2(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_log2(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_log2(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_log2(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_log2(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_log2(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_log2(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_log2(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_log2(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_log2(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_log2(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_log2(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_log2(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_log2(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_log10: ['VF'] -> VF
 #undef native_log10
 #define native_log10 _cl_native_log10
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_log10(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_log10(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_log10(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_log10(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_log10(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_log10(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_log10(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_log10(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_log10(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_log10(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_log10(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_log10(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_log10(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_log10(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_log10(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_log10(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_log10(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_log10(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_powr: ['VF', 'VF'] -> VF
 #undef native_powr
 #define native_powr _cl_native_powr
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_powr(half x0, half x1);
+__attribute__((__overloadable__)) half2 _cl_native_powr(half2 x0, half2 x1);
+__attribute__((__overloadable__)) half3 _cl_native_powr(half3 x0, half3 x1);
+__attribute__((__overloadable__)) half4 _cl_native_powr(half4 x0, half4 x1);
+__attribute__((__overloadable__)) half8 _cl_native_powr(half8 x0, half8 x1);
+__attribute__((__overloadable__)) half16 _cl_native_powr(half16 x0, half16 x1);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_powr(float x0, float x1);
 __attribute__((__overloadable__)) float2 _cl_native_powr(float2 x0, float2 x1);
 __attribute__((__overloadable__)) float3 _cl_native_powr(float3 x0, float3 x1);
 __attribute__((__overloadable__)) float4 _cl_native_powr(float4 x0, float4 x1);
 __attribute__((__overloadable__)) float8 _cl_native_powr(float8 x0, float8 x1);
 __attribute__((__overloadable__)) float16 _cl_native_powr(float16 x0, float16 x1);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_powr(double x0, double x1);
+__attribute__((__overloadable__)) double2 _cl_native_powr(double2 x0, double2 x1);
+__attribute__((__overloadable__)) double3 _cl_native_powr(double3 x0, double3 x1);
+__attribute__((__overloadable__)) double4 _cl_native_powr(double4 x0, double4 x1);
+__attribute__((__overloadable__)) double8 _cl_native_powr(double8 x0, double8 x1);
+__attribute__((__overloadable__)) double16 _cl_native_powr(double16 x0, double16 x1);
+#endif // #ifdef cl_khr_fp64
 
 // native_recip: ['VF'] -> VF
 #undef native_recip
 #define native_recip _cl_native_recip
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_recip(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_recip(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_recip(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_recip(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_recip(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_recip(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_recip(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_recip(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_recip(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_recip(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_recip(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_recip(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_recip(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_recip(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_recip(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_recip(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_recip(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_recip(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_rsqrt: ['VF'] -> VF
 #undef native_rsqrt
 #define native_rsqrt _cl_native_rsqrt
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_rsqrt(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_rsqrt(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_rsqrt(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_rsqrt(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_rsqrt(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_rsqrt(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_rsqrt(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_rsqrt(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_rsqrt(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_rsqrt(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_rsqrt(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_rsqrt(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_rsqrt(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_rsqrt(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_rsqrt(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_rsqrt(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_rsqrt(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_rsqrt(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_sin: ['VF'] -> VF
 #undef native_sin
 #define native_sin _cl_native_sin
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_sin(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_sin(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_sin(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_sin(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_sin(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_sin(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_sin(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_sin(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_sin(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_sin(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_sin(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_sin(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_sin(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_sin(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_sin(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_sin(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_sin(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_sin(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_sqrt: ['VF'] -> VF
 #undef native_sqrt
 #define native_sqrt _cl_native_sqrt
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_sqrt(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_sqrt(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_sqrt(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_sqrt(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_sqrt(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_sqrt(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_sqrt(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_sqrt(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_sqrt(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_sqrt(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_sqrt(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_sqrt(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_sqrt(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_sqrt(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_sqrt(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_sqrt(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_sqrt(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_sqrt(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // native_tan: ['VF'] -> VF
 #undef native_tan
 #define native_tan _cl_native_tan
+#ifdef cl_khr_fp16
+__attribute__((__overloadable__)) half _cl_native_tan(half x0);
+__attribute__((__overloadable__)) half2 _cl_native_tan(half2 x0);
+__attribute__((__overloadable__)) half3 _cl_native_tan(half3 x0);
+__attribute__((__overloadable__)) half4 _cl_native_tan(half4 x0);
+__attribute__((__overloadable__)) half8 _cl_native_tan(half8 x0);
+__attribute__((__overloadable__)) half16 _cl_native_tan(half16 x0);
+#endif // #ifdef cl_khr_fp16
 __attribute__((__overloadable__)) float _cl_native_tan(float x0);
 __attribute__((__overloadable__)) float2 _cl_native_tan(float2 x0);
 __attribute__((__overloadable__)) float3 _cl_native_tan(float3 x0);
 __attribute__((__overloadable__)) float4 _cl_native_tan(float4 x0);
 __attribute__((__overloadable__)) float8 _cl_native_tan(float8 x0);
 __attribute__((__overloadable__)) float16 _cl_native_tan(float16 x0);
+#ifdef cl_khr_fp64
+__attribute__((__overloadable__)) double _cl_native_tan(double x0);
+__attribute__((__overloadable__)) double2 _cl_native_tan(double2 x0);
+__attribute__((__overloadable__)) double3 _cl_native_tan(double3 x0);
+__attribute__((__overloadable__)) double4 _cl_native_tan(double4 x0);
+__attribute__((__overloadable__)) double8 _cl_native_tan(double8 x0);
+__attribute__((__overloadable__)) double16 _cl_native_tan(double16 x0);
+#endif // #ifdef cl_khr_fp64
 
 // clamp: ['VF', 'VF', 'VF'] -> VF
 #undef clamp
diff --git a/lib/kernel/vecmathlib-pocl/native_cos.cl b/lib/kernel/vecmathlib-pocl/native_cos.cl
index 77b155a..fd2cfa6 100644
--- a/lib/kernel/vecmathlib-pocl/native_cos.cl
+++ b/lib/kernel/vecmathlib-pocl/native_cos.cl
@@ -27,6 +27,190 @@
 
 // native_cos: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_cos: VF=half
+// Implement native_cos directly
+__attribute__((__overloadable__))
+half _cl_native_cos(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=half2
+// Implement native_cos directly
+__attribute__((__overloadable__))
+half2 _cl_native_cos(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=half3
+// Implement native_cos directly
+__attribute__((__overloadable__))
+half3 _cl_native_cos(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=half4
+// Implement native_cos directly
+__attribute__((__overloadable__))
+half4 _cl_native_cos(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=half8
+// Implement native_cos directly
+__attribute__((__overloadable__))
+half8 _cl_native_cos(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=half16
+// Implement native_cos directly
+__attribute__((__overloadable__))
+half16 _cl_native_cos(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_cos: VF=float
 // Implement native_cos directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_cos(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_cos: VF=double
+// Implement native_cos directly
+__attribute__((__overloadable__))
+double _cl_native_cos(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=double2
+// Implement native_cos directly
+__attribute__((__overloadable__))
+double2 _cl_native_cos(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=double3
+// Implement native_cos directly
+__attribute__((__overloadable__))
+double3 _cl_native_cos(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=double4
+// Implement native_cos directly
+__attribute__((__overloadable__))
+double4 _cl_native_cos(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=double8
+// Implement native_cos directly
+__attribute__((__overloadable__))
+double8 _cl_native_cos(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_cos: VF=double16
+// Implement native_cos directly
+__attribute__((__overloadable__))
+double16 _cl_native_cos(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return cos(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_divide.cl b/lib/kernel/vecmathlib-pocl/native_divide.cl
index 7acaab8..3a3fd22 100644
--- a/lib/kernel/vecmathlib-pocl/native_divide.cl
+++ b/lib/kernel/vecmathlib-pocl/native_divide.cl
@@ -27,6 +27,190 @@
 
 // native_divide: ['VF', 'VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_divide: VF=half
+// Implement native_divide directly
+__attribute__((__overloadable__))
+half _cl_native_divide(half x0, half x1)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=half2
+// Implement native_divide directly
+__attribute__((__overloadable__))
+half2 _cl_native_divide(half2 x0, half2 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=half3
+// Implement native_divide directly
+__attribute__((__overloadable__))
+half3 _cl_native_divide(half3 x0, half3 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=half4
+// Implement native_divide directly
+__attribute__((__overloadable__))
+half4 _cl_native_divide(half4 x0, half4 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=half8
+// Implement native_divide directly
+__attribute__((__overloadable__))
+half8 _cl_native_divide(half8 x0, half8 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=half16
+// Implement native_divide directly
+__attribute__((__overloadable__))
+half16 _cl_native_divide(half16 x0, half16 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_divide: VF=float
 // Implement native_divide directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_divide(float16 x0, float16 x1)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_divide: VF=double
+// Implement native_divide directly
+__attribute__((__overloadable__))
+double _cl_native_divide(double x0, double x1)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=double2
+// Implement native_divide directly
+__attribute__((__overloadable__))
+double2 _cl_native_divide(double2 x0, double2 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=double3
+// Implement native_divide directly
+__attribute__((__overloadable__))
+double3 _cl_native_divide(double3 x0, double3 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=double4
+// Implement native_divide directly
+__attribute__((__overloadable__))
+double4 _cl_native_divide(double4 x0, double4 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=double8
+// Implement native_divide directly
+__attribute__((__overloadable__))
+double8 _cl_native_divide(double8 x0, double8 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_divide: VF=double16
+// Implement native_divide directly
+__attribute__((__overloadable__))
+double16 _cl_native_divide(double16 x0, double16 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return x0/x1;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_exp.cl b/lib/kernel/vecmathlib-pocl/native_exp.cl
index 7c42a71..96117c1 100644
--- a/lib/kernel/vecmathlib-pocl/native_exp.cl
+++ b/lib/kernel/vecmathlib-pocl/native_exp.cl
@@ -27,6 +27,190 @@
 
 // native_exp: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_exp: VF=half
+// Implement native_exp directly
+__attribute__((__overloadable__))
+half _cl_native_exp(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=half2
+// Implement native_exp directly
+__attribute__((__overloadable__))
+half2 _cl_native_exp(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=half3
+// Implement native_exp directly
+__attribute__((__overloadable__))
+half3 _cl_native_exp(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=half4
+// Implement native_exp directly
+__attribute__((__overloadable__))
+half4 _cl_native_exp(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=half8
+// Implement native_exp directly
+__attribute__((__overloadable__))
+half8 _cl_native_exp(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=half16
+// Implement native_exp directly
+__attribute__((__overloadable__))
+half16 _cl_native_exp(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_exp: VF=float
 // Implement native_exp directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_exp(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_exp: VF=double
+// Implement native_exp directly
+__attribute__((__overloadable__))
+double _cl_native_exp(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=double2
+// Implement native_exp directly
+__attribute__((__overloadable__))
+double2 _cl_native_exp(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=double3
+// Implement native_exp directly
+__attribute__((__overloadable__))
+double3 _cl_native_exp(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=double4
+// Implement native_exp directly
+__attribute__((__overloadable__))
+double4 _cl_native_exp(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=double8
+// Implement native_exp directly
+__attribute__((__overloadable__))
+double8 _cl_native_exp(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp: VF=double16
+// Implement native_exp directly
+__attribute__((__overloadable__))
+double16 _cl_native_exp(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return exp(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_exp10.cl b/lib/kernel/vecmathlib-pocl/native_exp10.cl
index ac38f0f..ddc9e2e 100644
--- a/lib/kernel/vecmathlib-pocl/native_exp10.cl
+++ b/lib/kernel/vecmathlib-pocl/native_exp10.cl
@@ -27,6 +27,190 @@
 
 // native_exp10: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_exp10: VF=half
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+half _cl_native_exp10(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=half2
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+half2 _cl_native_exp10(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=half3
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+half3 _cl_native_exp10(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=half4
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+half4 _cl_native_exp10(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=half8
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+half8 _cl_native_exp10(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=half16
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+half16 _cl_native_exp10(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_exp10: VF=float
 // Implement native_exp10 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_exp10(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_exp10: VF=double
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+double _cl_native_exp10(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=double2
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+double2 _cl_native_exp10(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=double3
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+double3 _cl_native_exp10(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=double4
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+double4 _cl_native_exp10(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=double8
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+double8 _cl_native_exp10(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp10: VF=double16
+// Implement native_exp10 directly
+__attribute__((__overloadable__))
+double16 _cl_native_exp10(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return exp10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_exp2.cl b/lib/kernel/vecmathlib-pocl/native_exp2.cl
index a7e743a..47daf10 100644
--- a/lib/kernel/vecmathlib-pocl/native_exp2.cl
+++ b/lib/kernel/vecmathlib-pocl/native_exp2.cl
@@ -27,6 +27,190 @@
 
 // native_exp2: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_exp2: VF=half
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+half _cl_native_exp2(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=half2
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+half2 _cl_native_exp2(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=half3
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+half3 _cl_native_exp2(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=half4
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+half4 _cl_native_exp2(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=half8
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+half8 _cl_native_exp2(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=half16
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+half16 _cl_native_exp2(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_exp2: VF=float
 // Implement native_exp2 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_exp2(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_exp2: VF=double
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+double _cl_native_exp2(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=double2
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+double2 _cl_native_exp2(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=double3
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+double3 _cl_native_exp2(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=double4
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+double4 _cl_native_exp2(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=double8
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+double8 _cl_native_exp2(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_exp2: VF=double16
+// Implement native_exp2 directly
+__attribute__((__overloadable__))
+double16 _cl_native_exp2(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return exp2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_log.cl b/lib/kernel/vecmathlib-pocl/native_log.cl
index e764214..93a010d 100644
--- a/lib/kernel/vecmathlib-pocl/native_log.cl
+++ b/lib/kernel/vecmathlib-pocl/native_log.cl
@@ -27,6 +27,190 @@
 
 // native_log: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_log: VF=half
+// Implement native_log directly
+__attribute__((__overloadable__))
+half _cl_native_log(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=half2
+// Implement native_log directly
+__attribute__((__overloadable__))
+half2 _cl_native_log(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=half3
+// Implement native_log directly
+__attribute__((__overloadable__))
+half3 _cl_native_log(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=half4
+// Implement native_log directly
+__attribute__((__overloadable__))
+half4 _cl_native_log(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=half8
+// Implement native_log directly
+__attribute__((__overloadable__))
+half8 _cl_native_log(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=half16
+// Implement native_log directly
+__attribute__((__overloadable__))
+half16 _cl_native_log(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_log: VF=float
 // Implement native_log directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_log(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_log: VF=double
+// Implement native_log directly
+__attribute__((__overloadable__))
+double _cl_native_log(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=double2
+// Implement native_log directly
+__attribute__((__overloadable__))
+double2 _cl_native_log(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=double3
+// Implement native_log directly
+__attribute__((__overloadable__))
+double3 _cl_native_log(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=double4
+// Implement native_log directly
+__attribute__((__overloadable__))
+double4 _cl_native_log(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=double8
+// Implement native_log directly
+__attribute__((__overloadable__))
+double8 _cl_native_log(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log: VF=double16
+// Implement native_log directly
+__attribute__((__overloadable__))
+double16 _cl_native_log(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return log(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_log10.cl b/lib/kernel/vecmathlib-pocl/native_log10.cl
index 3d3766c..85cc3dc 100644
--- a/lib/kernel/vecmathlib-pocl/native_log10.cl
+++ b/lib/kernel/vecmathlib-pocl/native_log10.cl
@@ -27,6 +27,190 @@
 
 // native_log10: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_log10: VF=half
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+half _cl_native_log10(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=half2
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+half2 _cl_native_log10(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=half3
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+half3 _cl_native_log10(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=half4
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+half4 _cl_native_log10(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=half8
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+half8 _cl_native_log10(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=half16
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+half16 _cl_native_log10(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_log10: VF=float
 // Implement native_log10 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_log10(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_log10: VF=double
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+double _cl_native_log10(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=double2
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+double2 _cl_native_log10(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=double3
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+double3 _cl_native_log10(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=double4
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+double4 _cl_native_log10(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=double8
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+double8 _cl_native_log10(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log10: VF=double16
+// Implement native_log10 directly
+__attribute__((__overloadable__))
+double16 _cl_native_log10(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return log10(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_log2.cl b/lib/kernel/vecmathlib-pocl/native_log2.cl
index 9b690f1..1f5239e 100644
--- a/lib/kernel/vecmathlib-pocl/native_log2.cl
+++ b/lib/kernel/vecmathlib-pocl/native_log2.cl
@@ -27,6 +27,190 @@
 
 // native_log2: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_log2: VF=half
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+half _cl_native_log2(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=half2
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+half2 _cl_native_log2(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=half3
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+half3 _cl_native_log2(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=half4
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+half4 _cl_native_log2(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=half8
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+half8 _cl_native_log2(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=half16
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+half16 _cl_native_log2(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_log2: VF=float
 // Implement native_log2 directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_log2(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_log2: VF=double
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+double _cl_native_log2(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=double2
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+double2 _cl_native_log2(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=double3
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+double3 _cl_native_log2(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=double4
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+double4 _cl_native_log2(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=double8
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+double8 _cl_native_log2(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_log2: VF=double16
+// Implement native_log2 directly
+__attribute__((__overloadable__))
+double16 _cl_native_log2(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return log2(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_powr.cl b/lib/kernel/vecmathlib-pocl/native_powr.cl
index b13f852..1893adb 100644
--- a/lib/kernel/vecmathlib-pocl/native_powr.cl
+++ b/lib/kernel/vecmathlib-pocl/native_powr.cl
@@ -27,6 +27,190 @@
 
 // native_powr: ['VF', 'VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_powr: VF=half
+// Implement native_powr directly
+__attribute__((__overloadable__))
+half _cl_native_powr(half x0, half x1)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=half2
+// Implement native_powr directly
+__attribute__((__overloadable__))
+half2 _cl_native_powr(half2 x0, half2 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=half3
+// Implement native_powr directly
+__attribute__((__overloadable__))
+half3 _cl_native_powr(half3 x0, half3 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=half4
+// Implement native_powr directly
+__attribute__((__overloadable__))
+half4 _cl_native_powr(half4 x0, half4 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=half8
+// Implement native_powr directly
+__attribute__((__overloadable__))
+half8 _cl_native_powr(half8 x0, half8 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=half16
+// Implement native_powr directly
+__attribute__((__overloadable__))
+half16 _cl_native_powr(half16 x0, half16 x1)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_powr: VF=float
 // Implement native_powr directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_powr(float16 x0, float16 x1)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_powr: VF=double
+// Implement native_powr directly
+__attribute__((__overloadable__))
+double _cl_native_powr(double x0, double x1)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=double2
+// Implement native_powr directly
+__attribute__((__overloadable__))
+double2 _cl_native_powr(double2 x0, double2 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=double3
+// Implement native_powr directly
+__attribute__((__overloadable__))
+double3 _cl_native_powr(double3 x0, double3 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=double4
+// Implement native_powr directly
+__attribute__((__overloadable__))
+double4 _cl_native_powr(double4 x0, double4 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=double8
+// Implement native_powr directly
+__attribute__((__overloadable__))
+double8 _cl_native_powr(double8 x0, double8 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_powr: VF=double16
+// Implement native_powr directly
+__attribute__((__overloadable__))
+double16 _cl_native_powr(double16 x0, double16 x1)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return powr(x0,x1);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_recip.cl b/lib/kernel/vecmathlib-pocl/native_recip.cl
index 4056940..2a502eb 100644
--- a/lib/kernel/vecmathlib-pocl/native_recip.cl
+++ b/lib/kernel/vecmathlib-pocl/native_recip.cl
@@ -27,6 +27,190 @@
 
 // native_recip: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_recip: VF=half
+// Implement native_recip directly
+__attribute__((__overloadable__))
+half _cl_native_recip(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=half2
+// Implement native_recip directly
+__attribute__((__overloadable__))
+half2 _cl_native_recip(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=half3
+// Implement native_recip directly
+__attribute__((__overloadable__))
+half3 _cl_native_recip(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=half4
+// Implement native_recip directly
+__attribute__((__overloadable__))
+half4 _cl_native_recip(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=half8
+// Implement native_recip directly
+__attribute__((__overloadable__))
+half8 _cl_native_recip(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=half16
+// Implement native_recip directly
+__attribute__((__overloadable__))
+half16 _cl_native_recip(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_recip: VF=float
 // Implement native_recip directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_recip(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_recip: VF=double
+// Implement native_recip directly
+__attribute__((__overloadable__))
+double _cl_native_recip(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=double2
+// Implement native_recip directly
+__attribute__((__overloadable__))
+double2 _cl_native_recip(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=double3
+// Implement native_recip directly
+__attribute__((__overloadable__))
+double3 _cl_native_recip(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=double4
+// Implement native_recip directly
+__attribute__((__overloadable__))
+double4 _cl_native_recip(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=double8
+// Implement native_recip directly
+__attribute__((__overloadable__))
+double8 _cl_native_recip(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_recip: VF=double16
+// Implement native_recip directly
+__attribute__((__overloadable__))
+double16 _cl_native_recip(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return (scalar_t)1/x0;
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_rsqrt.cl b/lib/kernel/vecmathlib-pocl/native_rsqrt.cl
index a0859a1..732d959 100644
--- a/lib/kernel/vecmathlib-pocl/native_rsqrt.cl
+++ b/lib/kernel/vecmathlib-pocl/native_rsqrt.cl
@@ -27,6 +27,190 @@
 
 // native_rsqrt: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_rsqrt: VF=half
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+half _cl_native_rsqrt(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=half2
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+half2 _cl_native_rsqrt(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=half3
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+half3 _cl_native_rsqrt(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=half4
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+half4 _cl_native_rsqrt(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=half8
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+half8 _cl_native_rsqrt(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=half16
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+half16 _cl_native_rsqrt(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_rsqrt: VF=float
 // Implement native_rsqrt directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_rsqrt(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_rsqrt: VF=double
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+double _cl_native_rsqrt(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=double2
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+double2 _cl_native_rsqrt(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=double3
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+double3 _cl_native_rsqrt(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=double4
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+double4 _cl_native_rsqrt(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=double8
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+double8 _cl_native_rsqrt(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_rsqrt: VF=double16
+// Implement native_rsqrt directly
+__attribute__((__overloadable__))
+double16 _cl_native_rsqrt(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return rsqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_sin.cl b/lib/kernel/vecmathlib-pocl/native_sin.cl
index 12dc999..b00f89c 100644
--- a/lib/kernel/vecmathlib-pocl/native_sin.cl
+++ b/lib/kernel/vecmathlib-pocl/native_sin.cl
@@ -27,6 +27,190 @@
 
 // native_sin: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_sin: VF=half
+// Implement native_sin directly
+__attribute__((__overloadable__))
+half _cl_native_sin(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=half2
+// Implement native_sin directly
+__attribute__((__overloadable__))
+half2 _cl_native_sin(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=half3
+// Implement native_sin directly
+__attribute__((__overloadable__))
+half3 _cl_native_sin(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=half4
+// Implement native_sin directly
+__attribute__((__overloadable__))
+half4 _cl_native_sin(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=half8
+// Implement native_sin directly
+__attribute__((__overloadable__))
+half8 _cl_native_sin(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=half16
+// Implement native_sin directly
+__attribute__((__overloadable__))
+half16 _cl_native_sin(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_sin: VF=float
 // Implement native_sin directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_sin(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_sin: VF=double
+// Implement native_sin directly
+__attribute__((__overloadable__))
+double _cl_native_sin(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=double2
+// Implement native_sin directly
+__attribute__((__overloadable__))
+double2 _cl_native_sin(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=double3
+// Implement native_sin directly
+__attribute__((__overloadable__))
+double3 _cl_native_sin(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=double4
+// Implement native_sin directly
+__attribute__((__overloadable__))
+double4 _cl_native_sin(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=double8
+// Implement native_sin directly
+__attribute__((__overloadable__))
+double8 _cl_native_sin(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sin: VF=double16
+// Implement native_sin directly
+__attribute__((__overloadable__))
+double16 _cl_native_sin(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return sin(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_sqrt.cl b/lib/kernel/vecmathlib-pocl/native_sqrt.cl
index cdcbb48..27fe859 100644
--- a/lib/kernel/vecmathlib-pocl/native_sqrt.cl
+++ b/lib/kernel/vecmathlib-pocl/native_sqrt.cl
@@ -27,6 +27,190 @@
 
 // native_sqrt: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_sqrt: VF=half
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+half _cl_native_sqrt(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=half2
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+half2 _cl_native_sqrt(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=half3
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+half3 _cl_native_sqrt(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=half4
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+half4 _cl_native_sqrt(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=half8
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+half8 _cl_native_sqrt(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=half16
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+half16 _cl_native_sqrt(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_sqrt: VF=float
 // Implement native_sqrt directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_sqrt(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_sqrt: VF=double
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+double _cl_native_sqrt(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=double2
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+double2 _cl_native_sqrt(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=double3
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+double3 _cl_native_sqrt(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=double4
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+double4 _cl_native_sqrt(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=double8
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+double8 _cl_native_sqrt(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_sqrt: VF=double16
+// Implement native_sqrt directly
+__attribute__((__overloadable__))
+double16 _cl_native_sqrt(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return sqrt(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib-pocl/native_tan.cl b/lib/kernel/vecmathlib-pocl/native_tan.cl
index 5601138..3266ee8 100644
--- a/lib/kernel/vecmathlib-pocl/native_tan.cl
+++ b/lib/kernel/vecmathlib-pocl/native_tan.cl
@@ -27,6 +27,190 @@
 
 // native_tan: ['VF'] -> VF
 
+#ifdef cl_khr_fp16
+
+// native_tan: VF=half
+// Implement native_tan directly
+__attribute__((__overloadable__))
+half _cl_native_tan(half x0)
+{
+  typedef short iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef half vector_t;
+#define convert_ivector_t convert_short
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_half
+#define ilogb_ _cl_ilogb_half
+#define ldexp_scalar_ _cl_ldexp_half_short
+#define ldexp_vector_ _cl_ldexp_half_short
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=half2
+// Implement native_tan directly
+__attribute__((__overloadable__))
+half2 _cl_native_tan(half2 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short2 ivector_t;
+  typedef short2 jvector_t;
+  typedef int2 kvector_t;
+  typedef half2 vector_t;
+#define convert_ivector_t convert_short2
+#define convert_jvector_t convert_short2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_half2
+#define ilogb_ _cl_ilogb_half2
+#define ldexp_scalar_ _cl_ldexp_half2_short
+#define ldexp_vector_ _cl_ldexp_half2_short2
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=half3
+// Implement native_tan directly
+__attribute__((__overloadable__))
+half3 _cl_native_tan(half3 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short3 ivector_t;
+  typedef short3 jvector_t;
+  typedef int3 kvector_t;
+  typedef half3 vector_t;
+#define convert_ivector_t convert_short3
+#define convert_jvector_t convert_short3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_half3
+#define ilogb_ _cl_ilogb_half3
+#define ldexp_scalar_ _cl_ldexp_half3_short
+#define ldexp_vector_ _cl_ldexp_half3_short3
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=half4
+// Implement native_tan directly
+__attribute__((__overloadable__))
+half4 _cl_native_tan(half4 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short4 ivector_t;
+  typedef short4 jvector_t;
+  typedef int4 kvector_t;
+  typedef half4 vector_t;
+#define convert_ivector_t convert_short4
+#define convert_jvector_t convert_short4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_half4
+#define ilogb_ _cl_ilogb_half4
+#define ldexp_scalar_ _cl_ldexp_half4_short
+#define ldexp_vector_ _cl_ldexp_half4_short4
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=half8
+// Implement native_tan directly
+__attribute__((__overloadable__))
+half8 _cl_native_tan(half8 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short8 ivector_t;
+  typedef short8 jvector_t;
+  typedef int8 kvector_t;
+  typedef half8 vector_t;
+#define convert_ivector_t convert_short8
+#define convert_jvector_t convert_short8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_half8
+#define ilogb_ _cl_ilogb_half8
+#define ldexp_scalar_ _cl_ldexp_half8_short
+#define ldexp_vector_ _cl_ldexp_half8_short8
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=half16
+// Implement native_tan directly
+__attribute__((__overloadable__))
+half16 _cl_native_tan(half16 x0)
+{
+  typedef short iscalar_t;
+  typedef short jscalar_t;
+  typedef int kscalar_t;
+  typedef half scalar_t;
+  typedef short16 ivector_t;
+  typedef short16 jvector_t;
+  typedef int16 kvector_t;
+  typedef half16 vector_t;
+#define convert_ivector_t convert_short16
+#define convert_jvector_t convert_short16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_half16
+#define ilogb_ _cl_ilogb_half16
+#define ldexp_scalar_ _cl_ldexp_half16_short
+#define ldexp_vector_ _cl_ldexp_half16_short16
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp16
+
 // native_tan: VF=float
 // Implement native_tan directly
 __attribute__((__overloadable__))
@@ -206,3 +390,187 @@ float16 _cl_native_tan(float16 x0)
 #undef ldexp_scalar_
 #undef ldexp_vector_
 }
+
+#ifdef cl_khr_fp64
+
+// native_tan: VF=double
+// Implement native_tan directly
+__attribute__((__overloadable__))
+double _cl_native_tan(double x0)
+{
+  typedef long iscalar_t;
+  typedef int jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long ivector_t;
+  typedef int jvector_t;
+  typedef int kvector_t;
+  typedef double vector_t;
+#define convert_ivector_t convert_long
+#define convert_jvector_t convert_int
+#define convert_kvector_t convert_int
+#define convert_vector_t convert_double
+#define ilogb_ _cl_ilogb_double
+#define ldexp_scalar_ _cl_ldexp_double_long
+#define ldexp_vector_ _cl_ldexp_double_long
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=double2
+// Implement native_tan directly
+__attribute__((__overloadable__))
+double2 _cl_native_tan(double2 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long2 ivector_t;
+  typedef long2 jvector_t;
+  typedef int2 kvector_t;
+  typedef double2 vector_t;
+#define convert_ivector_t convert_long2
+#define convert_jvector_t convert_long2
+#define convert_kvector_t convert_int2
+#define convert_vector_t convert_double2
+#define ilogb_ _cl_ilogb_double2
+#define ldexp_scalar_ _cl_ldexp_double2_long
+#define ldexp_vector_ _cl_ldexp_double2_long2
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=double3
+// Implement native_tan directly
+__attribute__((__overloadable__))
+double3 _cl_native_tan(double3 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long3 ivector_t;
+  typedef long3 jvector_t;
+  typedef int3 kvector_t;
+  typedef double3 vector_t;
+#define convert_ivector_t convert_long3
+#define convert_jvector_t convert_long3
+#define convert_kvector_t convert_int3
+#define convert_vector_t convert_double3
+#define ilogb_ _cl_ilogb_double3
+#define ldexp_scalar_ _cl_ldexp_double3_long
+#define ldexp_vector_ _cl_ldexp_double3_long3
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=double4
+// Implement native_tan directly
+__attribute__((__overloadable__))
+double4 _cl_native_tan(double4 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long4 ivector_t;
+  typedef long4 jvector_t;
+  typedef int4 kvector_t;
+  typedef double4 vector_t;
+#define convert_ivector_t convert_long4
+#define convert_jvector_t convert_long4
+#define convert_kvector_t convert_int4
+#define convert_vector_t convert_double4
+#define ilogb_ _cl_ilogb_double4
+#define ldexp_scalar_ _cl_ldexp_double4_long
+#define ldexp_vector_ _cl_ldexp_double4_long4
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=double8
+// Implement native_tan directly
+__attribute__((__overloadable__))
+double8 _cl_native_tan(double8 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long8 ivector_t;
+  typedef long8 jvector_t;
+  typedef int8 kvector_t;
+  typedef double8 vector_t;
+#define convert_ivector_t convert_long8
+#define convert_jvector_t convert_long8
+#define convert_kvector_t convert_int8
+#define convert_vector_t convert_double8
+#define ilogb_ _cl_ilogb_double8
+#define ldexp_scalar_ _cl_ldexp_double8_long
+#define ldexp_vector_ _cl_ldexp_double8_long8
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+// native_tan: VF=double16
+// Implement native_tan directly
+__attribute__((__overloadable__))
+double16 _cl_native_tan(double16 x0)
+{
+  typedef long iscalar_t;
+  typedef long jscalar_t;
+  typedef int kscalar_t;
+  typedef double scalar_t;
+  typedef long16 ivector_t;
+  typedef long16 jvector_t;
+  typedef int16 kvector_t;
+  typedef double16 vector_t;
+#define convert_ivector_t convert_long16
+#define convert_jvector_t convert_long16
+#define convert_kvector_t convert_int16
+#define convert_vector_t convert_double16
+#define ilogb_ _cl_ilogb_double16
+#define ldexp_scalar_ _cl_ldexp_double16_long
+#define ldexp_vector_ _cl_ldexp_double16_long16
+  return tan(x0);
+#undef convert_ivector_t
+#undef convert_jvector_t
+#undef convert_kvector_t
+#undef convert_vector_t
+#undef ilogb_
+#undef ldexp_scalar_
+#undef ldexp_vector_
+}
+
+#endif // #ifdef cl_khr_fp64
diff --git a/lib/kernel/vecmathlib/mathfuncs_asin.h b/lib/kernel/vecmathlib/mathfuncs_asin.h
index cd174a2..16d4c5b 100644
--- a/lib/kernel/vecmathlib/mathfuncs_asin.h
+++ b/lib/kernel/vecmathlib/mathfuncs_asin.h
@@ -97,6 +97,8 @@ realvec_t mathfuncs<realvec_t>::vml_asin(realvec_t d) {
 
 template <typename realvec_t>
 realvec_t mathfuncs<realvec_t>::vml_acos(realvec_t d) {
+  // negative zero has the same (positive) result as positive zero
+  d = ifthen(d == RV(-0.0), RV(0.0), d);
   // Algorithm taken from SLEEF 2.80
   return (mulsign(atan2k(sqrt((RV(1.0) + d) * (RV(1.0) - d)), fabs(d)), d) +
           ifthen(d < RV(0.0), RV(M_PI), RV(0.0)));
diff --git a/lib/kernel/vecmathlib/mathfuncs_fabs.h b/lib/kernel/vecmathlib/mathfuncs_fabs.h
index c3f7356..fd83292 100644
--- a/lib/kernel/vecmathlib/mathfuncs_fabs.h
+++ b/lib/kernel/vecmathlib/mathfuncs_fabs.h
@@ -24,7 +24,12 @@ realvec_t mathfuncs<realvec_t>::vml_fabs(realvec_t x) {
 template <typename realvec_t>
 realvec_t mathfuncs<realvec_t>::vml_fdim(realvec_t x, realvec_t y) {
   // return ifthen(x > y, x - y, RV(0.0));
-  return fmax(x - y, RV(0.0));
+  realvec_t res = fmax(x - y, RV(0.0));
+#if defined VML_HAVE_NAN
+  res = ifthen(isnan(x), RV(NAN), res);
+  res = ifthen(isnan(y), RV(NAN), res);
+#endif
+  return res;
 }
 
 template <typename realvec_t>
@@ -34,12 +39,26 @@ realvec_t mathfuncs<realvec_t>::vml_fma(realvec_t x, realvec_t y, realvec_t z) {
 
 template <typename realvec_t>
 realvec_t mathfuncs<realvec_t>::vml_fmax(realvec_t x, realvec_t y) {
-  return ifthen(x < y, y, x);
+#if defined VML_HAVE_NAN
+  realvec_t notnan = ifthen(isnan(x), y, x);
+  notnan = ifthen(isnan(y), x, notnan);
+  realvec_t res = ifthen(x < y, y, x);
+  return ifthen(isnormal(res), res, notnan);
+#else
+  return ifthen(x > y, x, y);
+#endif
 }
 
 template <typename realvec_t>
 realvec_t mathfuncs<realvec_t>::vml_fmin(realvec_t x, realvec_t y) {
-  return ifthen(y < x, y, x);
+#if defined VML_HAVE_NAN
+  realvec_t notnan = ifthen(isnan(x), y, x);
+  notnan = ifthen(isnan(y), x, notnan);
+  realvec_t res = ifthen(x < y, x, y);
+  return ifthen(isnormal(res), res, notnan);
+#else
+  return ifthen(x < y, x, y);
+#endif
 }
 
 template <typename realvec_t>
diff --git a/lib/kernel/vecmathlib/vec_avx_double4.h b/lib/kernel/vecmathlib/vec_avx_double4.h
index f01e74c..388dfd3 100644
--- a/lib/kernel/vecmathlib/vec_avx_double4.h
+++ b/lib/kernel/vecmathlib/vec_avx_double4.h
@@ -10,7 +10,7 @@
 #include <cmath>
 
 // AVX intrinsics
-#include <immintrin.h>
+#include <x86intrin.h>
 
 namespace vecmathlib {
 
@@ -583,10 +583,37 @@ template <> struct realvec<double, 4> : floatprops<double> {
   realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
   realvec_t floor() const { return _mm256_floor_pd(v); }
   realvec_t fma(realvec_t y, realvec_t z) const {
+#if defined(__FMA4__)
+    realvec_t x = *this;
+    return _mm256_macc_pd(x, y, z);
+#elif defined(__FMA__)
+    realvec_t x = *this;
+    return _mm256_fmadd_pd(x, y, z);
+#else
     return MF::vml_fma(*this, y, z);
+#endif
   }
-  realvec_t fmax(realvec_t y) const { return _mm256_max_pd(v, y.v); }
-  realvec_t fmin(realvec_t y) const { return _mm256_min_pd(v, y.v); }
+  /* OpenCL spec: if one argument is NaN, return the second
+   * instructions: if any argument is NaN, return the second
+   * ... so we must take care of (x, NaN) arguments case
+   */
+  realvec_t fmax(realvec_t y) const {
+    realvec_t res = _mm256_max_pd(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
+  }
+  realvec_t fmin(realvec_t y) const {
+    realvec_t res = _mm256_min_pd(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
+  }
+
   realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
   realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
   realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
diff --git a/lib/kernel/vecmathlib/vec_avx_float8.h b/lib/kernel/vecmathlib/vec_avx_float8.h
index f119aee..0da935d 100644
--- a/lib/kernel/vecmathlib/vec_avx_float8.h
+++ b/lib/kernel/vecmathlib/vec_avx_float8.h
@@ -10,7 +10,7 @@
 #include <cmath>
 
 // AVX intrinsics
-#include <immintrin.h>
+#include <x86intrin.h>
 
 namespace vecmathlib {
 
@@ -575,11 +575,35 @@ template <> struct realvec<float, 8> : floatprops<float> {
   realvec_t fabs() const { return MF::vml_fabs(*this); }
   realvec_t fdim(realvec_t y) const { return MF::vml_fdim(*this, y); }
   realvec_t floor() const { return _mm256_floor_ps(v); }
+
   realvec_t fma(realvec_t y, realvec_t z) const {
+#if defined(__FMA4__)
+    realvec_t x = *this;
+    return _mm256_macc_ps(x, y, z);
+#elif defined(__FMA__)
+    realvec_t x = *this;
+    return _mm256_fmadd_ps(x, y, z);
+#else
     return MF::vml_fma(*this, y, z);
+#endif
+  }
+
+  realvec_t fmax(realvec_t y) const {
+    realvec_t res = _mm256_max_ps(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
+  }
+  realvec_t fmin(realvec_t y) const {
+    realvec_t res = _mm256_min_ps(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
   }
-  realvec_t fmax(realvec_t y) const { return _mm256_max_ps(v, y.v); }
-  realvec_t fmin(realvec_t y) const { return _mm256_min_ps(v, y.v); }
   realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
   realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
   realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
diff --git a/lib/kernel/vecmathlib/vec_avx_fp16_16.h b/lib/kernel/vecmathlib/vec_avx_fp16_16.h
index 8dadf64..7f92933 100644
--- a/lib/kernel/vecmathlib/vec_avx_fp16_16.h
+++ b/lib/kernel/vecmathlib/vec_avx_fp16_16.h
@@ -300,7 +300,7 @@ template <> struct intvec<fp16, 16> : floatprops<fp16> {
   }
   intvec operator>>(intvec n) const {
 #ifdef __AVX2__
-    intvec_t offset = U(1) << (bits - 1);
+    intvec_t offset = (intvec_t)1 << (bits - 1);
     return (*this + offset).lsr(n) - offset.lsr(n);
 #else
     intvec r;
diff --git a/lib/kernel/vecmathlib/vec_avx_fp8_32.h b/lib/kernel/vecmathlib/vec_avx_fp8_32.h
index 0ae79e7..6fa3077 100644
--- a/lib/kernel/vecmathlib/vec_avx_fp8_32.h
+++ b/lib/kernel/vecmathlib/vec_avx_fp8_32.h
@@ -272,7 +272,7 @@ template <> struct intvec<fp8, 32> : floatprops<fp8> {
 #ifdef __AVX2__
     // There is no _mm256_srai_epi8. To emulate it, add 0x80 before
     // shifting, and subtract the shifted 0x80 after shifting
-    intvec_t offset = U(1) << (bits - 1);
+    intvec_t offset = (intvec_t)1 << (bits - 1);
     return (*this + offset).lsr(n) - offset.lsr(n);
 #else
     __m128i vlo = _mm256_castsi256_si128(v);
diff --git a/lib/kernel/vecmathlib/vec_sse_double1.h b/lib/kernel/vecmathlib/vec_sse_double1.h
index d727de8..6e8d4dd 100644
--- a/lib/kernel/vecmathlib/vec_sse_double1.h
+++ b/lib/kernel/vecmathlib/vec_sse_double1.h
@@ -10,20 +10,7 @@
 #include <cmath>
 #include <climits>
 
-// SSE2 intrinsics
-#include <emmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-#include <pmmintrin.h>
-#endif
-#ifdef __SSE4_1__ // Intel's SSE 4.1
-#include <smmintrin.h>
-#endif
-#ifdef __SSE4A__ // AMD's SSE 4a
-#include <ammintrin.h>
-#endif
-#if defined __AVX__ // Intel's AVX
-#include <immintrin.h>
-#endif
+#include <x86intrin.h>
 
 namespace vecmathlib {
 
@@ -360,13 +347,31 @@ public:
 #endif
   }
   realvec_t fma(realvec_t y, realvec_t z) const {
+#if defined(__FMA4__)
+    return to_double(
+        _mm_macc_sd(from_double(v), from_double(y.v), from_double(z.v)));
+#elif defined(__FMA__)
+    return to_double(
+        _mm_fmadd_sd(from_double(v), from_double(y.v), from_double(z.v)));
+#else
     return MF::vml_fma(*this, y, z);
+#endif
   }
   realvec_t fmax(realvec_t y) const {
-    return to_double(_mm_max_sd(from_double(v), from_double(y.v)));
+    realvec_t res = to_double(_mm_max_sd(from_double(v), from_double(y.v)));
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
   }
   realvec_t fmin(realvec_t y) const {
-    return to_double(_mm_min_sd(from_double(v), from_double(y.v)));
+    realvec_t res = to_double(_mm_min_sd(from_double(v), from_double(y.v)));
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
   }
   realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
   realvec_t frexp(intvec_t *irp) const {
diff --git a/lib/kernel/vecmathlib/vec_sse_double2.h b/lib/kernel/vecmathlib/vec_sse_double2.h
index 095f458..11e8ee7 100644
--- a/lib/kernel/vecmathlib/vec_sse_double2.h
+++ b/lib/kernel/vecmathlib/vec_sse_double2.h
@@ -10,19 +10,7 @@
 #include <cmath>
 
 // SSE2 intrinsics
-#include <emmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-#include <pmmintrin.h>
-#endif
-#ifdef __SSE4_1__ // Intel's SSE 4.1
-#include <smmintrin.h>
-#endif
-#ifdef __SSE4A__ // AMD's SSE 4a
-#include <ammintrin.h>
-#endif
-#if defined __AVX__ // Intel's AVX
-#include <immintrin.h>
-#endif
+#include <x86intrin.h>
 
 namespace vecmathlib {
 
@@ -473,10 +461,36 @@ template <> struct realvec<double, 2> : floatprops<double> {
 #endif
   }
   realvec_t fma(realvec_t y, realvec_t z) const {
+#if defined(__FMA4__)
+    realvec_t x = *this;
+    return _mm_macc_pd(x, y, z);
+#elif defined(__FMA__)
+    realvec_t x = *this;
+    return _mm_fmadd_pd(x, y, z);
+#else
     return MF::vml_fma(*this, y, z);
+#endif
+  }
+  /* OpenCL spec: if one argument is NaN, return the second
+   * instructions: if any argument is NaN, return the second
+   * ... so we must take care of (x, NaN) arguments case
+   */
+  realvec_t fmax(realvec_t y) const {
+    realvec_t res = _mm_max_pd(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
+  }
+  realvec_t fmin(realvec_t y) const {
+    realvec_t res = _mm_min_pd(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
   }
-  realvec_t fmax(realvec_t y) const { return _mm_max_pd(v, y.v); }
-  realvec_t fmin(realvec_t y) const { return _mm_min_pd(v, y.v); }
   realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
   realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
   realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
diff --git a/lib/kernel/vecmathlib/vec_sse_float1.h b/lib/kernel/vecmathlib/vec_sse_float1.h
index a84a046..19c3ad0 100644
--- a/lib/kernel/vecmathlib/vec_sse_float1.h
+++ b/lib/kernel/vecmathlib/vec_sse_float1.h
@@ -11,19 +11,7 @@
 #include <climits>
 
 // SSE2 intrinsics
-#include <emmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-#include <pmmintrin.h>
-#endif
-#ifdef __SSE4_1__ // Intel's SSE 4.1
-#include <smmintrin.h>
-#endif
-#ifdef __SSE4A__ // AMD's SSE 4a
-#include <ammintrin.h>
-#endif
-#if defined __AVX__ // Intel's AVX
-#include <immintrin.h>
-#endif
+#include <x86intrin.h>
 
 namespace vecmathlib {
 
@@ -357,13 +345,31 @@ public:
 #endif
   }
   realvec_t fma(realvec_t y, realvec_t z) const {
+#if defined(__FMA4__)
+    return to_float(
+        _mm_macc_ss(from_float(v), from_float(y.v), from_float(z.v)));
+#elif defined(__FMA__)
+    return to_float(
+        _mm_fmadd_ss(from_float(v), from_float(y.v), from_float(z.v)));
+#else
     return MF::vml_fma(*this, y, z);
+#endif
   }
   realvec_t fmax(realvec_t y) const {
-    return to_float(_mm_max_ss(from_float(v), from_float(y.v)));
+    realvec_t res = to_float(_mm_max_ss(from_float(v), from_float(y.v)));
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
   }
   realvec_t fmin(realvec_t y) const {
-    return to_float(_mm_min_ss(from_float(v), from_float(y.v)));
+    realvec_t res = to_float(_mm_min_ss(from_float(v), from_float(y.v)));
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
   }
   realvec_t fmod(realvec_t y) const { return vml_std::fmod(v, y.v); }
   realvec_t frexp(intvec_t *irp) const {
diff --git a/lib/kernel/vecmathlib/vec_sse_float4.h b/lib/kernel/vecmathlib/vec_sse_float4.h
index f8e8e80..5813b5c 100644
--- a/lib/kernel/vecmathlib/vec_sse_float4.h
+++ b/lib/kernel/vecmathlib/vec_sse_float4.h
@@ -10,22 +10,7 @@
 #include <cmath>
 
 // SSE2 intrinsics
-#include <xmmintrin.h>
-#ifdef __SSE3__ // Intel's SSE 3
-#include <pmmintrin.h>
-#endif
-#ifdef __SSSE3__ // Intel's SSSE 3
-#include <tmmintrin.h>
-#endif
-#if defined __SSE4_1__ // Intel's SSE 4.1
-#include <smmintrin.h>
-#endif
-#if defined __SSE4A__ // AMD's SSE 4a
-#include <ammintrin.h>
-#endif
-#if defined __AVX__ // Intel's AVX
-#include <immintrin.h>
-#endif
+#include <x86intrin.h>
 
 namespace vecmathlib {
 
@@ -497,10 +482,37 @@ template <> struct realvec<float, 4> : floatprops<float> {
 #endif
   }
   realvec_t fma(realvec_t y, realvec_t z) const {
+#if defined(__FMA4__)
+    realvec_t x = *this;
+    return _mm_macc_ps(x, y, z);
+#elif defined(__FMA__)
+    realvec_t x = *this;
+    return _mm_fmadd_ps(x, y, z);
+#else
     return MF::vml_fma(*this, y, z);
+#endif
   }
-  realvec_t fmax(realvec_t y) const { return _mm_max_ps(v, y.v); }
-  realvec_t fmin(realvec_t y) const { return _mm_min_ps(v, y.v); }
+  /* OpenCL spec: if one argument is NaN, return the second
+   * instructions: if any argument is NaN, return the second
+   * ... so we must take care of (x, NaN) arguments case
+   */
+  realvec_t fmax(realvec_t y) const {
+    realvec_t res = _mm_max_ps(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
+  }
+  realvec_t fmin(realvec_t y) const {
+    realvec_t res = _mm_min_ps(v, y.v);
+#if defined VML_HAVE_NAN
+    return y.isnan().ifthen(v, res);
+#else
+    return res;
+#endif
+  }
+
   realvec_t fmod(realvec_t y) const { return MF::vml_fmod(*this, y); }
   realvec_t frexp(intvec_t *r) const { return MF::vml_frexp(*this, r); }
   realvec_t hypot(realvec_t y) const { return MF::vml_hypot(*this, y); }
diff --git a/lib/kernel/vload_half.cl b/lib/kernel/vload_half.cl
index a3705ab..1e79868 100644
--- a/lib/kernel/vload_half.cl
+++ b/lib/kernel/vload_half.cl
@@ -1,6 +1,7 @@
 /* OpenCL built-in library: vload_half()
 
    Copyright (c) 2011 Universidad Rey Juan Carlos
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
    
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -21,134 +22,189 @@
    THE SOFTWARE.
 */
 
+/* The following function is rewritten to OpenCL from a
+ * FP16 header-only library using The MIT License (MIT)
+ *
+ * https://github.com/Maratyszcza/FP16
+ */
 
-
-#ifdef cl_khr_fp16
-
-
-
-/*
-  half:        1 sign bit,  5 exponent bits,  10 mantissa bits, exponent offset 15
-  float:       1 sign bit,  8 exponent bits,  23 mantissa bits, exponent offset 127
-  double:      1 sign bit, 10 exponent bits,  53 mantissa bits, exponent offset 1023
-  long double: 1 sign bit, 15 exponent bits, 112 mantissa bits, exponent offset 16383
-*/
-
-// Clang supports "half" only on ARM
-// TODO: Create autoconf test for this
-#ifdef __ARM_ARCH
-
-float _cl_half2float(ushort hval)
+float
+_cl_half2float (ushort h)
 {
-  return *(const half*)&hval;
+  const uint w = convert_uint(h) << 16;
+  const uint sign = w & (uint)(0x80000000);
+  const uint two_w = w + w;
+  const uint exp_offset = (uint)(0xE0) << 23;
+  const float exp_scale = 0x1.0p-112f;
+  const float normalized_value = as_float((two_w >> 4) + exp_offset) * exp_scale;
+  const uint magic_mask = (uint)(126) << 23;
+  const float magic_bias = 0.5f;
+  const float denormalized_value = as_float((two_w >> 17) | magic_mask) - magic_bias;
+  const uint denormalized_cutoff = (uint)(1) << 27;
+  const uint result = sign |
+             (two_w < denormalized_cutoff ? as_uint(denormalized_value) : as_uint(normalized_value));
+  return as_float(result);
 }
 
+#ifdef __F16C__
+
+float4 _cl_half2float4 (const ushort4 data);
+float8 _cl_half2float8 (const ushort8 data);
+
+#define IMPLEMENT_VLOAD_HALF(MOD)                                             \
+                                                                              \
+  float _CL_OVERLOADABLE vload_half (size_t offset, const MOD half *p)        \
+  {                                                                           \
+    return _cl_half2float (((const MOD ushort *)p)[offset]);                  \
+  }                                                                           \
+                                                                              \
+  float2 _CL_OVERLOADABLE vload_half2 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return (float2) (vload_half (offset * 2, p),                              \
+                     vload_half (offset * 2 + 1, p));                         \
+  }                                                                           \
+                                                                              \
+  float3 _CL_OVERLOADABLE vload_half3 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return (float3) (vload_half (offset * 3, p),                              \
+                     vload_half (offset * 3 + 1, p),                          \
+                     vload_half (offset * 3 + 2, p));                         \
+  }                                                                           \
+                                                                              \
+  float4 _CL_OVERLOADABLE vload_half4 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return _cl_half2float4 (((const MOD ushort4 *)p)[offset]);                \
+  }                                                                           \
+                                                                              \
+  float8 _CL_OVERLOADABLE vload_half8 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return _cl_half2float8 (((const MOD ushort8 *)p)[offset]);                \
+  }                                                                           \
+                                                                              \
+  float16 _CL_OVERLOADABLE vload_half16 (size_t offset, const MOD half *p)    \
+  {                                                                           \
+    float8 hi = _cl_half2float8 (((const MOD ushort8 *)p)[offset * 2]);       \
+    float8 lo = _cl_half2float8 (((const MOD ushort8 *)p)[offset * 2 + 1]);   \
+    return (float16) (hi, lo);                                                \
+  }                                                                           \
+                                                                              \
+  float _CL_OVERLOADABLE vloada_half (size_t offset, const MOD half *p)       \
+  {                                                                           \
+    return _cl_half2float (((const MOD ushort *)p)[offset]);                  \
+  }                                                                           \
+                                                                              \
+  float2 _CL_OVERLOADABLE vloada_half2 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    return (float2) (vloada_half (offset * 2, p),                             \
+                     vloada_half (offset * 2, p + 1));                        \
+  }                                                                           \
+                                                                              \
+  float3 _CL_OVERLOADABLE vloada_half3 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    float4 tmp = vloada_half4 (offset, p);                                    \
+    return (float3) (tmp.xyz);                                                \
+  }                                                                           \
+                                                                              \
+  float4 _CL_OVERLOADABLE vloada_half4 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    return _cl_half2float4 (((const MOD ushort4 *)p)[offset]);                \
+  }                                                                           \
+                                                                              \
+  float8 _CL_OVERLOADABLE vloada_half8 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    return _cl_half2float8 (((const MOD ushort8 *)p)[offset]);                \
+  }                                                                           \
+                                                                              \
+  float16 _CL_OVERLOADABLE vloada_half16 (size_t offset, const MOD half *p)   \
+  {                                                                           \
+    float8 hi = _cl_half2float8 (((const MOD ushort8 *)p)[offset * 2]);       \
+    float8 lo = _cl_half2float8 (((const MOD ushort8 *)p)[offset * 2 + 1]);   \
+    return (float16) (hi, lo);                                                \
+  }                                                                           \
+                                                                              \
+// __F16C__
 #else
 
-float _cl_half2float(ushort hval)
-{
-  ushort hsign = (hval & (ushort)0x8000) >> (ushort)15;
-  ushort hexp = (hval & (ushort)0x7c00) >> (ushort)10;
-  ushort hmant = hval & (ushort)0x03ff;
-  bool isdenorm = hexp == (ushort)0;
-  bool isinfnan = hexp == (ushort)31;
-  hexp -= (ushort)15;
-  uint fsign = (uint)hsign << 31U;
-  uint fexp = (__builtin_expect(isdenorm, false) ? 0U :
-               __builtin_expect(isinfnan, false) ? 255U : (uint)hexp + 127U);
-  fexp <<= 23U;
-  uint fmant = (uint)hmant << 13U;
-  uint fval = fsign | fexp | fmant;
-  return as_float(fval);
-}
+#define IMPLEMENT_VLOAD_HALF(MOD)                                             \
+                                                                              \
+  float _CL_OVERLOADABLE vload_half (size_t offset, const MOD half *p)        \
+  {                                                                           \
+    ushort h = ((const MOD ushort *)p)[offset];                               \
+    return _cl_half2float (h);                                                \
+  }                                                                           \
+                                                                              \
+  float2 _CL_OVERLOADABLE vload_half2 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return (float2) (vload_half (offset * 2, p),                              \
+                     vload_half (offset * 2 + 1, p));                         \
+  }                                                                           \
+                                                                              \
+  float3 _CL_OVERLOADABLE vload_half3 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return (float3) (vload_half (offset * 3, p),                              \
+                     vload_half (offset * 3 + 1, p),                          \
+                     vload_half (offset * 3 + 2, p));                         \
+  }                                                                           \
+                                                                              \
+  float4 _CL_OVERLOADABLE vload_half4 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return (float4) (vload_half2 (offset * 2, p),                             \
+                     vload_half2 (offset * 2 + 1, p));                        \
+  }                                                                           \
+                                                                              \
+  float8 _CL_OVERLOADABLE vload_half8 (size_t offset, const MOD half *p)      \
+  {                                                                           \
+    return (float8) (vload_half4 (offset * 2, p),                             \
+                     vload_half4 (offset * 2 + 1, p));                        \
+  }                                                                           \
+                                                                              \
+  float16 _CL_OVERLOADABLE vload_half16 (size_t offset, const MOD half *p)    \
+  {                                                                           \
+    return (float16) (vload_half8 (offset * 2, p),                            \
+                      vload_half8 (offset * 2 + 1, p));                       \
+  }                                                                           \
+                                                                              \
+  float _CL_OVERLOADABLE vloada_half (size_t offset, const MOD half *p)       \
+  {                                                                           \
+    return _cl_half2float (((const MOD ushort *)p)[offset]);                  \
+  }                                                                           \
+                                                                              \
+  float2 _CL_OVERLOADABLE vloada_half2 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    return (float2) (vloada_half (offset * 2, p),                             \
+                     vloada_half (offset * 2 + 1, p));                        \
+  }                                                                           \
+                                                                              \
+  float3 _CL_OVERLOADABLE vloada_half3 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    float4 tmp = vloada_half4 (offset, p);                                    \
+    return (float3) (tmp.xyz);                                                \
+  }                                                                           \
+                                                                              \
+  float4 _CL_OVERLOADABLE vloada_half4 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    return (float4) (vloada_half2 (offset * 2, p),                            \
+                     vloada_half2 (offset * 2 + 1, p));                       \
+  }                                                                           \
+                                                                              \
+  float8 _CL_OVERLOADABLE vloada_half8 (size_t offset, const MOD half *p)     \
+  {                                                                           \
+    return (float8) (vloada_half4 (offset * 2, p),                            \
+                     vloada_half4 (offset * 2 + 1, p));                       \
+  }                                                                           \
+                                                                              \
+  float16 _CL_OVERLOADABLE vloada_half16 (size_t offset, const MOD half *p)   \
+  {                                                                           \
+    return (float16) (vloada_half8 (offset * 2, p),                           \
+                      vloada_half8 (offset * 2 + 1, p));                      \
+  }
 
 #endif
 
 
 
-#define IMPLEMENT_VLOAD_HALF(MOD)                               \
-                                                                \
-  float _CL_OVERLOADABLE                                        \
-  vload_half(size_t offset, const MOD half *p)                  \
-  {                                                             \
-    return _cl_half2float(((const MOD ushort*)p)[offset]);      \
-  }                                                             \
-                                                                \
-  float2 _CL_OVERLOADABLE                                       \
-  vload_half2(size_t offset, const MOD half *p)                 \
-  {                                                             \
-    return (float2)(vload_half(0, &p[offset*2]),                \
-                    vload_half(0, &p[offset*2+1]));             \
-  }                                                             \
-                                                                \
-  float3 _CL_OVERLOADABLE                                       \
-  vload_half3(size_t offset, const MOD half *p)                 \
-  {                                                             \
-    return (float3)(vload_half2(0, &p[offset*3]),               \
-                    vload_half(0, &p[offset*3+2]));             \
-  }                                                             \
-                                                                \
-  float4 _CL_OVERLOADABLE                                       \
-  vload_half4(size_t offset, const MOD half *p)                 \
-  {                                                             \
-    return (float4)(vload_half2(0, &p[offset*4]),               \
-                    vload_half2(0, &p[offset*4+2]));            \
-  }                                                             \
-                                                                \
-  float8 _CL_OVERLOADABLE                                       \
-  vload_half8(size_t offset, const MOD half *p)                 \
-  {                                                             \
-    return (float8)(vload_half4(0, &p[offset*8]),               \
-                    vload_half4(0, &p[offset*8+4]));            \
-  }                                                             \
-                                                                \
-  float16 _CL_OVERLOADABLE                                      \
-  vload_half16(size_t offset, const MOD half *p)                \
-  {                                                             \
-    return (float16)(vload_half8(0, &p[offset*16]),             \
-                     vload_half8(0, &p[offset*16+8]));          \
-  }                                                             \
-                                                                \
-  float2 _CL_OVERLOADABLE                                       \
-  vloada_half2(size_t offset, const MOD half *p)                \
-  {                                                             \
-    return (float2)(vload_half(0, &p[offset*2]),                \
-                    vload_half(0, &p[offset*2+1]));             \
-  }                                                             \
-                                                                \
-  float3 _CL_OVERLOADABLE                                       \
-  vloada_half3(size_t offset, const MOD half *p)                \
-  {                                                             \
-    return (float3)(vloada_half2(0, &p[offset*4]),              \
-                    vload_half(0, &p[offset*4+2]));             \
-  }                                                             \
-                                                                \
-  float4 _CL_OVERLOADABLE                                       \
-  vloada_half4(size_t offset, const MOD half *p)                \
-  {                                                             \
-    return (float4)(vloada_half2(0, &p[offset*4]),              \
-                    vloada_half2(0, &p[offset*4+2]));           \
-  }                                                             \
-                                                                \
-  float8 _CL_OVERLOADABLE                                       \
-  vloada_half8(size_t offset, const MOD half *p)                \
-  {                                                             \
-    return (float8)(vloada_half4(0, &p[offset*8]),              \
-                    vloada_half4(0, &p[offset*8+4]));           \
-  }                                                             \
-                                                                \
-  float16 _CL_OVERLOADABLE                                      \
-  vloada_half16(size_t offset, const MOD half *p)               \
-  {                                                             \
-    return (float16)(vloada_half8(0, &p[offset*16]),            \
-                     vloada_half8(0, &p[offset*16+8]));         \
-  }
-
-
 
 IMPLEMENT_VLOAD_HALF(__global)
 IMPLEMENT_VLOAD_HALF(__local)
 IMPLEMENT_VLOAD_HALF(__constant)
 IMPLEMENT_VLOAD_HALF(__private)
-
-#endif
diff --git a/lib/kernel/vload_store_half_f16c.c b/lib/kernel/vload_store_half_f16c.c
new file mode 100644
index 0000000..a494380
--- /dev/null
+++ b/lib/kernel/vload_store_half_f16c.c
@@ -0,0 +1,218 @@
+/* OpenCL built-in library: vload_store_half_f16c()
+
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+/* Accelerated float-half conversions on x86-64 using
+ * these builtins from Clang:
+ *    BUILTIN(__builtin_ia32_vcvtps2ph, "V8sV4fIi", "")
+ *    BUILTIN(__builtin_ia32_vcvtps2ph256, "V8sV8fIi", "")
+ *    BUILTIN(__builtin_ia32_vcvtph2ps, "V4fV8s", "")
+ *    BUILTIN(__builtin_ia32_vcvtph2ps256, "V8fV8s", "")
+ *    _mm_cvtps_ph(a, int);
+ *    _mm256_cvtps_ph(a, int);
+ *    _mm_cvtph_ps(a);
+ *    _mm256_cvtph_ps(a);
+ */
+
+/* TODO
+ * If case of a denormal operand,
+ * the correct normal result is returned.
+ * MXCSR.DAZ is ignored and is treated as if it 0.
+ * No denormal exception is reported on MXCSR.
+ */
+
+/* Clang defines the __F16C__ macro for x86 cpus which support F16C extension */
+
+#ifdef __F16C__
+
+
+
+
+/** FLOAT -> HALF vec4 ************************************************/
+
+typedef union
+{
+  float4 i;
+  float4 low, hi;
+} f2h4_i;
+
+typedef union
+{
+  short8 o;
+  ushort4 low, hi;
+} f2h4_o;
+
+ushort4
+_cl_float2half4_rte (const float4 data)
+{
+  f2h4_i ui;
+  f2h4_o uo;
+  ui.low = data;
+  uo.o = __builtin_ia32_vcvtps2ph (ui.i, 0);
+  return uo.low;
+}
+
+ushort4
+_cl_float2half4_rtn (const float4 data)
+{
+  f2h4_i ui;
+  f2h4_o uo;
+  ui.low = data;
+  uo.o = __builtin_ia32_vcvtps2ph (ui.i, 1);
+  return uo.low;
+}
+
+ushort4
+_cl_float2half4_rtp (const float4 data)
+{
+  f2h4_i ui;
+  f2h4_o uo;
+  ui.low = data;
+  uo.o = __builtin_ia32_vcvtps2ph (ui.i, 2);
+  return uo.low;
+}
+
+ushort4
+_cl_float2half4_rtz (const float4 data)
+{
+  f2h4_i ui;
+  f2h4_o uo;
+  ui.low = data;
+  uo.o = __builtin_ia32_vcvtps2ph (ui.i, 3);
+  return uo.low;
+}
+
+ushort4
+_cl_float2half4 (const float4 data)
+{
+  return _cl_float2half4_rte (data);
+}
+
+/** FLOAT -> HALF vec8 ************************************************/
+
+typedef union
+{
+  float8 i;
+  float8 f;
+} f2h8_i;
+
+typedef union
+{
+  ushort8 o;
+  ushort8 f;
+} f2h8_o;
+
+ushort8
+_cl_float2half8_rte (const float8 data)
+{
+  f2h8_i ui;
+  f2h8_o uo;
+  ui.f = data;
+  uo.o = __builtin_ia32_vcvtps2ph256 (ui.i, 0);
+  return uo.f;
+}
+
+ushort8
+_cl_float2half8_rtn (const float8 data)
+{
+  f2h8_i ui;
+  f2h8_o uo;
+  ui.f = data;
+  uo.o = __builtin_ia32_vcvtps2ph256 (ui.i, 1);
+  return uo.f;
+}
+
+ushort8
+_cl_float2half8_rtp (const float8 data)
+{
+  f2h8_i ui;
+  f2h8_o uo;
+  ui.f = data;
+  uo.o = __builtin_ia32_vcvtps2ph256 (ui.i, 2);
+  return uo.f;
+}
+
+ushort8
+_cl_float2half8_rtz (const float8 data)
+{
+  f2h8_i ui;
+  f2h8_o uo;
+  ui.f = data;
+  uo.o = __builtin_ia32_vcvtps2ph256 (ui.i, 3);
+  return uo.f;
+}
+
+ushort8
+_cl_float2half8 (const float8 data)
+{
+  return _cl_float2half8_rte (data);
+}
+
+/** HALF -> FLOAT vec4 ************************************************/
+
+typedef union
+{
+  short8 i;
+  ushort4 low, hi;
+} h2f4_i;
+
+typedef union
+{
+  float4 o;
+  float4 f;
+} h2f4_o;
+
+float4
+_cl_half2float4 (const ushort4 data)
+{
+  h2f4_i ui;
+  h2f4_o uo;
+  ui.low = data;
+  uo.o = __builtin_ia32_vcvtph2ps (ui.i);
+  return uo.f;
+}
+
+/** HALF -> FLOAT vec8 ************************************************/
+
+typedef union
+{
+  short8 i;
+  ushort8 u;
+} h2f8_i;
+
+typedef union
+{
+  float8 o;
+  float8 f;
+} h2f8_o;
+
+float8
+_cl_half2float8 (const ushort8 data)
+{
+  h2f8_i ui;
+  h2f8_o uo;
+  ui.u = data;
+  uo.o = __builtin_ia32_vcvtph2ps256 (ui.i);
+  return uo.f;
+}
+
+#endif
diff --git a/lib/kernel/vstore_half.cl b/lib/kernel/vstore_half.cl
index ea6a112..3412a98 100644
--- a/lib/kernel/vstore_half.cl
+++ b/lib/kernel/vstore_half.cl
@@ -1,17 +1,18 @@
 /* OpenCL built-in library: vstore_half()
 
    Copyright (c) 2011 Universidad Rey Juan Carlos
-   
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
+
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
    in the Software without restriction, including without limitation the rights
    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    copies of the Software, and to permit persons to whom the Software is
    furnished to do so, subject to the following conditions:
-   
+
    The above copyright notice and this permission notice shall be included in
    all copies or substantial portions of the Software.
-   
+
    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,163 +22,602 @@
    THE SOFTWARE.
 */
 
+/* The following code is taken & adapted from half library @
+ * http://half.sourceforge.net
+ *
+ * half - IEEE 754-based half-precision floating point library.
+ *
+ * Copyright (c) 2012-2017 Christian Rau <rauy at users.sourceforge.net>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation
+ * files (the "Software"), to deal in the Software without restriction,
+ * including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies of the
+ * Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+ * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
 
+#define ROUND_TOWARD_INFINITY 1
+#define ROUND_TOWARD_NEG_INFINITY 2
+#define ROUND_TOWARD_ZERO 3
+#define ROUND_TO_NEAREST 4
 
-#ifdef cl_khr_fp16
+static ushort
+_cl_float2half_round (float f, int round_mode)
+{
+  uint bits = as_uint (f);
+  ushort hbits = (bits >> 16) & 0x8000;
+  bits &= 0x7FFFFFFF;
+  int exp = bits >> 23;
+  if (exp == 255)
+    {
+      ushort temp = (((bits & 0x7FFFFF) != 0) ? 0x03FF : 0x0);
+      return (hbits | 0x7C00 | temp);
+    }
+  if (exp > 142)
+    {
+      if (round_mode == ROUND_TOWARD_INFINITY)
+        return hbits | 0x7C00 - (hbits >> 15);
+      if (round_mode == ROUND_TOWARD_NEG_INFINITY)
+        return hbits | 0x7BFF + (hbits >> 15);
+      return hbits | 0x7BFF + (round_mode != ROUND_TOWARD_ZERO);
+    }
+  int g, s;
+  if (exp > 112)
+    {
+      g = (bits >> 12) & 1;
+      s = (bits & 0xFFF) != 0;
+      hbits |= ((exp - 112) << 10) | ((bits >> 13) & 0x3FF);
+    }
+  else if (exp > 101)
+    {
+      int i = 125 - exp;
+      bits = (bits & 0x7FFFFF) | 0x800000;
+      g = (bits >> i) & 1;
+      s = (bits & ((1L << i) - 1)) != 0;
+      hbits |= bits >> (i + 1);
+    }
+  else
+    {
+      g = 0;
+      s = bits != 0;
+    }
+  if (round_mode == ROUND_TO_NEAREST)
+    hbits += g & (s | hbits);
+  else if (round_mode == ROUND_TOWARD_INFINITY)
+    hbits += ~(hbits >> 15) & (s | g);
+  else if (round_mode == ROUND_TOWARD_NEG_INFINITY)
+    hbits += (hbits >> 15) & (g | s);
+  return hbits;
+}
 
+static ushort
+_cl_float2half (float d)
+{
+  return _cl_float2half_round (d, ROUND_TO_NEAREST);
+}
 
+static ushort
+_cl_float2half_rte (float d)
+{
+  return _cl_float2half_round (d, ROUND_TO_NEAREST);
+}
 
-/*
-  half:        1 sign bit,  5 exponent bits,  10 mantissa bits, exponent offset 15
-  float:       1 sign bit,  8 exponent bits,  23 mantissa bits, exponent offset 127
-  double:      1 sign bit, 10 exponent bits,  53 mantissa bits, exponent offset 1023
-  long double: 1 sign bit, 15 exponent bits, 112 mantissa bits, exponent offset 16383
-*/
+static ushort
+_cl_float2half_rtz (float d)
+{
+  return _cl_float2half_round (d, ROUND_TOWARD_ZERO);
+}
 
-// Clang supports "half" only on ARM
-// TODO: Create autoconf test for this
-#ifdef __ARM_ARCH
+static ushort
+_cl_float2half_rtn (float d)
+{
+  return _cl_float2half_round (d, ROUND_TOWARD_NEG_INFINITY);
+}
 
-ushort _cl_float2half(float data)
+static ushort
+_cl_float2half_rtp (float d)
 {
-  half hdata = data;
-  return *(const ushort*)&hdata;
+  return _cl_float2half_round (d, ROUND_TOWARD_INFINITY);
 }
 
-#else
+#ifdef cl_khr_fp64
 
-#define HALF_MAXPLUS 0x1.ffdp15f /* "one more" than HALF_MAX */
-#undef HALF_MIN
-#define HALF_MIN     0x1.0p-14f
-#define HALF_ZERO    ((short)0x0000) /* zero */
-#define HALF_INF     ((short)0x4000) /* infinity */
-#define HALF_SIGN    ((short)0x8000) /* sign bit */
+static ushort
+_cl_double2half_round (double value, int round_mode)
+{
+  ulong bits = as_ulong (value);
+  uint hi = (bits >> 32);
+  uint lo = (bits & 0xFFFFFFFF);
+  ushort hbits = (hi >> 16) & 0x8000;
+  hi &= 0x7FFFFFFF;
+  int exp = hi >> 20;
+  if (exp == 2047)
+    {
+      ushort temp = ((bits & 0xFFFFFFFFFFFFF) != 0 ? 0x03FF : 0x0);
+      return (hbits | 0x7C00 | temp);
+    }
+  if (exp > 1038)
+    {
+      if (round_mode == ROUND_TOWARD_INFINITY)
+        return (hbits | 0x7C00 - (hbits >> 15));
+      if (round_mode == ROUND_TOWARD_NEG_INFINITY)
+        return (hbits | 0x7BFF + (hbits >> 15));
+      return (hbits | 0x7BFF + (round_mode != ROUND_TOWARD_ZERO));
+    }
+  int g;
+  int s = (lo != 0);
+  if (exp > 1008)
+    {
+      g = (hi >> 9) & 1;
+      s |= (hi & 0x1FF) != 0;
+      hbits |= ((exp - 1008) << 10) | ((hi >> 10) & 0x3FF);
+    }
+  else if (exp > 997)
+    {
+      int i = 1018 - exp;
+      hi = (hi & 0xFFFFF) | 0x100000;
+      g = (hi >> i) & 1;
+      s |= (hi & ((1L << i) - 1)) != 0;
+      hbits |= hi >> (i + 1);
+    }
+  else
+    {
+      g = 0;
+      s |= hi != 0;
+    }
+  if (round_mode == ROUND_TO_NEAREST)
+    hbits += g & (s | hbits);
+  else if (round_mode == ROUND_TOWARD_INFINITY)
+    hbits += ~(hbits >> 15) & (s | g);
+  else if (round_mode == ROUND_TOWARD_NEG_INFINITY)
+    hbits += (hbits >> 15) & (g | s);
+  return hbits;
+}
 
-ushort _cl_float2half(float data)
+static ushort
+_cl_double2half (double d)
 {
-  /* IDEA: modify data (e.g. add "1/2") to round correctly */
-  uint fval = as_uint(data);
-  uint fsign = (fval & 0x80000000U) >> 31U;
-  uint fexp = (fval & 0x7f800000U) >> 23U;
-  uint fmant = fval & 0x007fffffU;
-  bool isdenorm = fexp == 0U;
-  bool isinfnan = fexp == 255U;
-  fexp -= 127U;
-  ushort hsign = (ushort)fsign << (ushort)15;
-  ushort hexp = (__builtin_expect(isdenorm, false) ? (ushort)0 :
-                 __builtin_expect(isinfnan, false) ? (ushort)31 :
-                 (ushort)fexp + (ushort)15);
-  /* TODO: this always truncates */
-  ushort hmant = (ushort)(fmant >> 13);
-  ushort hval;
-  if (__builtin_expect(fabs(data) >= HALF_MAXPLUS, false)) {
-    hval = signbit(data)==0 ? HALF_INF : HALF_INF | HALF_SIGN;
-  } else if (__builtin_expect(fabs(data) < HALF_MIN, false)) {
-    hval = signbit(data)==0 ? HALF_ZERO : HALF_ZERO | HALF_SIGN;
-  } else {
-    hval = hsign | hexp | hmant;
-  }
-  return hval;
+  return _cl_double2half_round (d, ROUND_TO_NEAREST);
 }
 
+static ushort
+_cl_double2half_rte (double d)
+{
+  return _cl_double2half_round (d, ROUND_TO_NEAREST);
+}
+
+static ushort
+_cl_double2half_rtz (double d)
+{
+  return _cl_double2half_round (d, ROUND_TOWARD_ZERO);
+}
+
+static ushort
+_cl_double2half_rtn (double d)
+{
+  return _cl_double2half_round (d, ROUND_TOWARD_NEG_INFINITY);
+}
+
+static ushort
+_cl_double2half_rtp (double d)
+{
+  return _cl_double2half_round (d, ROUND_TOWARD_INFINITY);
+}
+
+#endif
+
+#ifdef __F16C__
+
+ushort4 _cl_float2half4 (const float4 data);
+ushort8 _cl_float2half8 (const float8 data);
+ushort4 _cl_float2half4_rte (const float4 data);
+ushort8 _cl_float2half8_rte (const float8 data);
+ushort4 _cl_float2half4_rtn (const float4 data);
+ushort8 _cl_float2half8_rtn (const float8 data);
+ushort4 _cl_float2half4_rtp (const float4 data);
+ushort8 _cl_float2half8_rtp (const float8 data);
+ushort4 _cl_float2half4_rtz (const float4 data);
+ushort8 _cl_float2half8_rtz (const float8 data);
+
+#define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX)                                    \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half##SUFFIX (float data, size_t offset,       \
+                                             MOD half *p)                     \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_float2half##SUFFIX (data);                \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half2##SUFFIX (float2 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.lo, offset * 2, p);                             \
+    vstore_half##SUFFIX (data.hi, offset * 2 + 1, p);                         \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half3##SUFFIX (float3 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.x, offset * 3, p);                              \
+    vstore_half##SUFFIX (data.y, offset * 3 + 1, p);                          \
+    vstore_half##SUFFIX (data.z, offset * 3 + 2, p);                          \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half4##SUFFIX (float4 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort4 *)p)[offset] = _cl_float2half4##SUFFIX (data);              \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half8##SUFFIX (float8 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset] = _cl_float2half8##SUFFIX (data);              \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half16##SUFFIX (float16 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset * 2] = _cl_float2half8##SUFFIX (data.lo);       \
+    ((MOD ushort8 *)p)[offset * 2 + 1] = _cl_float2half8##SUFFIX (data.hi);   \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half##SUFFIX (float data, size_t offset,      \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_float2half##SUFFIX (data);                \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half2##SUFFIX (float2 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half##SUFFIX (data.lo, offset * 2, p);                            \
+    vstorea_half##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half3##SUFFIX (float3 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half2##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half2##SUFFIX ((float2) (data.z, 0.0f), offset * 2 + 1, p);       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half4##SUFFIX (float4 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    ((MOD ushort4 *)p)[offset] = _cl_float2half4##SUFFIX (data);              \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half8##SUFFIX (float8 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset] = _cl_float2half8##SUFFIX (data);              \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half16##SUFFIX (float16 data, size_t offset,  \
+                                                MOD half *p)                  \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset * 2] = _cl_float2half8##SUFFIX (data.lo);       \
+    ((MOD ushort8 *)p)[offset * 2 + 1] = _cl_float2half8##SUFFIX (data.hi);   \
+  }
+
+// __F16C__
+#else
+
+#define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX)                                    \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half##SUFFIX (float data, size_t offset,       \
+                                             MOD half *p)                     \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_float2half##SUFFIX (data);                \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half2##SUFFIX (float2 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.lo, offset * 2, p);                             \
+    vstore_half##SUFFIX (data.hi, offset * 2 + 1, p);                         \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half3##SUFFIX (float3 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.x, offset * 3, p);                              \
+    vstore_half##SUFFIX (data.y, offset * 3 + 1, p);                          \
+    vstore_half##SUFFIX (data.z, offset * 3 + 2, p);                          \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half4##SUFFIX (float4 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half2##SUFFIX (data.lo, offset * 2, p);                            \
+    vstore_half2##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half8##SUFFIX (float8 data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half4##SUFFIX (data.lo, offset * 2, p);                            \
+    vstore_half4##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half16##SUFFIX (float16 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstore_half8##SUFFIX (data.lo, offset * 2, p);                            \
+    vstore_half8##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half##SUFFIX (float data, size_t offset,      \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_float2half##SUFFIX (data);                \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half2##SUFFIX (float2 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half##SUFFIX (data.lo, offset * 2, p);                            \
+    vstorea_half##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half3##SUFFIX (float3 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half2##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half2##SUFFIX ((float2) (data.z, 0.0f), offset * 2 + 1, p);       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half4##SUFFIX (float4 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half2##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half2##SUFFIX (data.hi, offset * 2 + 1, p);                       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half8##SUFFIX (float8 data, size_t offset,    \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half4##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half4##SUFFIX (data.hi, offset * 2 + 1, p);                       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half16##SUFFIX (float16 data, size_t offset,  \
+                                                MOD half *p)                  \
+  {                                                                           \
+    vstorea_half8##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half8##SUFFIX (data.hi, offset * 2 + 1, p);                       \
+  }
+
 #endif
 
+IMPLEMENT_VSTORE_HALF (__global, )
+IMPLEMENT_VSTORE_HALF (__global, _rte)
+IMPLEMENT_VSTORE_HALF (__global, _rtz)
+IMPLEMENT_VSTORE_HALF (__global, _rtp)
+IMPLEMENT_VSTORE_HALF (__global, _rtn)
+IMPLEMENT_VSTORE_HALF (__local, )
+IMPLEMENT_VSTORE_HALF (__local, _rte)
+IMPLEMENT_VSTORE_HALF (__local, _rtz)
+IMPLEMENT_VSTORE_HALF (__local, _rtp)
+IMPLEMENT_VSTORE_HALF (__local, _rtn)
+IMPLEMENT_VSTORE_HALF (__private, )
+IMPLEMENT_VSTORE_HALF (__private, _rte)
+IMPLEMENT_VSTORE_HALF (__private, _rtz)
+IMPLEMENT_VSTORE_HALF (__private, _rtp)
+IMPLEMENT_VSTORE_HALF (__private, _rtn)
 
+#ifdef cl_khr_fp64
 
-#define IMPLEMENT_VSTORE_HALF(MOD, SUFFIX)                              \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstore_half##SUFFIX(float data, size_t offset, MOD half *p)           \
-  {                                                                     \
-    ((MOD ushort*)p)[offset] = _cl_float2half(data);                    \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstore_half2##SUFFIX(float2 data, size_t offset, MOD half *p)         \
-  {                                                                     \
-    vstore_half##SUFFIX(data.lo, 0, &p[offset*2]);                      \
-    vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]);                    \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstore_half3##SUFFIX(float3 data, size_t offset, MOD half *p)         \
-  {                                                                     \
-    vstore_half2##SUFFIX(data.lo, 0, &p[offset*3]);                     \
-    vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]);                    \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstore_half4##SUFFIX(float4 data, size_t offset, MOD half *p)         \
-  {                                                                     \
-    vstore_half2##SUFFIX(data.lo, 0, &p[offset*4]);                     \
-    vstore_half2##SUFFIX(data.hi, 0, &p[offset*4+2]);                   \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstore_half8##SUFFIX(float8 data, size_t offset, MOD half *p)         \
-  {                                                                     \
-    vstore_half4##SUFFIX(data.lo, 0, &p[offset*8]);                     \
-    vstore_half4##SUFFIX(data.hi, 0, &p[offset*8+4]);                   \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstore_half16##SUFFIX(float16 data, size_t offset, MOD half *p)       \
-  {                                                                     \
-    vstore_half8##SUFFIX(data.lo, 0, &p[offset*16]);                    \
-    vstore_half8##SUFFIX(data.hi, 0, &p[offset*16+8]);                  \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstorea_half2##SUFFIX(float2 data, size_t offset, MOD half *p)        \
-  {                                                                     \
-    vstore_half##SUFFIX(data.lo, 0, &p[offset*2]);                      \
-    vstore_half##SUFFIX(data.hi, 0, &p[offset*2+1]);                    \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstorea_half3##SUFFIX(float3 data, size_t offset, MOD half *p)        \
-  {                                                                     \
-    vstorea_half2##SUFFIX(data.lo, 0, &p[offset*3]);                    \
-    vstore_half##SUFFIX(data.s2, 0, &p[offset*3+2]);                    \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstorea_half4##SUFFIX(float4 data, size_t offset, MOD half *p)        \
-  {                                                                     \
-    vstorea_half2##SUFFIX(data.lo, 0, &p[offset*4]);                    \
-    vstorea_half2##SUFFIX(data.hi, 0, &p[offset*4+2]);                  \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstorea_half8##SUFFIX(float8 data, size_t offset, MOD half *p)        \
-  {                                                                     \
-    vstorea_half4##SUFFIX(data.lo, 0, &p[offset*8]);                    \
-    vstorea_half4##SUFFIX(data.hi, 0, &p[offset*8+4]);                  \
-  }                                                                     \
-                                                                        \
-  void _CL_OVERLOADABLE                                                 \
-  vstorea_half16##SUFFIX(float16 data, size_t offset, MOD half *p)      \
-  {                                                                     \
-    vstorea_half8##SUFFIX(data.lo, 0, &p[offset*16]);                   \
-    vstorea_half8##SUFFIX(data.hi, 0, &p[offset*16+8]);                 \
+///#ifdef __F16C__
+#if 0
+
+#define IMPLEMENT_VSTORE_HALF_DBL(MOD, SUFFIX)                                \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half##SUFFIX (double data, size_t offset,      \
+                                             MOD half *p)                     \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_double2half##SUFFIX (data);               \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half2##SUFFIX (double2 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.lo, offset * 2, p);                             \
+    vstore_half##SUFFIX (data.hi, offset * 2 + 1, p);                         \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half3##SUFFIX (double3 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.x, offset * 3, p);                              \
+    vstore_half##SUFFIX (data.y, offset * 3 + 1, p);                          \
+    vstore_half##SUFFIX (data.z, offset * 3 + 2, p);                          \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half4##SUFFIX (double4 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort4 *)p)[offset]                                                \
+        = _cl_float2half4##SUFFIX (convert_float4##SUFFIX (data));            \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half8##SUFFIX (double8 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset]                                                \
+        = _cl_float2half8##SUFFIX (convert_float8##SUFFIX (data));            \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half16##SUFFIX (double16 data, size_t offset,  \
+                                               MOD half *p)                   \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset * 2]                                            \
+        = _cl_float2half8##SUFFIX (convert_float8##SUFFIX (data.lo));         \
+    ((MOD ushort8 *)p)[offset * 2 + 1]                                        \
+        = _cl_float2half8##SUFFIX (convert_float8##SUFFIX (data.hi));         \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half##SUFFIX (double data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_double2half##SUFFIX (data);               \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half2##SUFFIX (double2 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half##SUFFIX (data.lo, offset * 2, p);                            \
+    vstorea_half##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half3##SUFFIX (double3 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half2##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half2##SUFFIX ((float2) (data.z, 0.0f), offset * 2 + 1, p);       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half4##SUFFIX (double4 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    ((MOD ushort4 *)p)[offset]                                                \
+        = _cl_float2half4##SUFFIX (convert_float4##SUFFIX (data));            \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half8##SUFFIX (double8 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset]                                                \
+        = _cl_float2half8##SUFFIX (convert_float8##SUFFIX (data));            \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half16##SUFFIX (double16 data, size_t offset, \
+                                                MOD half *p)                  \
+  {                                                                           \
+    ((MOD ushort8 *)p)[offset * 2]                                            \
+        = _cl_float2half8##SUFFIX (convert_float8##SUFFIX (data.lo));         \
+    ((MOD ushort8 *)p)[offset * 2 + 1]                                        \
+        = _cl_float2half8##SUFFIX (convert_float8##SUFFIX (data.hi));         \
   }
 
+// __F16C__
+#else
 
+#define IMPLEMENT_VSTORE_HALF_DBL(MOD, SUFFIX)                                \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half##SUFFIX (double data, size_t offset,      \
+                                             MOD half *p)                     \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_double2half##SUFFIX (data);               \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half2##SUFFIX (double2 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.lo, offset * 2, p);                             \
+    vstore_half##SUFFIX (data.hi, offset * 2 + 1, p);                         \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half3##SUFFIX (double3 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half##SUFFIX (data.x, offset * 3, p);                              \
+    vstore_half##SUFFIX (data.y, offset * 3 + 1, p);                          \
+    vstore_half##SUFFIX (data.z, offset * 3 + 2, p);                          \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half4##SUFFIX (double4 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half2##SUFFIX (data.lo, offset * 2, p);                            \
+    vstore_half2##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half8##SUFFIX (double8 data, size_t offset,    \
+                                              MOD half *p)                    \
+  {                                                                           \
+    vstore_half4##SUFFIX (data.lo, offset * 2, p);                            \
+    vstore_half4##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstore_half16##SUFFIX (double16 data, size_t offset,  \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstore_half8##SUFFIX (data.lo, offset * 2, p);                            \
+    vstore_half8##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half##SUFFIX (double data, size_t offset,     \
+                                              MOD half *p)                    \
+  {                                                                           \
+    ((MOD ushort *)p)[offset] = _cl_double2half##SUFFIX (data);               \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half2##SUFFIX (double2 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half##SUFFIX (data.lo, offset * 2, p);                            \
+    vstorea_half##SUFFIX (data.hi, offset * 2 + 1, p);                        \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half3##SUFFIX (double3 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half2##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half2##SUFFIX ((double2) (data.z, 0.0), offset * 2 + 1, p);       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half4##SUFFIX (double4 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half2##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half2##SUFFIX (data.hi, offset * 2 + 1, p);                       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half8##SUFFIX (double8 data, size_t offset,   \
+                                               MOD half *p)                   \
+  {                                                                           \
+    vstorea_half4##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half4##SUFFIX (data.hi, offset * 2 + 1, p);                       \
+  }                                                                           \
+                                                                              \
+  void _CL_OVERLOADABLE vstorea_half16##SUFFIX (double16 data, size_t offset, \
+                                                MOD half *p)                  \
+  {                                                                           \
+    vstorea_half8##SUFFIX (data.lo, offset * 2, p);                           \
+    vstorea_half8##SUFFIX (data.hi, offset * 2 + 1, p);                       \
+  }
+
+#endif
 
-IMPLEMENT_VSTORE_HALF(__global  ,     )
-IMPLEMENT_VSTORE_HALF(__global  , _rte)
-IMPLEMENT_VSTORE_HALF(__global  , _rtz)
-IMPLEMENT_VSTORE_HALF(__global  , _rtp)
-IMPLEMENT_VSTORE_HALF(__global  , _rtn)
-IMPLEMENT_VSTORE_HALF(__local   ,     )
-IMPLEMENT_VSTORE_HALF(__local   , _rte)
-IMPLEMENT_VSTORE_HALF(__local   , _rtz)
-IMPLEMENT_VSTORE_HALF(__local   , _rtp)
-IMPLEMENT_VSTORE_HALF(__local   , _rtn)
-IMPLEMENT_VSTORE_HALF(__private ,     )
-IMPLEMENT_VSTORE_HALF(__private , _rte)
-IMPLEMENT_VSTORE_HALF(__private , _rtz)
-IMPLEMENT_VSTORE_HALF(__private , _rtp)
-IMPLEMENT_VSTORE_HALF(__private , _rtn)
+IMPLEMENT_VSTORE_HALF_DBL (__global, )
+IMPLEMENT_VSTORE_HALF_DBL (__global, _rte)
+IMPLEMENT_VSTORE_HALF_DBL (__global, _rtz)
+IMPLEMENT_VSTORE_HALF_DBL (__global, _rtp)
+IMPLEMENT_VSTORE_HALF_DBL (__global, _rtn)
+IMPLEMENT_VSTORE_HALF_DBL (__local, )
+IMPLEMENT_VSTORE_HALF_DBL (__local, _rte)
+IMPLEMENT_VSTORE_HALF_DBL (__local, _rtz)
+IMPLEMENT_VSTORE_HALF_DBL (__local, _rtp)
+IMPLEMENT_VSTORE_HALF_DBL (__local, _rtn)
+IMPLEMENT_VSTORE_HALF_DBL (__private, )
+IMPLEMENT_VSTORE_HALF_DBL (__private, _rte)
+IMPLEMENT_VSTORE_HALF_DBL (__private, _rtz)
+IMPLEMENT_VSTORE_HALF_DBL (__private, _rtp)
+IMPLEMENT_VSTORE_HALF_DBL (__private, _rtn)
 
 #endif
diff --git a/lib/kernel/wait_group_events.cl b/lib/kernel/wait_group_events.cl
index 6b7fabd..75f3e39 100644
--- a/lib/kernel/wait_group_events.cl
+++ b/lib/kernel/wait_group_events.cl
@@ -25,7 +25,7 @@
    blocking one which doesn't actually need events for 
    anything. The event waiting is therefore a dummy function. */
 
-void wait_group_events (int num_events,
-                        event_t *event_list) {
-    barrier(CLK_GLOBAL_MEM_FENCE);
+void _CL_OVERLOADABLE wait_group_events (int num_events,
+                                         event_t *event_list)
+{
 }
diff --git a/lib/kernel/write_image.cl b/lib/kernel/write_image.cl
index 9fded12..33c2f45 100644
--- a/lib/kernel/write_image.cl
+++ b/lib/kernel/write_image.cl
@@ -1,6 +1,7 @@
 /* OpenCL built-in library: write_image()
 
    Copyright (c) 2013 Ville Korhonen 
+   Copyright (c) 2017 Michal Babej / Tampere University of Technology
    
    Permission is hereby granted, free of charge, to any person obtaining a copy
    of this software and associated documentation files (the "Software"), to deal
@@ -21,91 +22,386 @@
    THE SOFTWARE.
 */
 
+/* NOTE: this file is NOT a generic implementation; it works with vectors
+   in a lot of places and requires that either device supports unaligned
+   vector operations, or that memory backing the images is properly aligned.
+   The maximum required alignment is 16bytes (4channels * 32bit color)
+
+   Not all CPUs support unaligned vector operations, but the pthread / basic
+   drivers allocate properly aligned memory for backing buffers; therefore
+   this should work for everything supported by pthread / basic.
+*/
+
 #include "templates.h"
 #include "pocl_image_rw_utils.h"
 
-#if (__clang_major__ > 3) || ((__clang_major__ == 3) && (__clang_minor__ >= 5))
-// Clang 3.5 crashes in case trying to cast to the private pointer,
-// adding the global qualifier fixes it. Clang 3.4 crashes if it's
-// there. The issue is in SROA.
-#define ADDRESS_SPACE global
-#else
-#define ADDRESS_SPACE
-#endif
+static uint4
+map_channels (uint4 color, int order)
+{
+  switch (order)
+    {
+    case CLK_ARGB:
+      return color.wxyz;
+    case CLK_BGRA:
+      return color.zyxw;
+    case CLK_RGBA:
+    default:
+      return color;
+    }
+}
 
-// 3.9 needs access qualifier
-// TODO: rw images
-#ifdef CLANG_OLDER_THAN_3_9
-#define IMG_WRITE_AQ
+/* only for CLK_FLOAT, CLK_SNORM_INT8, CLK_UNORM_INT8,
+ * CLK_SNORM_INT16, CLK_UNORM_INT16 channel types */
+static void
+write_float4_pixel (float4 color, void *data, size_t base_index, int type)
+{
+  if (type == CLK_FLOAT)
+    {
+      ((float4 *)data)[base_index] = color;
+      return;
+    }
+  if (type == CLK_HALF_FLOAT)
+    {
+#if !defined(LLVM_OLDER_THAN_3_8)
+      vstorea_half4(color, base_index, data);
 #else
-#define IMG_WRITE_AQ __write_only
+      __builtin_trap ();
 #endif
+    }
+  const float4 f127 = ((float4) (SCHAR_MAX));
+  const float4 f32767 = ((float4) (SHRT_MAX));
+  const float4 f255 = ((float4) (UCHAR_MAX));
+  const float4 f65535 = ((float4) (USHRT_MAX));
+  if (type == CLK_SNORM_INT8)
+    {
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      float4 colorf = color * f127;
+      char4 final_color = convert_char4_sat_rte (colorf);
+      ((char4 *)data)[base_index] = final_color;
+      return;
+    }
+  if (type == CLK_SNORM_INT16)
+    {
+      float4 colorf = color * f32767;
+      short4 final_color = convert_short4_sat_rte (colorf);
+      ((short4 *)data)[base_index] = final_color;
+      return;
+    }
+  if (type == CLK_UNORM_INT8)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      float4 colorf = color * f255;
+      uchar4 final_color = convert_uchar4_sat_rte (colorf);
+      ((uchar4 *)data)[base_index] = final_color;
+      return;
+    }
+  if (type == CLK_UNORM_INT16)
+    {
+      float4 colorf = color * f65535;
+      ushort4 final_color = convert_ushort4_sat_rte (colorf);
+      ((ushort4 *)data)[base_index] = final_color;
+      return;
+    }
 
-/* writes pixel to coord in image */
-void pocl_write_pixel (void* color_, ADDRESS_SPACE dev_image_t* dev_image,
-                       int4 coord)
+  return;
+}
+
+/* only for CLK_FLOAT, CLK_SNORM_INT8, CLK_UNORM_INT8,
+ * CLK_SNORM_INT16, CLK_UNORM_INT16 channel types */
+static void
+write_float_pixel (float color, void *data, size_t base_index, int type)
 {
-  uint4 *color = (uint4*)color_;
-  int width = dev_image->_width;
-  int height = dev_image->_height;
-  int num_channels = dev_image->_num_channels;
-  int i = num_channels;
-  int elem_size = dev_image->_elem_size;
-  int const base_index =
-    (coord.x + coord.y*width + coord.z*height*width) * num_channels;
+  if (type == CLK_FLOAT)
+    {
+      ((float *)data)[base_index] = color;
+      return;
+    }
+  const float f127 = ((float)SCHAR_MAX);
+  const float f32767 = ((float)SHRT_MAX);
+  const float f255 = ((float)UCHAR_MAX);
+  const float f65535 = ((float)USHRT_MAX);
+  if (type == CLK_SNORM_INT8)
+    {
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      float colorf = color * f127;
+      char final_color = convert_char_sat_rte (colorf);
+      ((char *)data)[base_index] = final_color;
+      return;
+    }
+  if (type == CLK_SNORM_INT16)
+    {
+      float colorf = color * f32767;
+      short final_color = convert_short_sat_rte (colorf);
+      ((short *)data)[base_index] = final_color;
+      return;
+    }
+  if (type == CLK_UNORM_INT8)
+    {
+      /* <0, I*_MAX> to <0.0, 1.0> */
+      /*  <-1.0, 1.0> to <I*_MIN, I*_MAX> */
+      float colorf = color * f255;
+      uchar final_color = convert_uchar_sat_rte (colorf);
+      ((uchar *)data)[base_index] = final_color;
+      return;
+    }
+  if (type == CLK_UNORM_INT16)
+    {
+      float colorf = color * f65535;
+      ushort final_color = convert_ushort_sat_rte (colorf);
+      ((ushort *)data)[base_index] = final_color;
+      return;
+    }
 
-  if (dev_image->_order == CL_A)
+  return;
+}
+
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+static void
+pocl_write_pixel_fast_ui (uint4 color, size_t base_index, int order,
+                          int elem_size, void *data)
+{
+  if (order == CLK_A)
     {
       if (elem_size == 1)
-        ((uchar*) (dev_image->_data))[base_index] = (*color)[3];
+        ((uchar *)data)[base_index] = convert_uchar_sat (color.w);
       else if (elem_size == 2)
-        ((ushort*) (dev_image->_data))[base_index] = (*color)[3];
+        ((ushort *)data)[base_index] = convert_ushort_sat (color.w);
       else if (elem_size == 4)
-        ((uint*) (dev_image->_data))[base_index] = (*color)[3];
+        ((uint *)data)[base_index] = color.w;
       return;
     }
 
   if (elem_size == 1)
     {
-      while (i--)
-        {
-          ((uchar*) (dev_image->_data))[base_index + i] = (*color)[i];
-        }
+      ((uchar4 *)data)[base_index] = convert_uchar4_sat (color);
     }
   else if (elem_size == 2)
     {
-      while (i--)
-        {
-          ((ushort*) dev_image->_data)[base_index + i] = (*color)[i];
-        }
+      ((ushort4 *)data)[base_index] = convert_ushort4_sat (color);
     }
   else if (elem_size == 4)
     {
-      while (i--)
-        {
-          ((uint*) dev_image->_data)[base_index + i] = (*color)[i];
-        }
+      ((uint4 *)data)[base_index] = color;
     }
+
+  return;
 }
 
-/* Implementation for write_image with any image data type and int coordinates
-   __IMGTYPE__ = image type (image2d_t, ...)
-   __DTYPE__  = data type to be read (int4 or uint4 float4)
-   __POSTFIX__ = function name postfix (i, ui, f)
-   __COORD__   = coordinate type (int, int2, int4)
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+static void
+pocl_write_pixel_fast_f (float4 color, size_t base_index, int channel_type,
+                         int order, void *data)
+{
+  if (order == CLK_A)
+    {
+      write_float_pixel (color.w, data, base_index, channel_type);
+    }
+  else
+    {
+      write_float4_pixel (color, data, base_index, channel_type);
+    }
+
+  return;
+}
+
+/* for use inside filter functions
+ * no channel mapping
+ * no pointers to img metadata */
+static void
+pocl_write_pixel_fast_i (int4 color, size_t base_index, int order,
+                         int elem_size, void *data)
+{
+  if (order == CLK_A)
+    {
+      if (elem_size == 1)
+        ((char *)data)[base_index] = convert_char_sat (color.w);
+      else if (elem_size == 2)
+        ((short *)data)[base_index] = convert_short_sat (color.w);
+      else if (elem_size == 4)
+        ((int *)data)[base_index] = color.w;
+      return;
+    }
+
+  if (elem_size == 1)
+    {
+      ((char4 *)data)[base_index] = convert_char4_sat (color);
+    }
+  else if (elem_size == 2)
+    {
+      ((short4 *)data)[base_index] = convert_short4_sat (color);
+    }
+  else if (elem_size == 4)
+    {
+      ((int4 *)data)[base_index] = color;
+    }
+  return;
+}
+
+/* full write with channel map conversion etc
+ * Writes a four element pixel to an image pixel pointed by integer coords.
+ */
+static void
+pocl_write_pixel (uint4 color, global dev_image_t *img, int4 coord,
+                  size_t array_offset_pixels, size_t row_pitch,
+                  size_t slice_pitch)
+{
+  int width = img->_width;
+  int height = img->_height;
+  int depth = img->_depth;
+  int order = img->_order;
+  int elem_size = img->_elem_size;
+  int channel_type = img->_data_type;
+  void *data = img->_data;
+
+  if ((coord.x >= width || coord.x < 0)
+      || ((height != 0) && (coord.y >= height || coord.y < 0))
+      || ((depth != 0) && (coord.z >= depth || coord.z < 0)))
+    {
+      return;
+    }
+
+  size_t base_index = array_offset_pixels + coord.x + (coord.y * row_pitch)
+                      + (coord.z * slice_pitch);
+
+  color = map_channels (color, order);
+
+  if ((channel_type == CLK_SIGNED_INT8) || (channel_type == CLK_SIGNED_INT16)
+      || (channel_type == CLK_SIGNED_INT32))
+    pocl_write_pixel_fast_i (as_int4 (color), base_index, order, elem_size,
+                             data);
+  else if ((channel_type == CLK_UNSIGNED_INT8)
+           || (channel_type == CLK_UNSIGNED_INT16)
+           || (channel_type == CLK_UNSIGNED_INT32))
+    pocl_write_pixel_fast_ui (as_uint4 (color), base_index, order, elem_size,
+                              data);
+  else // TODO unsupported channel types
+    pocl_write_pixel_fast_f (as_float4 (color), base_index, channel_type,
+                             order, data);
+}
+
+/*
+write_imagei can only be used with image objects created with
+image_channel_data_type set to one of the following values:
+CLK_SIGNED_INT8, CLK_SIGNED_INT16, and CLK_SIGNED_INT32.
+
+write_imageui functions can only be used with image objects created with
+image_channel_data_type set to one of the following values:
+CLK_UNSIGNED_INT8, CLK_UNSIGNED_INT16, or CLK_UNSIGNED_INT32.
+*/
+
+/*
+ * write_imagef can only be used with image objects created with
+ * image_channel_data_type set to one of the pre-defined packed formats,
+ * or set to CLK_SNORM_INT8, CLK_UNORM_INT8, CLK_SNORM_INT16,
+ * CLK_UNORM_INT16, CLK_HALF_FLOAT or CLK_FLOAT.
 */
-#define IMPLEMENT_WRITE_IMAGE_INT_COORD(__IMGTYPE__,__DTYPE__,__POSTFIX__, \
-                                        __COORD__)                      \
-  void _CL_OVERLOADABLE write_image##__POSTFIX__ (__write_only __IMGTYPE__ image,    \
-                                                  __COORD__ coord,      \
-                                                  __DTYPE__ color)      \
-  {                                                                     \
-    int4 coord4;                                                        \
-    INITCOORD##__COORD__(coord4, coord);                                \
-    global dev_image_t* i_ptr = __builtin_astype (image, global dev_image_t*); \
-    pocl_write_pixel (&color, i_ptr, coord4);                             \
-  }                                                                     \
-
-IMPLEMENT_WRITE_IMAGE_INT_COORD (image2d_t, uint4, ui, int2)
-IMPLEMENT_WRITE_IMAGE_INT_COORD (image2d_t, float4, f, int2)
-IMPLEMENT_WRITE_IMAGE_INT_COORD (image3d_t, float4, f, int4)
+
+#define IMPLEMENT_WRITE_IMAGE_INT_COORD(__IMGTYPE__, __POSTFIX__, __COORD__,  \
+                                        __DTYPE__)                            \
+  void _CL_OVERLOADABLE write_image##__POSTFIX__ (                            \
+      __IMGTYPE__ image, __COORD__ coord, __DTYPE__ color)                    \
+  {                                                                           \
+    int4 coord4;                                                              \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    int elem_size = i_ptr->_elem_size;                                        \
+    int num_channels = i_ptr->_num_channels;                                  \
+    size_t elem_bytes = num_channels * elem_size;                             \
+    size_t row_pitch = i_ptr->_row_pitch / elem_bytes;                        \
+    size_t slice_pitch = i_ptr->_slice_pitch / elem_bytes;                    \
+    pocl_write_pixel (as_uint4 (color), i_ptr, coord4, 0, row_pitch,          \
+                      slice_pitch);                                           \
+  }
+
+#define IMPLEMENT_WRITE_ARRAY_INT_COORD(__IMGTYPE__, __POSTFIX__, __COORD__,  \
+                                        __DTYPE__)                            \
+  void _CL_OVERLOADABLE write_image##__POSTFIX__ (                            \
+      __IMGTYPE__ image, __COORD__ coord, __DTYPE__ color)                    \
+  {                                                                           \
+    int4 coord4;                                                              \
+    INITCOORD##__COORD__ (coord4, coord);                                     \
+    global dev_image_t *i_ptr                                                 \
+        = __builtin_astype (image, global dev_image_t *);                     \
+    int asize = i_ptr->_image_array_size - 1;                                 \
+    int elem_size = i_ptr->_elem_size;                                        \
+    int num_channels = i_ptr->_num_channels;                                  \
+    size_t elem_bytes = num_channels * elem_size;                             \
+    size_t row_pitch = i_ptr->_row_pitch / elem_bytes;                        \
+    size_t slice_pitch = i_ptr->_slice_pitch / elem_bytes;                    \
+    size_t array_offset_pixels = 0;                                           \
+    if (i_ptr->_height > 0)                                                   \
+      {                                                                       \
+        array_offset_pixels = clamp (coord4.z, 0, asize);                     \
+        coord4.z = 0;                                                         \
+      }                                                                       \
+    else                                                                      \
+      {                                                                       \
+        array_offset_pixels = clamp (coord4.y, 0, asize);                     \
+        coord4.y = 0;                                                         \
+      }                                                                       \
+    array_offset_pixels *= slice_pitch;                                       \
+    pocl_write_pixel (as_uint4 (color), i_ptr, coord4, array_offset_pixels,   \
+                      row_pitch, slice_pitch);                                \
+  }
+
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image1d_t, ui, int, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image1d_t, i, int, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image1d_t, f, int, float4)
+
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image1d_buffer_t, ui, int, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image1d_buffer_t, i, int, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image1d_buffer_t, f, int, float4)
+
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_WO_AQ image1d_array_t, ui, int2, uint4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_WO_AQ image1d_array_t, i, int2, int4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_WO_AQ image1d_array_t, f, int2, float4)
+
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image2d_t, ui, int2, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image2d_t, i, int2, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image2d_t, f, int2, float4)
+
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_WO_AQ image2d_array_t, ui, int4, uint4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_WO_AQ image2d_array_t, i, int4, int4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_WO_AQ image2d_array_t, f, int4, float4)
+
+#ifdef cl_khr_3d_image_writes
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image3d_t, ui, int4, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image3d_t, i, int4, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_WO_AQ image3d_t, f, int4, float4)
+#endif
+
+#ifdef CLANG_HAS_RW_IMAGES
+
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image1d_t, ui, int, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image1d_t, i, int, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image1d_t, f, int, float4)
+
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image1d_buffer_t, ui, int, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image1d_buffer_t, i, int, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image1d_buffer_t, f, int, float4)
+
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_RW_AQ image1d_array_t, ui, int2, uint4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_RW_AQ image1d_array_t, i, int2, int4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_RW_AQ image1d_array_t, f, int2, float4)
+
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image2d_t, ui, int2, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image2d_t, i, int2, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image2d_t, f, int2, float4)
+
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_RW_AQ image2d_array_t, ui, int4, uint4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_RW_AQ image2d_array_t, i, int4, int4)
+IMPLEMENT_WRITE_ARRAY_INT_COORD (IMG_RW_AQ image2d_array_t, f, int4, float4)
+
+#ifdef cl_khr_3d_image_writes
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image3d_t, ui, int4, uint4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image3d_t, i, int4, int4)
+IMPLEMENT_WRITE_IMAGE_INT_COORD (IMG_RW_AQ image3d_t, f, int4, float4)
+#endif
+
+#endif
diff --git a/lib/llvmopencl/AllocasToEntry.cc b/lib/llvmopencl/AllocasToEntry.cc
index 0b4193b..ad305c9 100644
--- a/lib/llvmopencl/AllocasToEntry.cc
+++ b/lib/llvmopencl/AllocasToEntry.cc
@@ -1,5 +1,4 @@
-// Header for AllocasToEntry, an LLVM pass to move allocas to the function 
-// entry node.
+// AllocasToEntry, an LLVM pass to move allocas to the function entry node.
 // 
 // Copyright (c) 2013 Pekka Jääskeläinen / TUT
 // 
diff --git a/lib/llvmopencl/AutomaticLocals.cc b/lib/llvmopencl/AutomaticLocals.cc
index f732d82..17aeebd 100644
--- a/lib/llvmopencl/AutomaticLocals.cc
+++ b/lib/llvmopencl/AutomaticLocals.cc
@@ -123,6 +123,25 @@ AutomaticLocals::runOnModule(Module &M)
   return changed;
 }
 
+// Recursively descend a Value's users and convert any constant expressions into
+// regular instructions.
+static void breakConstantExpressions(llvm::Value *Val, llvm::Function *Func) {
+  std::vector<llvm::Value *> Users(Val->user_begin(), Val->user_end());
+  for (auto *U : Users) {
+    if (auto *CE = llvm::dyn_cast<llvm::ConstantExpr>(U)) {
+      // First, make sure no users of this constant expression are themselves
+      // constant expressions.
+      breakConstantExpressions(U, Func);
+
+      // Convert this constant expression to an instruction.
+      llvm::Instruction *I = CE->getAsInstruction();
+      I->insertBefore(&*Func->begin()->begin());
+      CE->replaceAllUsesWith(I);
+      CE->destroyConstant();
+    }
+  }
+}
+
 Function *
 AutomaticLocals::processAutomaticLocals(Function *F) {
 
@@ -143,6 +162,10 @@ AutomaticLocals::processAutomaticLocals(Function *F) {
       Locals.push_back(&*i);
       // Add the parameters to the end of the function parameter list.
       Parameters.push_back(i->getType());
+
+      // Replace any constant expression users with an equivalent instruction.
+      // Otherwise, the IR breaks when we replace the local with an argument.
+      breakConstantExpressions(&*i, F);
     }
   }
 
@@ -172,7 +195,12 @@ AutomaticLocals::processAutomaticLocals(Function *F) {
   }
 
   SmallVector<ReturnInst *, 1> RI;
-  CloneFunctionInto(NewKernel, F, VV, false, RI);
+
+  // As of LLVM 5.0 we need to let CFI to make module level changes,
+  // otherwise there will be an assertion. The changes are likely
+  // additional debug info nodes added when cloning the function into
+  // the other.  For some reason it doesn't want to reuse the old ones.
+  CloneFunctionInto(NewKernel, F, VV, true, RI);
 
   return NewKernel;
 }
diff --git a/lib/llvmopencl/Barrier.h b/lib/llvmopencl/Barrier.h
index e46e7e7..f5cc1cb 100644
--- a/lib/llvmopencl/Barrier.h
+++ b/lib/llvmopencl/Barrier.h
@@ -22,6 +22,8 @@
 
 #include "config.h"
 
+#include "pocl.h"
+
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
@@ -58,10 +60,16 @@ namespace pocl {
           llvm::isa<Barrier>(InsertBefore->getPrevNode()))
         return llvm::cast<Barrier>(InsertBefore->getPrevNode());
 
+#if LLVM_OLDER_THAN_5_0
+      llvm::Function *F = llvm::cast<llvm::Function>
+        (M->getOrInsertFunction(BARRIER_FUNCTION_NAME,
+                                llvm::Type::getVoidTy(M->getContext()), NULL));
+#else
       llvm::Function *F = llvm::cast<llvm::Function>
         (M->getOrInsertFunction(BARRIER_FUNCTION_NAME,
-                                llvm::Type::getVoidTy(M->getContext()),
-                                NULL));
+                                llvm::Type::getVoidTy(M->getContext())));
+#endif
+      F->addFnAttr(llvm::Attribute::NoDuplicate);
       F->setLinkage(llvm::GlobalValue::LinkOnceAnyLinkage);
       return llvm::cast<pocl::Barrier>
         (llvm::CallInst::Create(F, "", InsertBefore));
diff --git a/lib/llvmopencl/BreakConstantGEPs.h b/lib/llvmopencl/BreakConstantGEPs.h
index 017f4e8..2706a9e 100644
--- a/lib/llvmopencl/BreakConstantGEPs.h
+++ b/lib/llvmopencl/BreakConstantGEPs.h
@@ -47,8 +47,8 @@ struct BreakConstantGEPs : public FunctionPass {
 #else
     llvm::StringRef getPassName() const override {return "Remove Constant GEP Expressions";}
 #endif
-    virtual bool runOnFunction (Function & F);
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    virtual bool runOnFunction (Function & F) override;
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
       // This pass does not modify the control-flow graph of the function
       AU.setPreservesCFG();
     }
diff --git a/lib/llvmopencl/CMakeLists.txt b/lib/llvmopencl/CMakeLists.txt
index 82a610f..8241f89 100644
--- a/lib/llvmopencl/CMakeLists.txt
+++ b/lib/llvmopencl/CMakeLists.txt
@@ -24,14 +24,14 @@
 #=============================================================================
 
 set(LLVMPASSES_SOURCES "Barrier.h"
-  "BarrierBlock.h" "BarrierBlock.cc"
   "Kernel.h" "Kernel.cc"
   "ParallelRegion.h" "ParallelRegion.cc"
   "CanonicalizeBarriers.h" "CanonicalizeBarriers.cc"
   "LoopBarriers.h" "LoopBarriers.cc"
   "Workgroup.h" "Workgroup.cc"
   "BarrierTailReplication.h" "BarrierTailReplication.cc"
-  "Flatten.cc" "IsolateRegions.h" "IsolateRegions.cc"
+  "Flatten.cc" "FlattenGlobals.cc"
+  "IsolateRegions.h" "IsolateRegions.cc"
   "WorkitemReplication.h" "WorkitemReplication.cc"
   "ImplicitLoopBarriers.h" "ImplicitLoopBarriers.cc"
   "WorkItemAliasAnalysis.cc"
@@ -47,7 +47,9 @@ set(LLVMPASSES_SOURCES "Barrier.h"
   "ImplicitConditionalBarriers.h"
   "DebugHelpers.h" "DebugHelpers.cc"
   "RemoveBarrierCalls.h" "RemoveBarrierCalls.cc"
-  "HandleSamplerInitialization.h" "HandleSamplerInitialization.cc")
+  "HandleSamplerInitialization.h" "HandleSamplerInitialization.cc"
+  "RemoveOptnoneFromWIFunc.h" "RemoveOptnoneFromWIFunc.cc"
+  "OptimizeWorkItemFuncCalls.h" "OptimizeWorkItemFuncCalls.cc")
 
 if(POCL_USE_FAKE_ADDR_SPACE_IDS)
 list(APPEND LLVMPASSES_SOURCES "TargetAddressSpaces.cc")
diff --git a/lib/llvmopencl/CanonicalizeBarriers.cc b/lib/llvmopencl/CanonicalizeBarriers.cc
index 80027c7..eec85d3 100644
--- a/lib/llvmopencl/CanonicalizeBarriers.cc
+++ b/lib/llvmopencl/CanonicalizeBarriers.cc
@@ -34,7 +34,6 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 #include "llvm/IR/Dominators.h"
 
 #include "CanonicalizeBarriers.h"
-#include "BarrierBlock.h"
 #include "Barrier.h"
 #include "Workgroup.h"
 #include "VariableUniformityAnalysis.h"
@@ -66,7 +65,7 @@ CanonicalizeBarriers::runOnFunction(Function &F)
     return false;
 
   BasicBlock *entry = &F.getEntryBlock();
-  if (!isa<BarrierBlock>(entry)) {
+  if (!Barrier::hasOnlyBarrier(entry)) {
 #ifdef LLVM_OLDER_THAN_3_7
     BasicBlock *effective_entry = SplitBlock(entry, 
                                              &(entry->front()),
@@ -86,7 +85,7 @@ CanonicalizeBarriers::runOnFunction(Function &F)
     TerminatorInst *t = b->getTerminator();
 
     const bool isExitNode = 
-      (t->getNumSuccessors() == 0) && (!isa<BarrierBlock>(b));
+      (t->getNumSuccessors() == 0) && (!Barrier::hasOnlyBarrier(b));
 
     // The function exits should have barriers.
     if (isExitNode && !Barrier::hasOnlyBarrier(b)) {
diff --git a/lib/llvmopencl/DebugHelpers.cc b/lib/llvmopencl/DebugHelpers.cc
index 0a7dc35..d6a7420 100644
--- a/lib/llvmopencl/DebugHelpers.cc
+++ b/lib/llvmopencl/DebugHelpers.cc
@@ -37,8 +37,8 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "DebugHelpers.h"
 #include "Barrier.h"
-#include "BarrierBlock.h"
 #include "Workgroup.h"
+#include "pocl_file_util.h"
 
 POP_COMPILER_DIAGS
 
@@ -139,6 +139,15 @@ void dumpCFG(
   if (fname == "")
     fname = std::string("pocl_cfg.") + F.getName().str() + ".dot";
 
+  std::string origName = fname;
+  int counter = 0;
+  while (pocl_exists (fname.c_str())) {
+    std::ostringstream ss;
+    ss << origName << "." << counter;
+    fname = ss.str();
+    ++counter;
+  }
+
   std::ofstream s;
   s.open(fname.c_str(), std::ios::trunc);
   s << "digraph " << F.getName().str() << " {" << std::endl;
diff --git a/lib/llvmopencl/Flatten.cc b/lib/llvmopencl/Flatten.cc
index e29ba10..bbe1e6d 100644
--- a/lib/llvmopencl/Flatten.cc
+++ b/lib/llvmopencl/Flatten.cc
@@ -1,19 +1,18 @@
-// LLVM module pass to inline required functions (those accessing
-// per-workgroup variables) into the kernel.
-// 
+// LLVM module pass to inline ALL called functions into the kernel.
+//
 // Copyright (c) 2011 Universidad Rey Juan Carlos
 //               2012-2015 Pekka Jääskeläinen
-// 
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
-// 
+//
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
-// 
+//
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -29,6 +28,7 @@
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "config.h"
+#include "pocl.h"
 
 #include "llvm/Support/CommandLine.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -56,119 +56,63 @@ namespace {
 extern cl::opt<std::string> KernelName;
 
 char Flatten::ID = 0;
-static RegisterPass<Flatten> X("flatten", "Kernel function flattening pass");
+static RegisterPass<Flatten>
+    X("flatten-inline-all",
+      "Kernel function flattening pass - flatten everything");
 
 //#define DEBUG_FLATTEN
 
-#define INLINE_ALL_NON_KERNEL
-
-#ifdef INLINE_ALL_NON_KERNEL
-
 bool
 Flatten::runOnModule(Module &M)
 {
   bool changed = false;
-  for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i)
-    {
-      llvm::Function *f = &*i;
-      if (f->isDeclaration()) continue;
-      if (KernelName == f->getName() || 
-          (KernelName == "" && pocl::Workgroup::isKernelToProcess(*f)))
-        {
-          AttributeSet attrs;
-          f->removeAttributes(
-              AttributeSet::FunctionIndex, 
-              attrs.addAttribute
-              (M.getContext(), 
-               AttributeSet::FunctionIndex, Attribute::AlwaysInline));
-
-          f->addFnAttr(Attribute::NoInline);
-
-          f->setLinkage(llvm::GlobalValue::ExternalLinkage);
-          changed = true;
-#ifdef DEBUG_FLATTEN
-          std::cerr << "### NoInline for " << f->getName().str() << std::endl;
+  for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    llvm::Function *f = &*i;
+    if (f->isDeclaration()) continue;
+    if (KernelName == f->getName() ||
+        (KernelName == "" && pocl::Workgroup::isKernelToProcess(*f))) {
+#if LLVM_OLDER_THAN_5_0
+      AttributeSet Attrs;
+      f->removeAttributes(AttributeSet::FunctionIndex,
+                          Attrs.addAttribute(M.getContext(),
+                                             AttributeSet::FunctionIndex,
+                                             Attribute::AlwaysInline));
+#else
+      AttributeSet Attrs;
+      f->removeAttributes(AttributeList::FunctionIndex,
+                          Attrs.addAttribute(M.getContext(),
+                                             Attribute::AlwaysInline));
 #endif
-        } 
-      else
-        {
-          AttributeSet attrs;
-          f->removeAttributes(
-              AttributeSet::FunctionIndex, 
-              attrs.addAttribute(M.getContext(), 
-                                 AttributeSet::FunctionIndex, 
-                                 Attribute::NoInline));
-          f->addFnAttr(Attribute::AlwaysInline);
-
-          f->setLinkage(llvm::GlobalValue::InternalLinkage);
-          changed = true;
+
+      f->addFnAttr(Attribute::NoInline);
+
+      f->setLinkage(llvm::GlobalValue::ExternalLinkage);
+      changed = true;
 #ifdef DEBUG_FLATTEN
-          std::cerr << "### AlwaysInline for " << f->getName().str() << std::endl;
+      std::cerr << "### NoInline for " << f->getName().str() << std::endl;
 #endif
-        }
-    }
-  return changed;
-}
-
+      } else {
+#if LLVM_OLDER_THAN_5_0
+      AttributeSet Attrs;
+      f->removeAttributes(AttributeSet::FunctionIndex,
+                          Attrs.addAttribute(M.getContext(),
+                                             AttributeSet::FunctionIndex,
+                                             Attribute::NoInline));
 #else
+      AttributeSet Attrs;
+      f->removeAttributes(AttributeList::FunctionIndex,
+                          Attrs.addAttribute(M.getContext(),
+                                             Attribute::NoInline));
+#endif
+      f->addFnAttr(Attribute::AlwaysInline);
 
-static const char *workgroup_variables[] = {
-  "_local_id_x", "_local_id_y", "_local_id_z",
-  "_local_size_x", "_local_size_y", "_local_size_z",
-  "_work_dim",
-  "_num_groups_x", "_num_groups_y", "_num_groups_z",
-  "_group_id_x", "_group_id_y", "_group_id_z",
-  "_global_offset_x", "_global_offset_y", "_global_offset_z",
-  NULL
-};
-
-bool
-Flatten::runOnModule(Module &M)
-{
-  SmallPtrSet<Function *, 8> functions_to_inline;
-  SmallVector<Value *, 8> pending;
-
-  const char **s = workgroup_variables;
-  while (*s != NULL) {
-    GlobalVariable *gv = M.getGlobalVariable(*s);
-    if (gv != NULL)
-      pending.push_back(gv);
-
-    ++s;
-  }
-
-  while (!pending.empty()) {
-    Value *v = pending.back();
-    pending.pop_back();
-
-    for (Value::use_iterator i = v->use_begin(), e = v->use_end();
-         i != e; ++i) {
-      llvm::User *user = i->getUser();
-      if (Instruction *ci = dyn_cast<Instruction>(user) {
-        // Prevent infinite looping on recursive functions
-        // (though OpenCL does not allow this?)
-        Function *f = ci->getParent()->getParent();;
-        assert((f != NULL) &&
-               "Per-workgroup global variable used on function with no parent!");
-        if (functions_to_inline.count(f))
-          continue;
-        
-        functions_to_inline.insert(f);
-        pending.push_back(f);
+      f->setLinkage(llvm::GlobalValue::InternalLinkage);
+      changed = true;
+#ifdef DEBUG_FLATTEN
+      std::cerr << "### AlwaysInline for " << f->getName().str() << std::endl;
+#endif
       }
     }
-  }
-
-  for (SmallPtrSet<Function *, 8>::iterator i = functions_to_inline.begin(),
-	 e = functions_to_inline.end();
-       i != e; ++i) {
-    (*i)->removeFnAttr(Attribute::NoInline);
-    (*i)->addFnAttr(Attribute::AlwaysInline);
-  }
-
-  return true;
+  return changed;
 }
 
-#endif
-
-
diff --git a/lib/llvmopencl/Flatten.cc b/lib/llvmopencl/FlattenGlobals.cc
similarity index 52%
copy from lib/llvmopencl/Flatten.cc
copy to lib/llvmopencl/FlattenGlobals.cc
index e29ba10..e98ea13 100644
--- a/lib/llvmopencl/Flatten.cc
+++ b/lib/llvmopencl/FlattenGlobals.cc
@@ -1,19 +1,19 @@
-// LLVM module pass to inline required functions (those accessing
+// LLVM module pass to inline only required functions (those accessing
 // per-workgroup variables) into the kernel.
-// 
+//
 // Copyright (c) 2011 Universidad Rey Juan Carlos
 //               2012-2015 Pekka Jääskeläinen
-// 
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
-// 
+//
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
-// 
+//
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -29,11 +29,12 @@
 IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "config.h"
+#include "pocl.h"
 
-#include "llvm/Support/CommandLine.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Pass.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 
 #include "Workgroup.h"
 
@@ -42,89 +43,44 @@ POP_COMPILER_DIAGS
 using namespace llvm;
 
 namespace {
-  class Flatten : public ModulePass {
-
-  public:
-    static char ID;
-    Flatten() : ModulePass(ID) {}
+class FlattenGlobals : public ModulePass {
 
-    virtual bool runOnModule(Module &M);
-  };
+public:
+  static char ID;
+  FlattenGlobals() : ModulePass(ID) {}
 
+  virtual bool runOnModule(Module &M);
+};
 }
 
 extern cl::opt<std::string> KernelName;
 
-char Flatten::ID = 0;
-static RegisterPass<Flatten> X("flatten", "Kernel function flattening pass");
+char FlattenGlobals::ID = 0;
+static RegisterPass<FlattenGlobals>
+    X("flatten-globals",
+      "Kernel function flattening pass - flatten global vars' users only");
 
 //#define DEBUG_FLATTEN
 
-#define INLINE_ALL_NON_KERNEL
-
-#ifdef INLINE_ALL_NON_KERNEL
-
-bool
-Flatten::runOnModule(Module &M)
-{
-  bool changed = false;
-  for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i)
-    {
-      llvm::Function *f = &*i;
-      if (f->isDeclaration()) continue;
-      if (KernelName == f->getName() || 
-          (KernelName == "" && pocl::Workgroup::isKernelToProcess(*f)))
-        {
-          AttributeSet attrs;
-          f->removeAttributes(
-              AttributeSet::FunctionIndex, 
-              attrs.addAttribute
-              (M.getContext(), 
-               AttributeSet::FunctionIndex, Attribute::AlwaysInline));
-
-          f->addFnAttr(Attribute::NoInline);
-
-          f->setLinkage(llvm::GlobalValue::ExternalLinkage);
-          changed = true;
-#ifdef DEBUG_FLATTEN
-          std::cerr << "### NoInline for " << f->getName().str() << std::endl;
-#endif
-        } 
-      else
-        {
-          AttributeSet attrs;
-          f->removeAttributes(
-              AttributeSet::FunctionIndex, 
-              attrs.addAttribute(M.getContext(), 
-                                 AttributeSet::FunctionIndex, 
-                                 Attribute::NoInline));
-          f->addFnAttr(Attribute::AlwaysInline);
-
-          f->setLinkage(llvm::GlobalValue::InternalLinkage);
-          changed = true;
-#ifdef DEBUG_FLATTEN
-          std::cerr << "### AlwaysInline for " << f->getName().str() << std::endl;
-#endif
-        }
-    }
-  return changed;
-}
-
-#else
-
-static const char *workgroup_variables[] = {
-  "_local_id_x", "_local_id_y", "_local_id_z",
-  "_local_size_x", "_local_size_y", "_local_size_z",
-  "_work_dim",
-  "_num_groups_x", "_num_groups_y", "_num_groups_z",
-  "_group_id_x", "_group_id_y", "_group_id_z",
-  "_global_offset_x", "_global_offset_y", "_global_offset_z",
-  NULL
-};
-
-bool
-Flatten::runOnModule(Module &M)
-{
+static const char *workgroup_variables[] = {"_local_id_x",
+                                            "_local_id_y",
+                                            "_local_id_z",
+                                            "_local_size_x",
+                                            "_local_size_y",
+                                            "_local_size_z",
+                                            "_work_dim",
+                                            "_num_groups_x",
+                                            "_num_groups_y",
+                                            "_num_groups_z",
+                                            "_group_id_x",
+                                            "_group_id_y",
+                                            "_group_id_z",
+                                            "_global_offset_x",
+                                            "_global_offset_y",
+                                            "_global_offset_z",
+                                            NULL};
+
+bool FlattenGlobals::runOnModule(Module &M) {
   SmallPtrSet<Function *, 8> functions_to_inline;
   SmallVector<Value *, 8> pending;
 
@@ -141,18 +97,20 @@ Flatten::runOnModule(Module &M)
     Value *v = pending.back();
     pending.pop_back();
 
-    for (Value::use_iterator i = v->use_begin(), e = v->use_end();
-         i != e; ++i) {
+    for (Value::use_iterator i = v->use_begin(), e = v->use_end(); i != e;
+         ++i) {
       llvm::User *user = i->getUser();
-      if (Instruction *ci = dyn_cast<Instruction>(user) {
+      if (Instruction *ci = dyn_cast<Instruction>(user)) {
         // Prevent infinite looping on recursive functions
         // (though OpenCL does not allow this?)
-        Function *f = ci->getParent()->getParent();;
-        assert((f != NULL) &&
-               "Per-workgroup global variable used on function with no parent!");
+        Function *f = ci->getParent()->getParent();
+        ;
+        assert(
+            (f != NULL) &&
+            "Per-workgroup global variable used on function with no parent!");
         if (functions_to_inline.count(f))
           continue;
-        
+
         functions_to_inline.insert(f);
         pending.push_back(f);
       }
@@ -160,15 +118,22 @@ Flatten::runOnModule(Module &M)
   }
 
   for (SmallPtrSet<Function *, 8>::iterator i = functions_to_inline.begin(),
-	 e = functions_to_inline.end();
+                                            e = functions_to_inline.end();
        i != e; ++i) {
     (*i)->removeFnAttr(Attribute::NoInline);
     (*i)->addFnAttr(Attribute::AlwaysInline);
   }
 
+  StringRef barrier("_Z7barrierj");
+  for (llvm::Module::iterator i = M.begin(), e = M.end(); i != e; ++i) {
+    llvm::Function *f = &*i;
+    if (f->isDeclaration())
+      continue;
+    if (f->getName().equals(barrier)) {
+      f->removeFnAttr(Attribute::NoInline);
+      f->addFnAttr(Attribute::AlwaysInline);
+    }
+  }
+
   return true;
 }
-
-#endif
-
-
diff --git a/lib/llvmopencl/HandleSamplerInitialization.cc b/lib/llvmopencl/HandleSamplerInitialization.cc
index 7aee7aa..2f87573 100644
--- a/lib/llvmopencl/HandleSamplerInitialization.cc
+++ b/lib/llvmopencl/HandleSamplerInitialization.cc
@@ -81,9 +81,17 @@ HandleSamplerInitialization::runOnFunction(Function &F) {
     ConstantInt *SamplerValue = dyn_cast<ConstantInt>(C->arg_begin()->get());
 
     llvm::AllocaInst *Alloca = Builder.CreateAlloca(SamplerValue->getType());
+    /* Creates a volatile store. If the store is not volatile, it gets
+     * optimized out by DSE for some reason (possibly because opencl.sampler_t
+     * type is opaque).
+     *
+     * The proper solution would be to use the opencl.sampler_t directly
+     * for storing the sampler value, and not allocate storage at all,
+     * but this requires more changes - TODO.
+     */
     Builder.CreateStore(
       ConstantInt::get(SamplerValue->getType(), SamplerValue->getValue()),
-      Alloca);
+      Alloca, true);
     C->replaceAllUsesWith(Builder.CreateBitOrPointerCast(Alloca, C->getType()));
     C->eraseFromParent();
     Changed = true;
diff --git a/lib/llvmopencl/ImplicitConditionalBarriers.cc b/lib/llvmopencl/ImplicitConditionalBarriers.cc
index 2781c51..afdddc7 100644
--- a/lib/llvmopencl/ImplicitConditionalBarriers.cc
+++ b/lib/llvmopencl/ImplicitConditionalBarriers.cc
@@ -30,18 +30,18 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 
 #include "pocl.h"
 
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include <llvm/Transforms/Utils/BasicBlockUtils.h>
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+#include <llvm/IR/Module.h>
+
+POP_COMPILER_DIAGS
 
 #include "ImplicitConditionalBarriers.h"
 #include "Barrier.h"
-#include "BarrierBlock.h"
 #include "Workgroup.h"
 #include "VariableUniformityAnalysis.h"
 
-POP_COMPILER_DIAGS
 
 //#define DEBUG_COND_BARRIERS
 
@@ -138,15 +138,17 @@ ImplicitConditionalBarriers::runOnFunction(Function &F) {
     // BB before which to inject the barrier.
     BasicBlock *pos = b;
     if (pred_begin(b) == pred_end(b)) {
+#ifdef DEBUG_COND_BARRIERS
       b->dump();
+#endif
       assert (pred_begin(b) == pred_end(b));
     }
     BasicBlock *pred = firstNonBackedgePredecessor(b);
 
 #ifdef LLVM_OLDER_THAN_3_9
-    while (!isa<BarrierBlock>(pred) && PDT->dominates(b, pred)) {
+    while (!Barrier::hasOnlyBarrier(pred) && PDT->dominates(b, pred)) {
 #else
-    while (!isa<BarrierBlock>(pred) && PDT->getPostDomTree().dominates(b, pred)) {
+    while (!Barrier::hasOnlyBarrier(pred) && PDT->getPostDomTree().dominates(b, pred)) {
 #endif
 
 #ifdef DEBUG_COND_BARRIERS
@@ -161,9 +163,9 @@ ImplicitConditionalBarriers::runOnFunction(Function &F) {
 
     }
 
-    if (isa<BarrierBlock>(pos)) continue;
-    // Inject a barrier at the beginning of the BB and let the CanonicalizeBarrier
-    // to clean it up (split to a separate BB).
+    if (Barrier::hasOnlyBarrier(pos)) continue;
+    // Inject a barrier at the beginning of the BB and let the
+    // CanonicalizeBarrier to clean it up (split to a separate BB).
 
     // mri-q of parboil breaks in case injected at the beginning
     // TODO: investigate. It might related to the alloca-converted
diff --git a/lib/llvmopencl/Kernel.cc b/lib/llvmopencl/Kernel.cc
index fe17fae..9602afe 100644
--- a/lib/llvmopencl/Kernel.cc
+++ b/lib/llvmopencl/Kernel.cc
@@ -45,25 +45,28 @@ static void add_predecessors(SmallVectorImpl<BasicBlock *> &v,
 static bool verify_no_barriers(const BasicBlock *B);
 
 void
-Kernel::getExitBlocks(SmallVectorImpl<BarrierBlock *> &B) 
+Kernel::getExitBlocks(SmallVectorImpl<llvm::BasicBlock *> &B)
 {
   for (iterator i = begin(), e = end(); i != e; ++i) {
     const TerminatorInst *t = i->getTerminator();
     if (t->getNumSuccessors() == 0) {
       // All exits must be barrier blocks.
-      B.push_back(cast<BarrierBlock>(i));
+      llvm::BasicBlock *BB = cast<BasicBlock>(i);
+      if (!Barrier::hasBarrier(BB))
+        Barrier::Create(BB->getTerminator());
+      B.push_back(BB);
     }
   }
 }
 
 ParallelRegion *
-Kernel::createParallelRegionBefore(BarrierBlock *B) 
+Kernel::createParallelRegionBefore(llvm::BasicBlock *B)
 {
   SmallVector<BasicBlock *, 4> pending_blocks;
   SmallPtrSet<BasicBlock *, 8> blocks_in_region;
-  BarrierBlock *region_entry_barrier = NULL;
-  llvm::BasicBlock *entry = NULL;
-  llvm::BasicBlock *exit = B->getSinglePredecessor();
+  BasicBlock *region_entry_barrier = NULL;
+  BasicBlock *entry = NULL;
+  BasicBlock *exit = B->getSinglePredecessor();
   add_predecessors(pending_blocks, B);
 
 #ifdef DEBUG_PR_CREATION
@@ -89,9 +92,9 @@ Kernel::createParallelRegionBefore(BarrierBlock *B)
     
     // If we reach another barrier this must be the
     // parallel region entry.
-    if (isa<BarrierBlock>(current)) {
+    if (Barrier::hasOnlyBarrier(current)) {
       if (region_entry_barrier == NULL)
-        region_entry_barrier = cast<BarrierBlock>(current);
+        region_entry_barrier = current;
 #ifdef DEBUG_PR_CREATION
       std::cerr << "### it's a barrier!" << std::endl;        
 #endif     
@@ -166,18 +169,18 @@ Kernel::getParallelRegions(llvm::LoopInfo *LI) {
   ParallelRegion::ParallelRegionVector *parallel_regions =
     new ParallelRegion::ParallelRegionVector;
 
-  SmallVector<BarrierBlock *, 4> exit_blocks;
+  SmallVector<BasicBlock *, 4> exit_blocks;
   getExitBlocks(exit_blocks);
 
   // We need to keep track of traversed barriers to detect back edges.
-  SmallPtrSet<BarrierBlock *, 8> found_barriers;
+  SmallPtrSet<BasicBlock *, 8> found_barriers;
 
   // First find all the ParallelRegions in the Function.
   while (!exit_blocks.empty()) {
     
     // We start on an exit block and process the parallel regions upwards
     // (finding an execution trace).
-    BarrierBlock *exit = exit_blocks.back();
+    BasicBlock *exit = exit_blocks.back();
     exit_blocks.pop_back();
 
     while (ParallelRegion *PR = createParallelRegionBefore(exit)) {
@@ -189,10 +192,10 @@ Kernel::getParallelRegions(llvm::LoopInfo *LI) {
       parallel_regions->push_back(PR);
       BasicBlock *entry = PR->entryBB();
       int found_predecessors = 0;
-      BarrierBlock *loop_barrier = NULL;
+      BasicBlock *loop_barrier = NULL;
       for (pred_iterator i = pred_begin(entry), e = pred_end(entry);
            i != e; ++i) {
-        BarrierBlock *barrier = cast<BarrierBlock> (*i);
+        BasicBlock *barrier = (*i);
         if (!found_barriers.count(barrier)) {
           /* If this is a loop header block we might have edges from two 
              unprocessed barriers. The one inside the loop (coming from a 
diff --git a/lib/llvmopencl/Kernel.h b/lib/llvmopencl/Kernel.h
index 9ff9dd9..fa8b05e 100644
--- a/lib/llvmopencl/Kernel.h
+++ b/lib/llvmopencl/Kernel.h
@@ -33,8 +33,8 @@ namespace pocl {
 
   class Kernel : public llvm::Function {
   public:
-    void getExitBlocks(llvm::SmallVectorImpl<BarrierBlock *> &B);
-    ParallelRegion *createParallelRegionBefore(BarrierBlock *B);
+    void getExitBlocks(llvm::SmallVectorImpl<llvm::BasicBlock *> &B);
+    ParallelRegion *createParallelRegionBefore(llvm::BasicBlock *B);
     
     ParallelRegion::ParallelRegionVector* 
       getParallelRegions(llvm::LoopInfo *LI);
diff --git a/lib/llvmopencl/LLVMFileUtils.cc b/lib/llvmopencl/LLVMFileUtils.cc
index 2c36318..ef850e0 100644
--- a/lib/llvmopencl/LLVMFileUtils.cc
+++ b/lib/llvmopencl/LLVMFileUtils.cc
@@ -160,6 +160,15 @@ int pocl_touch_file(const char* path) {
     return (close(fd) ? (-errno) : 0);
 
 }
+
+int pocl_rename(const char *oldpath, const char *newpath) {
+
+    Twine op(oldpath);
+    Twine np(newpath);
+    std::error_code ec = sys::fs::rename(op, np);
+    return ec.default_error_condition().value();
+}
+
 /****************************************************************************/
 
 int
@@ -197,14 +206,15 @@ pocl_read_file(const char* path, char** content, uint64_t *filesize) {
 }
 
 
-
-int pocl_write_file(const char *path, const char* content,
-                                    uint64_t    count,
-                                    int         append,
-                                    int         dont_rewrite) {
+/* Atomic write - with rename()
+ * NOTE: still requires a pocl lock before attempting it - because the tempfile
+ * name is not random */
+int pocl_write_file(const char *path, const char *content, uint64_t count,
+                    int append, int dont_rewrite) {
     int fd;
     std::error_code ec;
-    Twine p(path);
+    std::string TmpPath(path);
+    TmpPath.append(".tmp");
 
     assert(path);
     assert(content);
@@ -220,23 +230,34 @@ int pocl_write_file(const char *path, const char* content,
         }
     }
 
-    if (append)
+    if (append) {
+        Twine p(path);
         OPEN_FOR_APPEND;
-    else
+    } else {
+        Twine p(TmpPath);
         OPEN_CREATE;
+    }
 
     RETURN_IF_EC;
 
     if (write(fd, content, (ssize_t)count) < (ssize_t)count)
         return errno ? -errno : -1;
 
-    return (close(fd) ? (-errno) : 0);
-}
+    if (close(fd))
+      return -errno;
 
+    if (append)
+      return 0;
+    else
+      return pocl_rename(TmpPath.c_str(), path);
+}
 
 
 
 
+/* Atomic write of IR - with rename()
+ * NOTE: still requires a pocl lock before attempting it - because the tempfile
+ * name is not random */
 int pocl_write_module(void *module, const char* path, int dont_rewrite) {
 
     assert(module);
@@ -267,11 +288,7 @@ int pocl_write_module(void *module, const char* path, int dont_rewrite) {
     if (os.has_error())
       return 1;
 
-    std::string Command("mv ");
-    Command += TmpPath;
-    Command += " ";
-    Command += path;
-    return system(Command.c_str());
+    return pocl_rename(TmpPath.c_str(), path);
 }
 
 
diff --git a/lib/llvmopencl/LLVMUtils.cc b/lib/llvmopencl/LLVMUtils.cc
index a0a496b..4f3db05 100644
--- a/lib/llvmopencl/LLVMUtils.cc
+++ b/lib/llvmopencl/LLVMUtils.cc
@@ -24,6 +24,7 @@
 
 #include "pocl.h"
 
+#include <llvm/IR/Instructions.h>
 #include <llvm/IR/Module.h>
 #include <llvm/IR/Metadata.h>
 
@@ -88,5 +89,19 @@ regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels)
   }
 }
 
+void eraseFunctionAndCallers(llvm::Function *Function) {
+  if (!Function)
+    return;
+
+  std::vector<llvm::Value *> Callers(Function->user_begin(),
+                                     Function->user_end());
+  for (auto &U : Callers) {
+    llvm::CallInst *Call = llvm::dyn_cast<llvm::CallInst>(U);
+    if (!Call)
+      continue;
+    Call->eraseFromParent();
+  }
+  Function->eraseFromParent();
 }
 
+}
diff --git a/lib/llvmopencl/LLVMUtils.h b/lib/llvmopencl/LLVMUtils.h
index a94a68e..4b8da57 100644
--- a/lib/llvmopencl/LLVMUtils.h
+++ b/lib/llvmopencl/LLVMUtils.h
@@ -47,6 +47,9 @@ typedef std::map<llvm::Function*, llvm::Function*> FunctionMapping;
 void
 regenerate_kernel_metadata(llvm::Module &M, FunctionMapping &kernels);
 
+// Remove a function from a module, along with all callsites.
+void eraseFunctionAndCallers(llvm::Function *Function);
+
 inline bool
 isAutomaticLocal(const std::string &FuncName, llvm::GlobalVariable &Var) {
 #ifdef POCL_USE_FAKE_ADDR_SPACE_IDS
@@ -80,12 +83,12 @@ is_image_type(const llvm::Type& t)
 }
 
 inline bool
-is_sampler_type(const llvm::Type& t) 
+is_sampler_type(const llvm::Type& t)
 {
-  if (t.isPointerTy() && t.getPointerElementType()->isStructTy()) 
+  if (t.isPointerTy() && t.getPointerElementType()->isStructTy())
     {
       llvm::StringRef name = t.getPointerElementType()->getStructName();
-      if (name.startswith("opencl.sampler_t_")) return true;     
+      if (name.startswith("opencl.sampler_t")) return true;
     }
   return false;
 }
diff --git a/lib/llvmopencl/OptimizeWorkItemFuncCalls.cc b/lib/llvmopencl/OptimizeWorkItemFuncCalls.cc
new file mode 100644
index 0000000..5f613e4
--- /dev/null
+++ b/lib/llvmopencl/OptimizeWorkItemFuncCalls.cc
@@ -0,0 +1,159 @@
+// Header for CleanupWorkItemFuncCalls, an LLVM pass to optimize calls to work-item
+// functions like get_local_size().
+//
+// Copyright (c) 2017 Pekka Jääskeläinen / Tampere University of Technology
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+#include "config.h"
+
+#include <set>
+#include <iostream>
+
+#include <llvm/IR/Constants.h>
+#include <llvm/IR/Instructions.h>
+
+#include "OptimizeWorkItemFuncCalls.h"
+
+namespace pocl {
+
+using namespace llvm;
+
+namespace {
+  static
+  RegisterPass<pocl::OptimizeWorkItemFuncCalls>
+  X("optimize-wi-func-calls",
+    "Optimize work-item function calls.");
+}
+
+char OptimizeWorkItemFuncCalls::ID = 0;
+
+OptimizeWorkItemFuncCalls::OptimizeWorkItemFuncCalls() : FunctionPass(ID) {}
+
+bool
+OptimizeWorkItemFuncCalls::runOnFunction(Function &F) {
+
+  // Find calls to WI functions and unify them to a single call in the
+  // entry to avoid confusing LLVM later with the 'pseudo loads' and to
+  // reduce the inlining bloat.
+
+  typedef std::set<std::string> WIFuncNameVec;
+  const WIFuncNameVec WIFuncNames = {
+    "_Z13get_global_idj",
+    "_Z17get_global_offsetj",
+    "_Z15get_global_sizej",
+    "_Z12get_group_idj",
+    "_Z12get_local_idj",
+    "_Z14get_local_sizej",
+    "_Z14get_num_groupsj",
+    "_Z12get_work_dimv"
+  };
+
+  Function::iterator I = F.begin();
+  Instruction *FirstInsnPt = &*(I++)->getFirstInsertionPt();
+
+  bool Changed = false;
+
+  std::map<std::string, std::vector<CallInst*>> Calls;
+
+  // First collect all calls of interest.
+  for (Function::iterator E = F.end(); I != E; ++I) {
+    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
+
+      CallInst *Call = dyn_cast<CallInst>(BI++);
+
+      if (Call == nullptr) continue;
+
+      if (Call->getCalledFunction() == nullptr) {
+        // The callee can be null in case of asm snippets (TCE).
+        continue;
+      }
+      auto FuncNameI = WIFuncNames.find(Call->getCalledFunction()->getName().str());
+      if (FuncNameI == WIFuncNames.end())
+        continue;
+
+      bool Unsupported = false;
+      // Check that the argument list is something we can handle.
+      for (unsigned I = 0; I < Call->getNumArgOperands(); ++I) {
+        llvm::ConstantInt *CallOperand =
+          dyn_cast<llvm::ConstantInt>(Call->getArgOperand(I));
+        if (CallOperand == nullptr)
+          Unsupported = true;
+      }
+      if (Unsupported) continue;
+      Calls[*FuncNameI].push_back(Call);
+    }
+  }
+
+  // Add single calls for the interesting functions.
+  std::map<std::string, std::vector<CallInst*> > CallsInEntry;
+  for (const auto &Call : Calls) {
+    std::string FuncName = Call.first;
+    auto CallInsts = Call.second;
+
+    for (auto CallInst : CallInsts) {
+      // Try to find a previous call with the same parameters which
+      // we can reuse.
+      std::vector<llvm::CallInst*> &CallsMoved = CallsInEntry[FuncName];
+      llvm::CallInst *PreviousCall = nullptr;
+      for (auto &M : CallsMoved) {
+        llvm::CallInst *MovedCall = dyn_cast<llvm::CallInst>(M);
+
+        // WI functions do not have variable argument lists.
+        assert (MovedCall->getNumArgOperands() ==
+                CallInst->getNumArgOperands());
+
+        bool IsApplicable = true;
+        for (unsigned I = 0; I < MovedCall->getNumArgOperands(); ++I) {
+          llvm::ConstantInt *CallOperand =
+            dyn_cast<llvm::ConstantInt>(CallInst->getArgOperand(I));
+          llvm::ConstantInt *PrevCallOperand =
+            dyn_cast<llvm::ConstantInt>(MovedCall->getArgOperand(I));
+
+          assert (isa<llvm::ConstantInt>(PrevCallOperand));
+
+          if (CallOperand->getValue() != PrevCallOperand->getValue()) {
+            IsApplicable = false;
+            break;
+          }
+        }
+        if (IsApplicable) {
+          // Found a suitable previous call instruction we can reuse.
+          PreviousCall = MovedCall;
+          break;
+        }
+      }
+
+      if (PreviousCall == nullptr) {
+        CallInst->moveBefore(FirstInsnPt);
+        CallsInEntry[FuncName].push_back(CallInst);
+        Changed = true;
+      } else {
+        // Not the first call.  Refer to the first call that was moved to
+        // the entry.
+        CallInst->replaceAllUsesWith(PreviousCall);
+        CallInst->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+  return Changed;
+}
+
+}
diff --git a/lib/llvmopencl/BarrierBlock.h b/lib/llvmopencl/OptimizeWorkItemFuncCalls.h
similarity index 66%
rename from lib/llvmopencl/BarrierBlock.h
rename to lib/llvmopencl/OptimizeWorkItemFuncCalls.h
index 1e9864a..bd4b4e2 100644
--- a/lib/llvmopencl/BarrierBlock.h
+++ b/lib/llvmopencl/OptimizeWorkItemFuncCalls.h
@@ -1,17 +1,17 @@
-// Class for a basic block that just contains a barrier.
-// 
-// Copyright (c) 2011 Universidad Rey Juan Carlos
-// 
+// Header for OptimizeWorkItemFuncCalls.
+//
+// Copyright (c) 2017 Pekka Jääskeläinen / Tampere University of Technology
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
-// 
+//
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
-// 
+//
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,22 +20,26 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include "config.h"
+#ifndef _POCL_OPTIMIZE_WI_FUNC_CALLS_H
+#define _POCL_OPTIMIZE_WI_FUNC_CALLS_H
 
-#include "llvm/IR/BasicBlock.h"
+#include "config.h"
 
-#ifndef _POCL_BARRIER_BLOCK_H
-#define _POCL_BARRIER_BLOCK_H
+#include <llvm/IR/Function.h>
+#include <llvm/Pass.h>
+#include <llvm/Support/CommandLine.h>
 
 namespace pocl {
+  class OptimizeWorkItemFuncCalls : public llvm::FunctionPass {
+  public:
+    static char ID;
 
-  class BarrierBlock : public llvm::BasicBlock {
+    OptimizeWorkItemFuncCalls();
+    virtual ~OptimizeWorkItemFuncCalls() {};
 
-  public:
-    static bool classof(const BarrierBlock *) { return true; };
-    static bool classof(const llvm::BasicBlock *B);
+    void getAnalysisUsage(llvm::AnalysisUsage &AU) const {}
+    virtual bool runOnFunction(llvm::Function &F);
   };
-
 }
 
 #endif
diff --git a/lib/llvmopencl/ParallelRegion.cc b/lib/llvmopencl/ParallelRegion.cc
index ff4c500..c883a56 100644
--- a/lib/llvmopencl/ParallelRegion.cc
+++ b/lib/llvmopencl/ParallelRegion.cc
@@ -197,17 +197,18 @@ ParallelRegion::chainAfter(ParallelRegion *region)
       tail = region->at(region->size() - 2);
       t = tail->getTerminator();
     }
-  if (t->getNumSuccessors() != 1)
-    {
+#ifdef LLVM_BUILD_MODE_DEBUG
+    if (t->getNumSuccessors() != 1) {
       std::cout << "!!! trying to chain region" << std::endl;
       this->dumpNames();
       std::cout << "!!! after region" << std::endl;
       region->dumpNames();
       t->getParent()->dump();
-      
+
       assert (t->getNumSuccessors() == 1);
     }
-  
+#endif
+
   BasicBlock *successor = t->getSuccessor(0);
   Function::BasicBlockListType &bb_list = 
     successor->getParent()->getBasicBlockList();
@@ -342,8 +343,10 @@ ParallelRegion::insertPrologue(unsigned x,
 void
 ParallelRegion::dump()
 {
+#ifdef LLVM_BUILD_MODE_DEBUG
   for (iterator i = begin(), e = end(); i != e; ++i)
     (*i)->dump();
+#endif
 }
 
 void
@@ -439,12 +442,14 @@ ParallelRegion::Verify()
       ParallelRegion::ParallelRegionVector regions;
       regions.push_back(this);
 
+#ifdef LLVM_BUILD_MODE_DEBUG
       std::set<llvm::BasicBlock*> highlights;
       highlights.insert((*i));
       highlights.insert(exitBB());
       exitBB()->dump();
       dumpNames();
       dumpCFG(*(*i)->getParent(), "broken.dot", &regions, &highlights);
+#endif
 
       assert(0 && "Multiple outgoing edges from exit block!");
       return false;
@@ -670,7 +675,11 @@ ParallelRegion::InjectPrintF
        /*Name=*/"printf", M); 
     printfFunc->setCallingConv(CallingConv::C);
 
+#if LLVM_OLDER_THAN_5_0
     AttributeSet func_printf_PAL;
+#else
+    AttributeList func_printf_PAL;
+#endif
     {
       func_printf_PAL.addAttribute( M->getContext(), 1U, Attribute::NoCapture);
       func_printf_PAL.addAttribute( M->getContext(), 4294967295U, Attribute::NoUnwind);
diff --git a/lib/llvmopencl/ParallelRegion.h b/lib/llvmopencl/ParallelRegion.h
index 212346d..f9bdae4 100644
--- a/lib/llvmopencl/ParallelRegion.h
+++ b/lib/llvmopencl/ParallelRegion.h
@@ -36,8 +36,6 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 
-#include "BarrierBlock.h"
-
 namespace pocl {
 
 #define POCL_LOCAL_ID_X_GLOBAL "_local_id_x"
diff --git a/lib/llvmopencl/AllocasToEntry.cc b/lib/llvmopencl/RemoveOptnoneFromWIFunc.cc
similarity index 55%
copy from lib/llvmopencl/AllocasToEntry.cc
copy to lib/llvmopencl/RemoveOptnoneFromWIFunc.cc
index 0b4193b..892078f 100644
--- a/lib/llvmopencl/AllocasToEntry.cc
+++ b/lib/llvmopencl/RemoveOptnoneFromWIFunc.cc
@@ -1,18 +1,17 @@
-// Header for AllocasToEntry, an LLVM pass to move allocas to the function 
-// entry node.
-// 
-// Copyright (c) 2013 Pekka Jääskeläinen / TUT
-// 
+// Removes optnone keyword from get_global_id().
+//
+// Copyright (c) 2017 Michal Babej / TUT
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
-// 
+//
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
-// 
+//
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -21,52 +20,44 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include <sstream>
-#include <iostream>
-
 #include "config.h"
 
 #include <llvm/IR/Constants.h>
 #include <llvm/IR/Instructions.h>
 
-#include "AllocasToEntry.h"
+#include "RemoveOptnoneFromWIFunc.h"
+
+#include <iostream>
 
 namespace pocl {
 
 using namespace llvm;
 
 namespace {
-  static
-  RegisterPass<pocl::AllocasToEntry> X("allocastoentry", 
-                                       "Move allocas to the function entry node.");
+static RegisterPass<pocl::RemoveOptnoneFromWIFunc>
+    X("remove-optnone", "Remove optnone keyword from workitem functions.");
 }
 
-char AllocasToEntry::ID = 0;
+char RemoveOptnoneFromWIFunc::ID = 0;
 
+RemoveOptnoneFromWIFunc::RemoveOptnoneFromWIFunc() : FunctionPass(ID) {}
 
-AllocasToEntry::AllocasToEntry() : FunctionPass(ID)
-{
-}
-
-bool
-AllocasToEntry::runOnFunction(Function &F)
-{
-  // This solves problem with dynamic stack objects that are 
-  // not supported by some targets (TCE).
-  Function::iterator I = F.begin();
-  Instruction *firstInsertionPt = &*(I++)->getFirstInsertionPt();
-    
+bool RemoveOptnoneFromWIFunc::runOnFunction(Function &F) {
+  /* Adding "optnone" to get_global_id() solves the problem
+   * that some pass in opt introduces switch tables which the
+   * variable uniformity analysis cannot analyze.
+   *
+   * However having optnone prevents some later optimizations
+   * and creates problems in certain workitem tests.
+   */
+  const char *name = "_Z13get_global_idj";
+  StringRef nameref(name);
   bool changed = false;
-  for (Function::iterator E = F.end(); I != E; ++I) {
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      AllocaInst *allocaInst = dyn_cast<AllocaInst>(BI++);
-      if (allocaInst && isa<ConstantInt>(allocaInst->getArraySize())) {
-        allocaInst->moveBefore(firstInsertionPt);
-        changed = true;
-      }
-    }
+
+  if (F.getName().equals(nameref)) {
+    F.removeFnAttr(Attribute::AttrKind::OptimizeNone);
+    changed = true;
   }
   return changed;
 }
-
 }
diff --git a/lib/llvmopencl/BarrierBlock.cc b/lib/llvmopencl/RemoveOptnoneFromWIFunc.h
similarity index 63%
rename from lib/llvmopencl/BarrierBlock.cc
rename to lib/llvmopencl/RemoveOptnoneFromWIFunc.h
index 4281515..60a650c 100644
--- a/lib/llvmopencl/BarrierBlock.cc
+++ b/lib/llvmopencl/RemoveOptnoneFromWIFunc.h
@@ -1,17 +1,17 @@
-// Class for a basic block that just contains a barrier.
-// 
-// Copyright (c) 2011 Universidad Rey Juan Carlos
-// 
+// Header for RemoveOptnoneFromWIFunc, an LLVM pass to remove optnone keyword
+//
+// Copyright (c) 2017 Michal Babej / TUT
+//
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
-// 
+//
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
-// 
+//
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -20,35 +20,27 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.
 
-#include <cassert>
+#ifndef _POCL_REMOVE_OPTNONE_H
+#define _POCL_REMOVE_OPTNONE_H
 
 #include "config.h"
 
-#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
 
-#include "BarrierBlock.h"
-#include "Barrier.h"
+namespace pocl {
+class RemoveOptnoneFromWIFunc : public llvm::FunctionPass {
+public:
+  static char ID;
 
-using namespace llvm;
-using namespace pocl;
+  RemoveOptnoneFromWIFunc();
+  virtual ~RemoveOptnoneFromWIFunc(){};
 
-#ifndef NDEBUG
-static bool verify(const BasicBlock *B) {
-  assert((B->size() == 2) && "Barriers blocks should have no functionality!");
-  assert(isa<Barrier>(B->front()));
+  void getAnalysisUsage(llvm::AnalysisUsage &AU) const { AU.setPreservesAll(); }
 
-  return true;
-}
-#endif
-
-bool BarrierBlock::classof(const BasicBlock *B) {
-
-  if ((B->size() == 2) &&
-      isa<Barrier> (&B->front())) {
-    assert(verify(B));
-    return true;
-  }
-
-  return false;
+  virtual bool runOnFunction(llvm::Function &F);
+};
 }
 
+#endif
diff --git a/lib/llvmopencl/TargetAddressSpaces.cc b/lib/llvmopencl/TargetAddressSpaces.cc
index 025d16c..ce1b535 100644
--- a/lib/llvmopencl/TargetAddressSpaces.cc
+++ b/lib/llvmopencl/TargetAddressSpaces.cc
@@ -89,21 +89,19 @@ ConvertedType(llvm::Type *type, std::map<unsigned, unsigned> &addrSpaceMap,
       return convertedStructsCache[type];
 
     llvm::StructType* OrigType = dyn_cast<llvm::StructType>(type);
-    llvm::StructType* NewType;
-    if (!OrigType->isLiteral()) {
-      std::string s = OrigType->getName().str();
-      s += "_tas_struct";
-      NewType = StructType::create(OrigType->getContext(), s);
-    }
     std::vector<llvm::Type*> newtypes;
     for (llvm::StructType::element_iterator i = OrigType->element_begin(),
          e = OrigType->element_end(); i < e; ++i) {
       newtypes.push_back(ConvertedType(*i, addrSpaceMap, convertedStructsCache));
     }
     ArrayRef<Type*> a(newtypes);
+    llvm::StructType* NewType;
     if (OrigType->isLiteral()) {
       NewType = StructType::get(OrigType->getContext(), a, OrigType->isPacked());
     } else {
+      std::string s = OrigType->getName().str();
+      s += "_tas_struct";
+      NewType = StructType::create(OrigType->getContext(), s);
       NewType->setBody(a, OrigType->isPacked());
     }
     convertedStructsCache[type] = NewType;
@@ -526,6 +524,8 @@ TargetAddressSpaces::runOnModule(llvm::Module &M) {
     return false;
 #endif
 
+  assert(!arch.startswith("nvptx"));
+
   std::map<unsigned, unsigned> addrSpaceMapUp;
 
   addrSpaceMapUp[POCL_FAKE_AS_GLOBAL] = POCL_AS_FAKE_GLOBAL;
diff --git a/lib/llvmopencl/VariableUniformityAnalysis.cc b/lib/llvmopencl/VariableUniformityAnalysis.cc
index 8284ba7..2c7c340 100644
--- a/lib/llvmopencl/VariableUniformityAnalysis.cc
+++ b/lib/llvmopencl/VariableUniformityAnalysis.cc
@@ -41,10 +41,15 @@ IGNORE_COMPILER_WARNING("-Wunused-parameter")
 #include "Kernel.h"
 #include "VariableUniformityAnalysis.h"
 #include "Barrier.h"
+#include "Workgroup.h"
 
 POP_COMPILER_DIAGS
 
-//#define DEBUG_UNIFORMITY_ANALYSIS
+// #define DEBUG_UNIFORMITY_ANALYSIS
+
+#ifdef DEBUG_UNIFORMITY_ANALYSIS
+#include "DebugHelpers.h"
+#endif
 
 namespace pocl {
 
@@ -111,8 +116,13 @@ VariableUniformityAnalysis::markInductionVariables(Function &F, llvm::Loop &L) {
 bool
 VariableUniformityAnalysis::runOnFunction(Function &F) {
 
+  if (!Workgroup::isKernelToProcess(F))
+    return false;
+
 #ifdef DEBUG_UNIFORMITY_ANALYSIS
   std::cerr << "### refreshing VUA" << std::endl;
+  dumpCFG(F, F.getName().str() + ".vua.dot");
+  F.dump();
 #endif
 
   /* Do the actual analysis on-demand except for the basic block
@@ -184,45 +194,57 @@ VariableUniformityAnalysis::shouldBePrivatized
  * b) BBs that post-dominate at least one uniform BB (try the previously 
  *    found one), or
  * c) BBs that are branched to directly from a uniform BB using a uniform branch.
+ *    Note: This assumes the CFG is well-formed in a way that there cannot be a divergent
+ *    branch to the same BB in that case.
  *
  * Otherwise, assume divergent (might not be *proven* to be one!).
- * 
+ *
  */
 void
 VariableUniformityAnalysis::analyzeBBDivergence
 (llvm::Function *f, llvm::BasicBlock *bb, llvm::BasicBlock *previousUniformBB) {
 
 #ifdef DEBUG_UNIFORMITY_ANALYSIS
-  std::cerr << "### Analyzing BB divergence (bb=" << bb->getName().str() 
-            << ", prevUniform=" << previousUniformBB->getName().str() << ")" 
+  std::cerr << "### Analyzing BB divergence (bb=" << bb->getName().str()
+            << ", prevUniform=" << previousUniformBB->getName().str() << ")"
             << std::endl;
 #endif
- 
-
-  llvm::BasicBlock *newPreviousUniformBB = previousUniformBB;
 
-  llvm::BranchInst *br = 
-    dyn_cast<llvm::BranchInst>(previousUniformBB->getTerminator());  
-
-  if (br == NULL) {
+  llvm::TerminatorInst *Term = previousUniformBB->getTerminator();
+  if (Term == NULL) {
     // this is most likely a function with a single basic block, the entry node, which
     // ends with a ret
     return;
   }
 
+  llvm::BranchInst *BrInst = dyn_cast<llvm::BranchInst>(Term);
+  llvm::SwitchInst *SwInst = dyn_cast<llvm::SwitchInst>(Term);
+
+  if (BrInst == nullptr && SwInst == nullptr) {
+    // Can only handle branches and switches for now.
+    return;
+  }
+
+  // The BBs that were found uniform.
+  std::vector<llvm::BasicBlock *> FoundUniforms;
+
   // Condition c)
-  if ((!br->isConditional() || isUniform(f, br->getCondition()))) {
-    for (unsigned suc = 0, end = br->getNumSuccessors(); suc < end; ++suc) {
-      if (br->getSuccessor(suc) == bb) {
-        setUniform(f, bb, true);
-        newPreviousUniformBB = bb;
-        break;
-      }
+  if ((BrInst && (!BrInst->isConditional() ||
+                  isUniform(f, BrInst->getCondition()))) ||
+      (SwInst && isUniform(f, SwInst->getCondition()))) {
+    // This is a branch with a uniform condition, propagate the uniformity
+    // to the BB of interest.
+    for (unsigned suc = 0, end = Term->getNumSuccessors(); suc < end; ++suc) {
+      llvm::BasicBlock *Successor = Term->getSuccessor(suc);
+      // TODO: should we check that there are no divergent entries to this
+      // BB even though if the currently checked condition is uniform?
+      setUniform(f, Successor, true);
+      FoundUniforms.push_back(Successor);
     }
-  } 
+  }
 
   // Condition b)
-  if (newPreviousUniformBB != bb) {
+  if (FoundUniforms.size() == 0) {
 #ifdef LLVM_OLDER_THAN_3_9
     llvm::PostDominatorTree *PDT = &getAnalysis<PostDominatorTree>();
     if (PDT->dominates(bb, previousUniformBB)) {
@@ -231,23 +253,24 @@ VariableUniformityAnalysis::analyzeBBDivergence
     if (PDT->getPostDomTree().dominates(bb, previousUniformBB)) {
 #endif
       setUniform(f, bb, true);
-      newPreviousUniformBB = bb;
+      FoundUniforms.push_back(bb);
     }
-  } 
+  }
 
   /* Assume diverging. */
   if (!isUniformityAnalyzed(f, bb))
     setUniform(f, bb, false);
 
-  llvm::BranchInst *nextbr = dyn_cast<llvm::BranchInst>(bb->getTerminator());  
+  for (auto UniformBB : FoundUniforms) {
 
-  if (nextbr == NULL) return; /* ret */
+    // Propagate the Uniform BB data downwards.
+    llvm::TerminatorInst *NextTerm = UniformBB->getTerminator();
 
-  /* Propagate the data downward. */
-  for (unsigned suc = 0, end = nextbr->getNumSuccessors(); suc < end; ++suc) {
-    llvm::BasicBlock *nextbb = nextbr->getSuccessor(suc);
-    if (!isUniformityAnalyzed(f, nextbb)) {
-      analyzeBBDivergence(f, nextbb, newPreviousUniformBB);
+    for (unsigned suc = 0, end = NextTerm->getNumSuccessors(); suc < end; ++suc) {
+      llvm::BasicBlock *NextBB = NextTerm->getSuccessor(suc);
+      if (!isUniformityAnalyzed(f, NextBB)) {
+        analyzeBBDivergence(f, NextBB, UniformBB);
+      }
     }
   }
 }
diff --git a/lib/llvmopencl/WorkItemAliasAnalysis.cc b/lib/llvmopencl/WorkItemAliasAnalysis.cc
index 4549466..6b2a13d 100644
--- a/lib/llvmopencl/WorkItemAliasAnalysis.cc
+++ b/lib/llvmopencl/WorkItemAliasAnalysis.cc
@@ -199,19 +199,6 @@ RegisterAnalysisGroup<AliasAnalysis> Y(X);
 RegisterAnalysisGroup<WorkItemAAResult> Y(X);
 #endif
 
-/*
-FunctionPass *createWorkItemAliasAnalysisPass() {
-    return new WorkItemAliasAnalysis();
-}
-*/
-
-extern "C" {                                
-    FunctionPass*
-    create_workitem_aa_plugin() {
-        return new WorkItemAliasAnalysis();
-    }
-}
-
 void
 WorkItemAliasAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
     AU.setPreservesAll();
diff --git a/lib/llvmopencl/Workgroup.cc b/lib/llvmopencl/Workgroup.cc
index ebd5bac..28fddad 100644
--- a/lib/llvmopencl/Workgroup.cc
+++ b/lib/llvmopencl/Workgroup.cc
@@ -97,6 +97,7 @@ namespace llvm {
     static StructType *get(LLVMContext &Context) {
       if (size_t_width == 64)
         {
+#ifdef LLVM_OLDER_THAN_5_0
           return StructType::get
             (TypeBuilder<types::i<32>, xcompile>::get(Context),
              TypeBuilder<types::i<64>[3], xcompile>::get(Context),
@@ -104,9 +105,24 @@ namespace llvm {
              TypeBuilder<types::i<64>[3], xcompile>::get(Context),
              TypeBuilder<types::i<64>[3], xcompile>::get(Context),
              NULL);
+#else
+          SmallVector<Type*, 8> Elements;
+          Elements.push_back(
+            TypeBuilder<types::i<32>, xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<64>[3], xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<64>[3], xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<64>[3], xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<64>[3], xcompile>::get(Context));
+          return StructType::get(Context, Elements);
+#endif
         }
       else if (size_t_width == 32)
         {
+#ifdef LLVM_OLDER_THAN_5_0
           return StructType::get
             (TypeBuilder<types::i<32>, xcompile>::get(Context),
              TypeBuilder<types::i<32>[3], xcompile>::get(Context),
@@ -114,6 +130,20 @@ namespace llvm {
              TypeBuilder<types::i<32>[3], xcompile>::get(Context),
              TypeBuilder<types::i<32>[3], xcompile>::get(Context),
              NULL);
+#else
+          SmallVector<Type*, 8> Elements;
+          Elements.push_back(
+            TypeBuilder<types::i<32>, xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<32>[3], xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<32>[3], xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<32>[3], xcompile>::get(Context));
+          Elements.push_back(
+            TypeBuilder<types::i<32>[3], xcompile>::get(Context));
+          return StructType::get(Context, Elements);
+#endif
         }
       else
         {
@@ -184,6 +214,7 @@ Workgroup::runOnModule(Module &M)
     Function *L = createLauncher(M, &*i);
 
     L->addFnAttr(Attribute::NoInline);
+    L->removeFnAttr(Attribute::AlwaysInline);
 
     privatizeContext(M, L);
 
@@ -208,10 +239,15 @@ Workgroup::runOnModule(Module &M)
       }
   }
 
+#if LLVM_OLDER_THAN_5_0
   Function *barrier = cast<Function>
     (M.getOrInsertFunction(BARRIER_FUNCTION_NAME,
-                           Type::getVoidTy(M.getContext()),
-                           NULL));
+                           Type::getVoidTy(M.getContext()), NULL));
+#else
+  Function *barrier = cast<Function>
+    (M.getOrInsertFunction(BARRIER_FUNCTION_NAME,
+                           Type::getVoidTy(M.getContext())));
+#endif
   BasicBlock *bb = BasicBlock::Create(M.getContext(), "", barrier);
   ReturnInst::Create(M.getContext(), 0, bb);
 
@@ -293,7 +329,7 @@ createLauncher(Module &M, Function *F) {
 
   SmallVector<Value *, 8> arguments;
   Function::arg_iterator ai = L->arg_begin();
-  for (unsigned i = 0, e = F->getArgumentList().size(); i != e; ++i) {
+  for (unsigned i = 0, e = F->arg_size(); i != e; ++i) {
     arguments.push_back(&*ai);
     ++ai;
   }
@@ -345,7 +381,7 @@ createLauncher(Module &M, Function *F) {
   CallInst *c = builder.CreateCall(F, ArrayRef<Value*>(arguments));
   builder.CreateRetVoid();
 
-#ifdef LLVM_4_0
+#ifndef LLVM_OLDER_THAN_4_0
   // At least with LLVM 4.0, the runtime of AddAliasScopeMetadata of
   // llvm::InlineFunction explodes in case of kernels with restrict
   // metadata and a lot of lifetime markers. The issue produces at
@@ -361,10 +397,10 @@ createLauncher(Module &M, Function *F) {
       if (!llvm::isa<CallInst>(Instr)) continue;
       CallInst *CallInstr = dyn_cast<CallInst>(Instr);
       if (CallInstr->getCalledFunction() != nullptr &&
-          (CallInstr->getCalledFunction()->getName() ==
-           "llvm.lifetime.end" ||
-           CallInstr->getCalledFunction()->getName() ==
-           "llvm.lifetime.start")) {
+          (CallInstr->getCalledFunction()->getName().
+	   startswith("llvm.lifetime.end") ||
+           CallInstr->getCalledFunction()->getName().
+	   startswith("llvm.lifetime.start"))) {
         Calls.insert(CallInstr);
       }
     }
diff --git a/lib/llvmopencl/WorkitemLoops.cc b/lib/llvmopencl/WorkitemLoops.cc
index fb52d7b..ca0685e 100644
--- a/lib/llvmopencl/WorkitemLoops.cc
+++ b/lib/llvmopencl/WorkitemLoops.cc
@@ -143,6 +143,8 @@ WorkitemLoops::runOnFunction(Function &F)
   contextArrays.clear();
   tempInstructionIds.clear();
 
+  releaseParallelRegions();
+
   return changed;
 }
 
@@ -251,19 +253,26 @@ WorkitemLoops::CreateLoopAround
 
   IRBuilder<> builder(forInitBB);
 
-  if (peeledFirst)
-    {
-      builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar);
-      builder.CreateStore
-        (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar);
-    }
-  else
-    {
-      builder.CreateStore
-        (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar);
+  if (peeledFirst) {
+    builder.CreateStore(builder.CreateLoad(localIdXFirstVar), localIdVar);
+    builder.CreateStore
+      (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdXFirstVar);
+
+    if (WGDynamicLocalSize) {
+      llvm::Value *cmpResult;
+      cmpResult = builder.CreateICmpULT(builder.CreateLoad(localIdVar),
+                                        builder.CreateLoad(DynamicLocalSize));
+
+      builder.CreateCondBr(cmpResult, loopBodyEntryBB, loopEndBB);
+    } else {
+      builder.CreateBr(loopBodyEntryBB);
     }
+  } else {
+    builder.CreateStore
+      (ConstantInt::get(IntegerType::get(C, size_t_width), 0), localIdVar);
 
-  builder.CreateBr(loopBodyEntryBB);
+    builder.CreateBr(loopBodyEntryBB);
+  }
 
   exitBB->getTerminator()->replaceUsesOfWith(oldExit, forCondBB);
   if (addIncBlock)
@@ -334,6 +343,20 @@ WorkitemLoops::RegionOfBlock(llvm::BasicBlock *bb)
   return NULL;
 }
 
+void WorkitemLoops::releaseParallelRegions() {
+  if (original_parallel_regions) {
+    for (auto i = original_parallel_regions->begin(),
+              e = original_parallel_regions->end();
+              i != e; ++i) {
+      ParallelRegion *p = *i;
+      delete p;
+    }
+
+    delete original_parallel_regions;
+    original_parallel_regions = nullptr;
+  }
+}
+
 bool
 WorkitemLoops::ProcessFunction(Function &F)
 {
@@ -351,6 +374,8 @@ WorkitemLoops::ProcessFunction(Function &F)
       return true;
     }
 
+  releaseParallelRegions();
+
 #ifdef LLVM_OLDER_THAN_3_7
   original_parallel_regions = K->getParallelRegions(LI);
 #else
@@ -916,13 +941,50 @@ WorkitemLoops::GetContextArray(llvm::Instruction *instruction)
     }
   else
     {
-      /* 3D context array. */
-      llvm::Type *contextArrayType =
-        ArrayType::get(
-          ArrayType::get(
-            ArrayType::get(
-                           elementType, WGLocalSizeX),
-            WGLocalSizeY), WGLocalSizeZ);
+      /* 3D context array. In case the elementType itself is an array or struct,
+       * we must take into account it could be alloca-ed with alignment and loads
+       * or stores might use vectorized instructions expecting proper alignment.
+       * Because of that, we cannot simply allocate x*y*z*(size), we must
+       * enlarge the array type to fit the alignment. */
+      Type *allocType = elementType;
+      AllocaInst *allocaInst = dyn_cast<AllocaInst>(instruction);
+      if (allocaInst) {
+        unsigned alignment = allocaInst->getAlignment();
+
+        const DataLayout &dataLayout = M->getDataLayout();
+        uint64_t storeSize =
+          dataLayout.getTypeStoreSize(allocaInst->getAllocatedType());
+
+        if ((alignment > 1) && (storeSize & (alignment - 1))) {
+          uint64_t alignedSize = (storeSize & (~(alignment - 1))) + alignment;
+#ifdef DEBUG_WORK_ITEM_LOOPS
+        std::cerr << "### unaligned type found: aligning " << storeSize
+                  << " to " << alignedSize << "\n";
+#endif
+          if (isa<ArrayType>(elementType)) {
+            allocType =
+              ArrayType::get(elementType->getArrayElementType(), alignedSize);
+          } else if (isa<StructType>(elementType)) {
+            StructType *old_struct = dyn_cast<StructType>(elementType);
+
+            unsigned required_bytes = alignedSize - storeSize;
+            ArrayType *structPadding = ArrayType::get(
+                Type::getInt8Ty(M->getContext()), required_bytes);
+            std::vector<Type *> ary;
+            for (unsigned j = 0; j < old_struct->getNumElements(); j++)
+              ary.push_back(old_struct->getElementType(j));
+            ary.push_back(structPadding);
+            const ArrayRef<Type *> new_el(ary);
+            allocType = StructType::get(old_struct->getContext(), new_el,
+                                        old_struct->isPacked());
+            unsigned newStoreSize = dataLayout.getTypeStoreSize(allocType);
+            assert(newStoreSize == alignedSize);
+          }
+        }
+      }
+      llvm::Type *contextArrayType = ArrayType::get(
+          ArrayType::get(ArrayType::get(allocType, WGLocalSizeX), WGLocalSizeY),
+          WGLocalSizeZ);
 
       /* Allocate the context data array for the variable. */
       Alloca = builder.CreateAlloca(contextArrayType, 0, varName);
diff --git a/lib/llvmopencl/WorkitemLoops.h b/lib/llvmopencl/WorkitemLoops.h
index f781dad..e51b784 100644
--- a/lib/llvmopencl/WorkitemLoops.h
+++ b/lib/llvmopencl/WorkitemLoops.h
@@ -52,7 +52,8 @@ namespace pocl {
   public:
     static char ID;
 
-  WorkitemLoops() : pocl::WorkitemHandler(ID) {}
+  WorkitemLoops() : pocl::WorkitemHandler(ID),
+                    original_parallel_regions(nullptr) {}
 
     virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const;
     virtual bool runOnFunction(llvm::Function &F);
@@ -87,6 +88,7 @@ namespace pocl {
 
     void FixMultiRegionVariables(ParallelRegion *region);
     void AddContextSaveRestore(llvm::Instruction *instruction);
+    void releaseParallelRegions();
 
     llvm::Value *GetLinearWiIndex(llvm::IRBuilder<> &builder, llvm::Module *M,
                                   ParallelRegion *region);
diff --git a/lib/llvmopencl/WorkitemReplication.cc b/lib/llvmopencl/WorkitemReplication.cc
index 6197e25..d91cd43 100644
--- a/lib/llvmopencl/WorkitemReplication.cc
+++ b/lib/llvmopencl/WorkitemReplication.cc
@@ -322,6 +322,16 @@ WorkitemReplication::ProcessFunction(Function &F)
 
   //pocl::dumpCFG(F, F.getName().str() + ".after_repl.dot", original_parallel_regions);
 
+  if (original_parallel_regions) {
+    for (auto i = original_parallel_regions->begin(),
+              e = original_parallel_regions->end();
+              i != e; ++i) {
+      ParallelRegion *p = *i;
+      delete p;
+    }
+  }
+  delete original_parallel_regions;
+  original_parallel_regions = nullptr;
 //  F.viewCFG();
 
   return true;
diff --git a/lib/llvmopencl/linker.cpp b/lib/llvmopencl/linker.cpp
index 2b272e2..689b976 100644
--- a/lib/llvmopencl/linker.cpp
+++ b/lib/llvmopencl/linker.cpp
@@ -263,7 +263,7 @@ CopyFunc(const llvm::StringRef Name,
  * that are defined in 'from', into 'to', adding the mappings to
  * 'vvm'.
  */
-static void
+static int
 copy_func_callgraph(const llvm::StringRef func_name,
                     const llvm::Module *  from,
                     llvm::Module *        to,
@@ -271,7 +271,7 @@ copy_func_callgraph(const llvm::StringRef func_name,
     std::list<llvm::StringRef> callees;
     llvm::Function *RootFunc = from->getFunction(func_name);
     if (RootFunc == NULL)
-      return;
+      return -1;
     DB_PRINT("copying function %s with callgraph\n", RootFunc.data());
 
     find_called_functions(RootFunc, callees);
@@ -291,6 +291,7 @@ copy_func_callgraph(const llvm::StringRef func_name,
       CopyFunc(*ci, from, to, vvm);
     }
     CopyFunc(func_name, from, to, vvm);
+    return 0;
 }
 
 static inline bool
@@ -304,10 +305,8 @@ stringref_cmp(llvm::StringRef a, llvm::StringRef b)
     return a.str() < b.str();
 }
 
-void
-link(llvm::Module *krn, const llvm::Module *lib)
-{
-  assert(krn);
+int link(llvm::Module *program, const llvm::Module *lib, std::string &log) {
+  assert(program);
   assert(lib);
   ValueToValueMapTy vvm;
   std::list<llvm::StringRef> declared;
@@ -315,55 +314,77 @@ link(llvm::Module *krn, const llvm::Module *lib)
   llvm::Module::iterator fi, fe;
 
   // Find and fix opencl.imageX_t arguments
-  for (fi = krn->begin(), fe = krn->end(); fi != fe; fi++) {
+  for (fi = program->begin(), fe = program->end(); fi != fe; fi++) {
     llvm::Function *f = &*fi;
     if (f->isDeclaration())
       continue;
     // need to restart iteration if we replace a function
-    if (CloneFuncFixOpenCLImageT(krn, f) != f) {
-      fi = krn->begin();
+    if (CloneFuncFixOpenCLImageT(program, f) != f) {
+      fi = program->begin();
     }
   }
 
-  // Inspect the kernel, find undefined functions
-  for (fi = krn->begin(), fe = krn->end();  fi != fe; fi++) {
+  // Inspect the program, find undefined functions
+  for (fi = program->begin(), fe = program->end(); fi != fe; fi++) {
     if ((*fi).isDeclaration()) {
       DB_PRINT("%s is not defined\n", fi->getName().data());
       declared.push_back(fi->getName());
       continue;
     }
 
-    // Find all functions the kernel source calls
+    // Find all functions the program source calls
     // TODO: is there no direct way?
     find_called_functions(&*fi, declared);
   }
   declared.sort(stringref_cmp);
   declared.unique(stringref_equal);
 
-  // Copy all the globals from lib to krn.
+  // Copy all the globals from lib to program.
   // It probably is faster to just copy them all, than to inspect
-  // both krn and lib to find which actually are used.
+  // both program and lib to find which actually are used.
   DB_PRINT("cloning the global variables:\n");
   llvm::Module::const_global_iterator gi,ge;
   for (gi=lib->global_begin(), ge=lib->global_end(); gi != ge; gi++) {
     DB_PRINT(" %s\n", gi->getName().data());
     GlobalVariable *GV = new GlobalVariable(
-      *krn, gi->getType()->getElementType(), gi->isConstant(),
+      *program, gi->getType()->getElementType(), gi->isConstant(),
       gi->getLinkage(), (Constant*)0, gi->getName(), (GlobalVariable*)0,
       gi->getThreadLocalMode(), gi->getType()->getAddressSpace());
     GV->copyAttributesFrom(&*gi);
     vvm[&*gi]=GV;
   }
 
-  // For each undefined function in krn, clone it from the lib to the krn module,
+  // For each undefined function in program,
+  // clone it from the lib to the program module,
   // if found in lib
+  bool found_all_undefined = true;
+
+  // this one is a handled with a special pocl LLVM pass
+  StringRef pocl_sampler_handler("__translate_sampler_initializer");
+  // ignore undefined llvm intrinsics
+  StringRef llvm_intrins("llvm.");
   std::list<llvm::StringRef>::iterator di,de;
   for (di = declared.begin(), de = declared.end();
        di != de; di++) {
-      copy_func_callgraph(*di, lib, krn, vvm);
+      llvm::StringRef r = *di;
+      if (copy_func_callgraph(r, lib, program, vvm)) {
+        Function *f = program->getFunction(r);
+        if ((f == NULL) ||
+            (f->isDeclaration() &&
+             !f->getName().equals(pocl_sampler_handler) &&
+             !f->getName().startswith(llvm_intrins))
+           ) {
+          log.append("Cannot find symbol ");
+          log.append(r.str());
+          log.append(" in kernel library\n");
+          found_all_undefined = false;
+        }
+      }
   }
+  if (!found_all_undefined)
+    return 1;
 
-  // copy any aliases to krn
+  // copy any aliases to program
   DB_PRINT("cloning the aliases:\n");
   llvm::Module::const_alias_iterator ai, ae;
   for (ai = lib->alias_begin(), ae = lib->alias_end(); ai != ae; ai++) {
@@ -372,10 +393,10 @@ link(llvm::Module *krn, const llvm::Module *lib)
 #ifndef LLVM_3_7
       GlobalAlias::create(
 	ai->getType(), ai->getType()->getAddressSpace(), ai->getLinkage(),
-	ai->getName(), NULL, krn);
+	ai->getName(), NULL, program);
 #else
     GlobalAlias::create(
-	ai->getType(), ai->getLinkage(), ai->getName(), NULL, krn);
+        ai->getType(), ai->getLinkage(), ai->getName(), NULL, program);
 #endif
 
     GA->copyAttributesFrom(&*ai);
@@ -398,10 +419,12 @@ link(llvm::Module *krn, const llvm::Module *lib)
        mi != me; mi++) {
       const NamedMDNode &NMD=*mi;
       DB_PRINT(" %s:\n", NMD.getName().data());
-      NamedMDNode *NewNMD=krn->getOrInsertNamedMetadata(NMD.getName());
+      NamedMDNode *NewNMD=program->getOrInsertNamedMetadata(NMD.getName());
       for (unsigned i=0, e=NMD.getNumOperands(); i != e; ++i)
         NewNMD->addOperand(MapMetadata(NMD.getOperand(i), vvm));
   }
+
+  return 0;
 }
 
 /* vim: set expandtab ts=4 : */
diff --git a/lib/llvmopencl/linker.h b/lib/llvmopencl/linker.h
index 159233e..a16dc26 100644
--- a/lib/llvmopencl/linker.h
+++ b/lib/llvmopencl/linker.h
@@ -4,6 +4,7 @@
 #include "config.h"
 
 #include "llvm/IR/Module.h"
+#include "llvm/ADT/Triple.h"
 
 #ifdef __GNUC__
 #pragma GCC visibility push(hidden)
@@ -15,8 +16,10 @@
  * in krn from lib, cloning as needed. For big modules,
  * this is faster than calling llvm::Linker and then
  * running DCE.
+ *
+ * log is used to report errors if we run into undefined symbols
  */
-void link(llvm::Module *krn, const llvm::Module *lib);
+int link(llvm::Module *krn, const llvm::Module *lib, std::string &log);
 
 #ifdef __GNUC__
 #pragma GCC visibility pop
diff --git a/lib/poclu/misc.c b/lib/poclu/misc.c
index 722ccb1..d6d6fa7 100644
--- a/lib/poclu/misc.c
+++ b/lib/poclu/misc.c
@@ -32,16 +32,15 @@ cl_context
 poclu_create_any_context ()
 {
   cl_uint i;
-  cl_platform_id* platforms
-    = (cl_platform_id*) malloc (sizeof (cl_platform_id));
+  cl_platform_id platform;
 
-  clGetPlatformIDs (1, platforms, &i);
+  clGetPlatformIDs (1, &platform, &i);
   if (i == 0)
     return (cl_context) 0;
 
   cl_context_properties properties[] =
     {CL_CONTEXT_PLATFORM,
-     (cl_context_properties)platforms[0],
+     (cl_context_properties)platform,
      0};
 
   // create the OpenCL context on any available OCL device
@@ -49,7 +48,6 @@ poclu_create_any_context ()
                                                 CL_DEVICE_TYPE_ALL,
                                                 NULL, NULL, NULL);
 
-  free (platforms);
   return context;
 }
 
diff --git a/ocl-vendors/pocl-tests.icd.in b/ocl-vendors/pocl-tests.icd.in
deleted file mode 100644
index cc7572e..0000000
--- a/ocl-vendors/pocl-tests.icd.in
+++ /dev/null
@@ -1 +0,0 @@
- at abs_top_builddir@/lib/CL/.libs/libpocl.so
diff --git a/pocl.pc.in b/pocl.pc.in
deleted file mode 100644
index 746da25..0000000
--- a/pocl.pc.in
+++ /dev/null
@@ -1,11 +0,0 @@
-prefix=@prefix@
-exec_prefix=@exec_prefix@
-libdir=@libdir@
-includedir=@includedir@
-
-Name: Portable Computing Language
-Description: Portable Computing Language
-Version: @PACKAGE_VERSION@
-Libs: -L${libdir} -lpocl @LD_FLAGS_BIN@
-Cflags: -I${includedir} -I${prefix}/share/pocl/include @PTHREAD_CFLAGS@
-
diff --git a/pocl.pc.in.cmake b/pocl.pc.in.cmake
index 293cdf1..470efaa 100644
--- a/pocl.pc.in.cmake
+++ b/pocl.pc.in.cmake
@@ -6,6 +6,6 @@ includedir=@POCL_INSTALL_PUBLIC_HEADER_DIR@
 Name: Portable Computing Language
 Description: Portable Computing Language
 Version: @POCL_VERSION@
-Libs: -L${libdir} -lpocl -I at POCL_INSTALL_PRIVATE_HEADER_DIR@ @LD_FLAGS_BIN@
+Libs: -L${libdir} -lpocl
 Cflags: -I${includedir}
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index b7d490e..f3666f5 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -23,70 +23,13 @@
 #
 #=============================================================================
 
-#function(add_test_custom RUN_CMD TEST_NAME RESULT_FILE)
-#  foreach(LOOPVAR ${ARGN})
-#    set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
-#  endforeach()
-#endfunction()
-
-include(CMakeParseArguments)
-
-# This is a wrapper around add_test
-# Solves several problems:
-# 1) allows expected outputs (optionally sorted)
-# 2) handles the exit status problem (test properties WILL_FAIL does not work if
-#    the test exits with !0 exit status)
-
-function(add_test_pocl)
-
-  set(options SORT_OUTPUT)
-  set(oneValueArgs EXPECTED_OUTPUT NAME WORKING_DIRECTORY)
-  set(multiValueArgs COMMAND)
-  cmake_parse_arguments(POCL_TEST "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-
-  #message(STATUS "POCL_TEST_NAME: ${POCL_TEST_NAME}")
-  #message(STATUS "POCL_TEST_COMMAND: ${POCL_TEST_COMMAND}")
-
-  unset(RUN_CMD)
-  foreach(LOOPVAR ${POCL_TEST_COMMAND})
-    if(NOT RUN_CMD)
-      set(RUN_CMD "${CMAKE_CURRENT_BINARY_DIR}/${LOOPVAR}")
-    else()
-      set(RUN_CMD "${RUN_CMD}####${LOOPVAR}")
-    endif()
-  endforeach()
-
-  set(POCL_TEST_ARGLIST "NAME" "${POCL_TEST_NAME}")
-  if(POCL_TEST_WORKING_DIRECTORY)
-    list(APPEND POCL_TEST_ARGLIST "WORKING_DIRECTORY")
-    list(APPEND POCL_TEST_ARGLIST "${POCL_TEST_WORKING_DIRECTORY}")
-  endif()
-
-  list(APPEND POCL_TEST_ARGLIST "COMMAND" "${CMAKE_COMMAND}" "-Dtest_cmd=${RUN_CMD}")
-  if(POCL_TEST_EXPECTED_OUTPUT)
-    list(APPEND POCL_TEST_ARGLIST
-      "-Doutput_blessed=${CMAKE_CURRENT_SOURCE_DIR}/${POCL_TEST_EXPECTED_OUTPUT}")
-  endif()
-  if(POCL_TEST_SORT_OUTPUT)
-    list(APPEND POCL_TEST_ARGLIST "-Dsort_output=1")
-    endif()
-  list(APPEND POCL_TEST_ARGLIST "-P" "${CMAKE_SOURCE_DIR}/cmake/run_test.cmake")
-
-  add_test(${POCL_TEST_ARGLIST} )
-  set_tests_properties("${POCL_TEST_NAME}" PROPERTIES
-                       PASS_REGULAR_EXPRESSION "OK"
-                       FAIL_REGULAR_EXPRESSION "FAIL")
-
-endfunction()
-
-
 add_test("pocl_version_check" "runtime/test_version")
+
 set_tests_properties("pocl_version_check"
   PROPERTIES
   ENVIRONMENT "POCL_DEVICES=basic"
   PASS_REGULAR_EXPRESSION "basic"
-  LABELS "internal")
+  LABELS "internal;cuda")
 
 #######################################################################
 
diff --git a/tests/atlocal.in b/tests/atlocal.in
deleted file mode 100644
index 4f95b7d..0000000
--- a/tests/atlocal.in
+++ /dev/null
@@ -1,11 +0,0 @@
-
-OCL_ICD_VENDORS="@abs_top_builddir@/ocl-vendors"
-export OCL_ICD_VENDORS
-
-OPENCL_VENDOR_PATH="@abs_top_builddir@/ocl-vendors"
-export OPENCL_VENDOR_PATH
-
-POCL_BUILDING=1
-export POCL_BUILDING
-
-POAT_TESTSUITES="@POAT_TESTSUITES@"
diff --git a/tests/kernel/CMakeLists.txt b/tests/kernel/CMakeLists.txt
index d6963b4..157bf9f 100644
--- a/tests/kernel/CMakeLists.txt
+++ b/tests/kernel/CMakeLists.txt
@@ -23,22 +23,13 @@
 #
 #=============================================================================
 
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include -DSRCDIR='"$(abs_srcdir)"'
 add_definitions("-DSRCDIR=\"${CMAKE_CURRENT_SOURCE_DIR}\"")
 
-
-#kernel_CFLAGS = -std=c99 @OPENCL_CFLAGS@
-#smapler_address_clamp_CFLAGS = -std=c99 @OPENCL_CFLAGS@
-#image_query_funcs_CFLAGS = -std=c99 @OPENCL_CFLAGS@
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS}")
-
-#kernel_LDADD = -lm @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-#sampler_address_clamp_LDADD = -lm @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-#image_query_funcs_LDADD = -lm @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-# -> POCLU_LINK_OPTIONS in toplevel cmake
+#cannot use add_compile_options, because we need this only for C files
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${OPENCL_CFLAGS_STR}")
 
 ######################################################################
-add_executable("kernel" "kernel.c") # test_as_type.cl test_bitselect.cl test_convert_sat_regression.cl test_convert_type_*.cl test_fabs.cl test_fmin_fmax_fma.cl test_hadd.cl test_min_max.cl test_length_distance.cl test_rotate.cl test_short16.cl test_sizeof.cl test_block.cl test_printf.cl
+add_executable("kernel" "kernel.c")
 target_link_libraries("kernel" ${POCLU_LINK_OPTIONS})
 
 add_test_pocl(NAME "kernel/test_as_type"
@@ -77,7 +68,16 @@ set_tests_properties( "kernel/test_as_type" "kernel/test_bitselect"
     FAIL_REGULAR_EXPRESSION "FAIL"
     PASS_REGULAR_EXPRESSION "\nOK\n"
     PROCESSORS 1
-    DEPENDS "pocl_version_check")
+    DEPENDS "pocl_version_check"
+    LABELS "kernel")
+
+# convert_type_{4,8,16} on some machines takes >1 hour,
+# so only add convert_type_1/2 to internal tests to keep them reasonably fast.
+
+set_tests_properties( "kernel/test_as_type" "kernel/test_bitselect"
+  "kernel/test_convert_type_1" "kernel/test_convert_type_2"
+  PROPERTIES
+    LABELS "internal;kernel")
 
 set_tests_properties("kernel/test_hadd_loops"
   PROPERTIES ENVIRONMENT "POCL_WORK_GROUP_METHOD=loops")
@@ -137,10 +137,10 @@ if(MSVC)
     test_shuffle.cc kernel.c PROPERTIES LANGUAGE CXX )
 endif(MSVC)
 
-add_executable("sampler_address_clamp" "sampler_address_clamp.c") #test_sampler_address_clamp.cl
+add_executable("sampler_address_clamp" "sampler_address_clamp.c")
 target_link_libraries("sampler_address_clamp" ${POCLU_LINK_OPTIONS})
 
-add_executable("image_query_funcs" "image_query_funcs.c") #test_image_query_funcs.cl
+add_executable("image_query_funcs" "image_query_funcs.c")
 target_link_libraries("image_query_funcs" ${POCLU_LINK_OPTIONS})
 
 add_test_pocl(NAME "kernel/test_sampler_address_clamp"
@@ -283,3 +283,30 @@ add_test_pocl(NAME "kernel/test_sizeof_uint"
 #    COST 2.0
 #    PROCESSORS 1
 #    DEPENDS "pocl_version_check")
+
+# Label tests that work with CUDA backend
+set_property(TEST
+  "kernel/test_min_max"
+  "kernel/test_length_distance"
+  "kernel/test_fmin_fmax_fma"
+  "kernel/test_convert_sat_regression"
+  "kernel/test_rotate"
+  "kernel/test_short16"
+  "kernel/test_local_struct_array"
+  "kernel/test_shuffle_char"
+  "kernel/test_shuffle_short"
+  "kernel/test_shuffle_ushort"
+  "kernel/test_shuffle_int"
+  "kernel/test_shuffle_uint"
+  "kernel/test_shuffle_float"
+  "kernel/test_shuffle_long"
+  "kernel/test_shuffle_ulong"
+  "kernel/test_shuffle_double"
+  "kernel/test_convert_type_1"
+  "kernel/test_convert_type_2"
+  "kernel/test_convert_type_4"
+  "kernel/test_convert_type_8"
+  "kernel/test_convert_type_16"
+  "kernel/test_as_type"
+  "kernel/test_sizeof_uint"
+  APPEND PROPERTY LABELS "cuda")
diff --git a/tests/kernel/image_query_funcs.c b/tests/kernel/image_query_funcs.c
index f6e7d5b..9513077 100644
--- a/tests/kernel/image_query_funcs.c
+++ b/tests/kernel/image_query_funcs.c
@@ -195,7 +195,14 @@ int main(int argc, char **argv)
     retval = 0;
 
 error:
-
+  if (image2)
+    {
+      clReleaseMemObject (image2);
+    }
+  if (image3)
+    {
+      clReleaseMemObject (image3);
+    }
   if (kernel) 
     {
       clReleaseKernel(kernel);
diff --git a/tests/kernel/test_convert_type_1.cl b/tests/kernel/test_convert_type_1.cl
index 55c28ff..51c835d 100644
--- a/tests/kernel/test_convert_type_1.cl
+++ b/tests/kernel/test_convert_type_1.cl
@@ -2627,10 +2627,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_float("convert_char(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat((float)sat_input);
@@ -2648,10 +2648,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_float("convert_char_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rte((float)sat_input);
@@ -2669,10 +2669,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_float("convert_char_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rtz((float)sat_input);
@@ -2690,10 +2690,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_float("convert_char_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rtp((float)sat_input);
@@ -2711,10 +2711,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_float("convert_char_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rtn((float)sat_input);
@@ -2741,10 +2741,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_float("convert_uchar(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat((float)sat_input);
@@ -2762,10 +2762,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_float("convert_uchar_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rte((float)sat_input);
@@ -2783,10 +2783,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_float("convert_uchar_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rtz((float)sat_input);
@@ -2804,10 +2804,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_float("convert_uchar_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rtp((float)sat_input);
@@ -2825,10 +2825,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_float("convert_uchar_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rtn((float)sat_input);
@@ -2855,10 +2855,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_float("convert_short(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat((float)sat_input);
@@ -2876,10 +2876,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_float("convert_short_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rte((float)sat_input);
@@ -2897,10 +2897,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_float("convert_short_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rtz((float)sat_input);
@@ -2918,10 +2918,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_float("convert_short_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rtp((float)sat_input);
@@ -2939,10 +2939,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_float("convert_short_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rtn((float)sat_input);
@@ -2969,10 +2969,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_float("convert_ushort(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat((float)sat_input);
@@ -2990,10 +2990,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_float("convert_ushort_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rte((float)sat_input);
@@ -3011,10 +3011,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_float("convert_ushort_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rtz((float)sat_input);
@@ -3032,10 +3032,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_float("convert_ushort_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rtp((float)sat_input);
@@ -3053,10 +3053,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_float("convert_ushort_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rtn((float)sat_input);
@@ -3083,10 +3083,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_float("convert_int(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat((float)sat_input);
@@ -3104,10 +3104,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_float("convert_int_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rte((float)sat_input);
@@ -3125,10 +3125,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_float("convert_int_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rtz((float)sat_input);
@@ -3146,10 +3146,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_float("convert_int_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rtp((float)sat_input);
@@ -3167,10 +3167,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_float("convert_int_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rtn((float)sat_input);
@@ -3197,10 +3197,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_float("convert_uint(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat((float)sat_input);
@@ -3218,10 +3218,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_float("convert_uint_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rte((float)sat_input);
@@ -3239,10 +3239,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_float("convert_uint_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rtz((float)sat_input);
@@ -3260,10 +3260,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_float("convert_uint_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rtp((float)sat_input);
@@ -3281,10 +3281,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_float("convert_uint_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rtn((float)sat_input);
@@ -3313,10 +3313,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_float("convert_long(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat((float)sat_input);
@@ -3334,10 +3334,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_float("convert_long_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rte((float)sat_input);
@@ -3355,10 +3355,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_float("convert_long_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rtz((float)sat_input);
@@ -3376,10 +3376,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_float("convert_long_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rtp((float)sat_input);
@@ -3397,10 +3397,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_float("convert_long_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rtn((float)sat_input);
@@ -3431,10 +3431,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_float("convert_ulong(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat((float)sat_input);
@@ -3452,10 +3452,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_float("convert_ulong_rte(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rte((float)sat_input);
@@ -3473,10 +3473,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_float("convert_ulong_rtz(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rtz((float)sat_input);
@@ -3494,10 +3494,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_float("convert_ulong_rtp(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rtp((float)sat_input);
@@ -3515,10 +3515,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_float("convert_ulong_rtn(float)", i, &float_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rtn((float)sat_input);
@@ -3549,10 +3549,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_double("convert_char(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat((double)sat_input);
@@ -3570,10 +3570,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_double("convert_char_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rte((double)sat_input);
@@ -3591,10 +3591,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_double("convert_char_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rtz((double)sat_input);
@@ -3612,10 +3612,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_double("convert_char_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rtp((double)sat_input);
@@ -3633,10 +3633,10 @@ kernel void test_convert_type_1()
     }
     compare_char_elements_double("convert_char_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (char)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char)max_expected;
     }
     actual.value = convert_char_sat_rtn((double)sat_input);
@@ -3667,10 +3667,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_double("convert_uchar(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat((double)sat_input);
@@ -3688,10 +3688,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_double("convert_uchar_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rte((double)sat_input);
@@ -3709,10 +3709,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_double("convert_uchar_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rtz((double)sat_input);
@@ -3730,10 +3730,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_double("convert_uchar_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rtp((double)sat_input);
@@ -3751,10 +3751,10 @@ kernel void test_convert_type_1()
     }
     compare_uchar_elements_double("convert_uchar_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uchar)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar)max_expected;
     }
     actual.value = convert_uchar_sat_rtn((double)sat_input);
@@ -3785,10 +3785,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_double("convert_short(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat((double)sat_input);
@@ -3806,10 +3806,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_double("convert_short_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rte((double)sat_input);
@@ -3827,10 +3827,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_double("convert_short_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rtz((double)sat_input);
@@ -3848,10 +3848,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_double("convert_short_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rtp((double)sat_input);
@@ -3869,10 +3869,10 @@ kernel void test_convert_type_1()
     }
     compare_short_elements_double("convert_short_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (short)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short)max_expected;
     }
     actual.value = convert_short_sat_rtn((double)sat_input);
@@ -3903,10 +3903,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_double("convert_ushort(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat((double)sat_input);
@@ -3924,10 +3924,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_double("convert_ushort_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rte((double)sat_input);
@@ -3945,10 +3945,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_double("convert_ushort_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rtz((double)sat_input);
@@ -3966,10 +3966,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_double("convert_ushort_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rtp((double)sat_input);
@@ -3987,10 +3987,10 @@ kernel void test_convert_type_1()
     }
     compare_ushort_elements_double("convert_ushort_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ushort)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort)max_expected;
     }
     actual.value = convert_ushort_sat_rtn((double)sat_input);
@@ -4021,10 +4021,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_double("convert_int(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat((double)sat_input);
@@ -4042,10 +4042,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_double("convert_int_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rte((double)sat_input);
@@ -4063,10 +4063,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_double("convert_int_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rtz((double)sat_input);
@@ -4084,10 +4084,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_double("convert_int_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rtp((double)sat_input);
@@ -4105,10 +4105,10 @@ kernel void test_convert_type_1()
     }
     compare_int_elements_double("convert_int_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (int)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int)max_expected;
     }
     actual.value = convert_int_sat_rtn((double)sat_input);
@@ -4139,10 +4139,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_double("convert_uint(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat((double)sat_input);
@@ -4160,10 +4160,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_double("convert_uint_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rte((double)sat_input);
@@ -4181,10 +4181,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_double("convert_uint_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rtz((double)sat_input);
@@ -4202,10 +4202,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_double("convert_uint_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rtp((double)sat_input);
@@ -4223,10 +4223,10 @@ kernel void test_convert_type_1()
     }
     compare_uint_elements_double("convert_uint_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (uint)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint)max_expected;
     }
     actual.value = convert_uint_sat_rtn((double)sat_input);
@@ -4259,10 +4259,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_double("convert_long(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat((double)sat_input);
@@ -4280,10 +4280,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_double("convert_long_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rte((double)sat_input);
@@ -4301,10 +4301,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_double("convert_long_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rtz((double)sat_input);
@@ -4322,10 +4322,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_double("convert_long_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rtp((double)sat_input);
@@ -4343,10 +4343,10 @@ kernel void test_convert_type_1()
     }
     compare_long_elements_double("convert_long_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (long)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long)max_expected;
     }
     actual.value = convert_long_sat_rtn((double)sat_input);
@@ -4381,10 +4381,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_double("convert_ulong(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat((double)sat_input);
@@ -4402,10 +4402,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_double("convert_ulong_rte(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rte((double)sat_input);
@@ -4423,10 +4423,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_double("convert_ulong_rtz(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rtz((double)sat_input);
@@ -4444,10 +4444,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_double("convert_ulong_rtp(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rtp((double)sat_input);
@@ -4465,10 +4465,10 @@ kernel void test_convert_type_1()
     }
     compare_ulong_elements_double("convert_ulong_rtn(double)", i, &double_values[i], 0, expected.raw, actual.raw, 1);
     expected.value = (ulong)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong)max_expected;
     }
     actual.value = convert_ulong_sat_rtn((double)sat_input);
diff --git a/tests/kernel/test_convert_type_16.cl b/tests/kernel/test_convert_type_16.cl
index b5cfe49..7d94076 100644
--- a/tests/kernel/test_convert_type_16.cl
+++ b/tests/kernel/test_convert_type_16.cl
@@ -2627,10 +2627,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_float("convert_char16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat((float16)sat_input);
@@ -2648,10 +2648,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_float("convert_char16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rte((float16)sat_input);
@@ -2669,10 +2669,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_float("convert_char16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rtz((float16)sat_input);
@@ -2690,10 +2690,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_float("convert_char16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rtp((float16)sat_input);
@@ -2711,10 +2711,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_float("convert_char16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rtn((float16)sat_input);
@@ -2741,10 +2741,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_float("convert_uchar16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat((float16)sat_input);
@@ -2762,10 +2762,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_float("convert_uchar16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rte((float16)sat_input);
@@ -2783,10 +2783,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_float("convert_uchar16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rtz((float16)sat_input);
@@ -2804,10 +2804,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_float("convert_uchar16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rtp((float16)sat_input);
@@ -2825,10 +2825,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_float("convert_uchar16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rtn((float16)sat_input);
@@ -2855,10 +2855,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_float("convert_short16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat((float16)sat_input);
@@ -2876,10 +2876,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_float("convert_short16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rte((float16)sat_input);
@@ -2897,10 +2897,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_float("convert_short16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rtz((float16)sat_input);
@@ -2918,10 +2918,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_float("convert_short16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rtp((float16)sat_input);
@@ -2939,10 +2939,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_float("convert_short16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rtn((float16)sat_input);
@@ -2969,10 +2969,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_float("convert_ushort16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat((float16)sat_input);
@@ -2990,10 +2990,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_float("convert_ushort16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rte((float16)sat_input);
@@ -3011,10 +3011,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_float("convert_ushort16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rtz((float16)sat_input);
@@ -3032,10 +3032,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_float("convert_ushort16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rtp((float16)sat_input);
@@ -3053,10 +3053,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_float("convert_ushort16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rtn((float16)sat_input);
@@ -3083,10 +3083,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_float("convert_int16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat((float16)sat_input);
@@ -3104,10 +3104,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_float("convert_int16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rte((float16)sat_input);
@@ -3125,10 +3125,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_float("convert_int16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rtz((float16)sat_input);
@@ -3146,10 +3146,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_float("convert_int16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rtp((float16)sat_input);
@@ -3167,10 +3167,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_float("convert_int16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rtn((float16)sat_input);
@@ -3197,10 +3197,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_float("convert_uint16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat((float16)sat_input);
@@ -3218,10 +3218,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_float("convert_uint16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rte((float16)sat_input);
@@ -3239,10 +3239,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_float("convert_uint16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rtz((float16)sat_input);
@@ -3260,10 +3260,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_float("convert_uint16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rtp((float16)sat_input);
@@ -3281,10 +3281,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_float("convert_uint16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rtn((float16)sat_input);
@@ -3313,10 +3313,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_float("convert_long16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat((float16)sat_input);
@@ -3334,10 +3334,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_float("convert_long16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rte((float16)sat_input);
@@ -3355,10 +3355,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_float("convert_long16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rtz((float16)sat_input);
@@ -3376,10 +3376,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_float("convert_long16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rtp((float16)sat_input);
@@ -3397,10 +3397,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_float("convert_long16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rtn((float16)sat_input);
@@ -3431,10 +3431,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_float("convert_ulong16(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat((float16)sat_input);
@@ -3452,10 +3452,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_float("convert_ulong16_rte(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rte((float16)sat_input);
@@ -3473,10 +3473,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_float("convert_ulong16_rtz(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rtz((float16)sat_input);
@@ -3494,10 +3494,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_float("convert_ulong16_rtp(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rtp((float16)sat_input);
@@ -3515,10 +3515,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_float("convert_ulong16_rtn(float16)", i, &float_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rtn((float16)sat_input);
@@ -3549,10 +3549,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_double("convert_char16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat((double16)sat_input);
@@ -3570,10 +3570,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_double("convert_char16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rte((double16)sat_input);
@@ -3591,10 +3591,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_double("convert_char16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rtz((double16)sat_input);
@@ -3612,10 +3612,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_double("convert_char16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rtp((double16)sat_input);
@@ -3633,10 +3633,10 @@ kernel void test_convert_type_16()
     }
     compare_char_elements_double("convert_char16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (char16)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char16)max_expected;
     }
     actual.value = convert_char16_sat_rtn((double16)sat_input);
@@ -3667,10 +3667,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_double("convert_uchar16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat((double16)sat_input);
@@ -3688,10 +3688,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_double("convert_uchar16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rte((double16)sat_input);
@@ -3709,10 +3709,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_double("convert_uchar16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rtz((double16)sat_input);
@@ -3730,10 +3730,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_double("convert_uchar16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rtp((double16)sat_input);
@@ -3751,10 +3751,10 @@ kernel void test_convert_type_16()
     }
     compare_uchar_elements_double("convert_uchar16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uchar16)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar16)max_expected;
     }
     actual.value = convert_uchar16_sat_rtn((double16)sat_input);
@@ -3785,10 +3785,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_double("convert_short16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat((double16)sat_input);
@@ -3806,10 +3806,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_double("convert_short16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rte((double16)sat_input);
@@ -3827,10 +3827,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_double("convert_short16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rtz((double16)sat_input);
@@ -3848,10 +3848,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_double("convert_short16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rtp((double16)sat_input);
@@ -3869,10 +3869,10 @@ kernel void test_convert_type_16()
     }
     compare_short_elements_double("convert_short16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (short16)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short16)max_expected;
     }
     actual.value = convert_short16_sat_rtn((double16)sat_input);
@@ -3903,10 +3903,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_double("convert_ushort16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat((double16)sat_input);
@@ -3924,10 +3924,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_double("convert_ushort16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rte((double16)sat_input);
@@ -3945,10 +3945,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_double("convert_ushort16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rtz((double16)sat_input);
@@ -3966,10 +3966,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_double("convert_ushort16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rtp((double16)sat_input);
@@ -3987,10 +3987,10 @@ kernel void test_convert_type_16()
     }
     compare_ushort_elements_double("convert_ushort16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ushort16)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort16)max_expected;
     }
     actual.value = convert_ushort16_sat_rtn((double16)sat_input);
@@ -4021,10 +4021,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_double("convert_int16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat((double16)sat_input);
@@ -4042,10 +4042,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_double("convert_int16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rte((double16)sat_input);
@@ -4063,10 +4063,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_double("convert_int16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rtz((double16)sat_input);
@@ -4084,10 +4084,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_double("convert_int16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rtp((double16)sat_input);
@@ -4105,10 +4105,10 @@ kernel void test_convert_type_16()
     }
     compare_int_elements_double("convert_int16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (int16)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int16)max_expected;
     }
     actual.value = convert_int16_sat_rtn((double16)sat_input);
@@ -4139,10 +4139,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_double("convert_uint16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat((double16)sat_input);
@@ -4160,10 +4160,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_double("convert_uint16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rte((double16)sat_input);
@@ -4181,10 +4181,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_double("convert_uint16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rtz((double16)sat_input);
@@ -4202,10 +4202,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_double("convert_uint16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rtp((double16)sat_input);
@@ -4223,10 +4223,10 @@ kernel void test_convert_type_16()
     }
     compare_uint_elements_double("convert_uint16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (uint16)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint16)max_expected;
     }
     actual.value = convert_uint16_sat_rtn((double16)sat_input);
@@ -4259,10 +4259,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_double("convert_long16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat((double16)sat_input);
@@ -4280,10 +4280,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_double("convert_long16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rte((double16)sat_input);
@@ -4301,10 +4301,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_double("convert_long16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rtz((double16)sat_input);
@@ -4322,10 +4322,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_double("convert_long16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rtp((double16)sat_input);
@@ -4343,10 +4343,10 @@ kernel void test_convert_type_16()
     }
     compare_long_elements_double("convert_long16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (long16)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long16)max_expected;
     }
     actual.value = convert_long16_sat_rtn((double16)sat_input);
@@ -4381,10 +4381,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_double("convert_ulong16(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat((double16)sat_input);
@@ -4402,10 +4402,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_double("convert_ulong16_rte(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rte((double16)sat_input);
@@ -4423,10 +4423,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_double("convert_ulong16_rtz(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rtz((double16)sat_input);
@@ -4444,10 +4444,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_double("convert_ulong16_rtp(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rtp((double16)sat_input);
@@ -4465,10 +4465,10 @@ kernel void test_convert_type_16()
     }
     compare_ulong_elements_double("convert_ulong16_rtn(double16)", i, &double_values[i], 0, expected.raw, actual.raw, 16);
     expected.value = (ulong16)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong16)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong16)max_expected;
     }
     actual.value = convert_ulong16_sat_rtn((double16)sat_input);
diff --git a/tests/kernel/test_convert_type_2.cl b/tests/kernel/test_convert_type_2.cl
index a26ff14..3ee7054 100644
--- a/tests/kernel/test_convert_type_2.cl
+++ b/tests/kernel/test_convert_type_2.cl
@@ -2627,10 +2627,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_float("convert_char2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat((float2)sat_input);
@@ -2648,10 +2648,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_float("convert_char2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rte((float2)sat_input);
@@ -2669,10 +2669,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_float("convert_char2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rtz((float2)sat_input);
@@ -2690,10 +2690,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_float("convert_char2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rtp((float2)sat_input);
@@ -2711,10 +2711,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_float("convert_char2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rtn((float2)sat_input);
@@ -2741,10 +2741,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_float("convert_uchar2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat((float2)sat_input);
@@ -2762,10 +2762,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_float("convert_uchar2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rte((float2)sat_input);
@@ -2783,10 +2783,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_float("convert_uchar2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rtz((float2)sat_input);
@@ -2804,10 +2804,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_float("convert_uchar2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rtp((float2)sat_input);
@@ -2825,10 +2825,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_float("convert_uchar2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rtn((float2)sat_input);
@@ -2855,10 +2855,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_float("convert_short2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat((float2)sat_input);
@@ -2876,10 +2876,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_float("convert_short2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rte((float2)sat_input);
@@ -2897,10 +2897,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_float("convert_short2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rtz((float2)sat_input);
@@ -2918,10 +2918,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_float("convert_short2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rtp((float2)sat_input);
@@ -2939,10 +2939,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_float("convert_short2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rtn((float2)sat_input);
@@ -2969,10 +2969,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_float("convert_ushort2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat((float2)sat_input);
@@ -2990,10 +2990,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_float("convert_ushort2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rte((float2)sat_input);
@@ -3011,10 +3011,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_float("convert_ushort2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rtz((float2)sat_input);
@@ -3032,10 +3032,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_float("convert_ushort2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rtp((float2)sat_input);
@@ -3053,10 +3053,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_float("convert_ushort2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rtn((float2)sat_input);
@@ -3083,10 +3083,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_float("convert_int2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat((float2)sat_input);
@@ -3104,10 +3104,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_float("convert_int2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rte((float2)sat_input);
@@ -3125,10 +3125,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_float("convert_int2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rtz((float2)sat_input);
@@ -3146,10 +3146,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_float("convert_int2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rtp((float2)sat_input);
@@ -3167,10 +3167,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_float("convert_int2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rtn((float2)sat_input);
@@ -3197,10 +3197,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_float("convert_uint2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat((float2)sat_input);
@@ -3218,10 +3218,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_float("convert_uint2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rte((float2)sat_input);
@@ -3239,10 +3239,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_float("convert_uint2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rtz((float2)sat_input);
@@ -3260,10 +3260,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_float("convert_uint2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rtp((float2)sat_input);
@@ -3281,10 +3281,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_float("convert_uint2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rtn((float2)sat_input);
@@ -3313,10 +3313,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_float("convert_long2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat((float2)sat_input);
@@ -3334,10 +3334,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_float("convert_long2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rte((float2)sat_input);
@@ -3355,10 +3355,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_float("convert_long2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rtz((float2)sat_input);
@@ -3376,10 +3376,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_float("convert_long2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rtp((float2)sat_input);
@@ -3397,10 +3397,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_float("convert_long2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rtn((float2)sat_input);
@@ -3431,10 +3431,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_float("convert_ulong2(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat((float2)sat_input);
@@ -3452,10 +3452,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_float("convert_ulong2_rte(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rte((float2)sat_input);
@@ -3473,10 +3473,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_float("convert_ulong2_rtz(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rtz((float2)sat_input);
@@ -3494,10 +3494,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_float("convert_ulong2_rtp(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rtp((float2)sat_input);
@@ -3515,10 +3515,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_float("convert_ulong2_rtn(float2)", i, &float_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rtn((float2)sat_input);
@@ -3549,10 +3549,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_double("convert_char2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat((double2)sat_input);
@@ -3570,10 +3570,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_double("convert_char2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rte((double2)sat_input);
@@ -3591,10 +3591,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_double("convert_char2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rtz((double2)sat_input);
@@ -3612,10 +3612,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_double("convert_char2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rtp((double2)sat_input);
@@ -3633,10 +3633,10 @@ kernel void test_convert_type_2()
     }
     compare_char_elements_double("convert_char2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (char2)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char2)max_expected;
     }
     actual.value = convert_char2_sat_rtn((double2)sat_input);
@@ -3667,10 +3667,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_double("convert_uchar2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat((double2)sat_input);
@@ -3688,10 +3688,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_double("convert_uchar2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rte((double2)sat_input);
@@ -3709,10 +3709,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_double("convert_uchar2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rtz((double2)sat_input);
@@ -3730,10 +3730,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_double("convert_uchar2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rtp((double2)sat_input);
@@ -3751,10 +3751,10 @@ kernel void test_convert_type_2()
     }
     compare_uchar_elements_double("convert_uchar2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uchar2)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar2)max_expected;
     }
     actual.value = convert_uchar2_sat_rtn((double2)sat_input);
@@ -3785,10 +3785,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_double("convert_short2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat((double2)sat_input);
@@ -3806,10 +3806,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_double("convert_short2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rte((double2)sat_input);
@@ -3827,10 +3827,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_double("convert_short2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rtz((double2)sat_input);
@@ -3848,10 +3848,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_double("convert_short2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rtp((double2)sat_input);
@@ -3869,10 +3869,10 @@ kernel void test_convert_type_2()
     }
     compare_short_elements_double("convert_short2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (short2)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short2)max_expected;
     }
     actual.value = convert_short2_sat_rtn((double2)sat_input);
@@ -3903,10 +3903,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_double("convert_ushort2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat((double2)sat_input);
@@ -3924,10 +3924,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_double("convert_ushort2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rte((double2)sat_input);
@@ -3945,10 +3945,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_double("convert_ushort2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rtz((double2)sat_input);
@@ -3966,10 +3966,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_double("convert_ushort2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rtp((double2)sat_input);
@@ -3987,10 +3987,10 @@ kernel void test_convert_type_2()
     }
     compare_ushort_elements_double("convert_ushort2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ushort2)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort2)max_expected;
     }
     actual.value = convert_ushort2_sat_rtn((double2)sat_input);
@@ -4021,10 +4021,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_double("convert_int2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat((double2)sat_input);
@@ -4042,10 +4042,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_double("convert_int2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rte((double2)sat_input);
@@ -4063,10 +4063,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_double("convert_int2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rtz((double2)sat_input);
@@ -4084,10 +4084,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_double("convert_int2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rtp((double2)sat_input);
@@ -4105,10 +4105,10 @@ kernel void test_convert_type_2()
     }
     compare_int_elements_double("convert_int2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (int2)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int2)max_expected;
     }
     actual.value = convert_int2_sat_rtn((double2)sat_input);
@@ -4139,10 +4139,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_double("convert_uint2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat((double2)sat_input);
@@ -4160,10 +4160,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_double("convert_uint2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rte((double2)sat_input);
@@ -4181,10 +4181,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_double("convert_uint2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rtz((double2)sat_input);
@@ -4202,10 +4202,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_double("convert_uint2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rtp((double2)sat_input);
@@ -4223,10 +4223,10 @@ kernel void test_convert_type_2()
     }
     compare_uint_elements_double("convert_uint2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (uint2)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint2)max_expected;
     }
     actual.value = convert_uint2_sat_rtn((double2)sat_input);
@@ -4259,10 +4259,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_double("convert_long2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat((double2)sat_input);
@@ -4280,10 +4280,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_double("convert_long2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rte((double2)sat_input);
@@ -4301,10 +4301,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_double("convert_long2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rtz((double2)sat_input);
@@ -4322,10 +4322,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_double("convert_long2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rtp((double2)sat_input);
@@ -4343,10 +4343,10 @@ kernel void test_convert_type_2()
     }
     compare_long_elements_double("convert_long2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (long2)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long2)max_expected;
     }
     actual.value = convert_long2_sat_rtn((double2)sat_input);
@@ -4381,10 +4381,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_double("convert_ulong2(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat((double2)sat_input);
@@ -4402,10 +4402,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_double("convert_ulong2_rte(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rte((double2)sat_input);
@@ -4423,10 +4423,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_double("convert_ulong2_rtz(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rtz((double2)sat_input);
@@ -4444,10 +4444,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_double("convert_ulong2_rtp(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rtp((double2)sat_input);
@@ -4465,10 +4465,10 @@ kernel void test_convert_type_2()
     }
     compare_ulong_elements_double("convert_ulong2_rtn(double2)", i, &double_values[i], 0, expected.raw, actual.raw, 2);
     expected.value = (ulong2)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong2)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong2)max_expected;
     }
     actual.value = convert_ulong2_sat_rtn((double2)sat_input);
diff --git a/tests/kernel/test_convert_type_4.cl b/tests/kernel/test_convert_type_4.cl
index d5a00ec..9320e92 100644
--- a/tests/kernel/test_convert_type_4.cl
+++ b/tests/kernel/test_convert_type_4.cl
@@ -2627,10 +2627,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_float("convert_char4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat((float4)sat_input);
@@ -2648,10 +2648,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_float("convert_char4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rte((float4)sat_input);
@@ -2669,10 +2669,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_float("convert_char4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rtz((float4)sat_input);
@@ -2690,10 +2690,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_float("convert_char4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rtp((float4)sat_input);
@@ -2711,10 +2711,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_float("convert_char4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rtn((float4)sat_input);
@@ -2741,10 +2741,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_float("convert_uchar4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat((float4)sat_input);
@@ -2762,10 +2762,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_float("convert_uchar4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rte((float4)sat_input);
@@ -2783,10 +2783,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_float("convert_uchar4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rtz((float4)sat_input);
@@ -2804,10 +2804,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_float("convert_uchar4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rtp((float4)sat_input);
@@ -2825,10 +2825,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_float("convert_uchar4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rtn((float4)sat_input);
@@ -2855,10 +2855,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_float("convert_short4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat((float4)sat_input);
@@ -2876,10 +2876,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_float("convert_short4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rte((float4)sat_input);
@@ -2897,10 +2897,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_float("convert_short4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rtz((float4)sat_input);
@@ -2918,10 +2918,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_float("convert_short4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rtp((float4)sat_input);
@@ -2939,10 +2939,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_float("convert_short4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rtn((float4)sat_input);
@@ -2969,10 +2969,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_float("convert_ushort4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat((float4)sat_input);
@@ -2990,10 +2990,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_float("convert_ushort4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rte((float4)sat_input);
@@ -3011,10 +3011,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_float("convert_ushort4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rtz((float4)sat_input);
@@ -3032,10 +3032,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_float("convert_ushort4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rtp((float4)sat_input);
@@ -3053,10 +3053,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_float("convert_ushort4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rtn((float4)sat_input);
@@ -3083,10 +3083,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_float("convert_int4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat((float4)sat_input);
@@ -3104,10 +3104,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_float("convert_int4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rte((float4)sat_input);
@@ -3125,10 +3125,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_float("convert_int4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rtz((float4)sat_input);
@@ -3146,10 +3146,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_float("convert_int4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rtp((float4)sat_input);
@@ -3167,10 +3167,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_float("convert_int4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rtn((float4)sat_input);
@@ -3197,10 +3197,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_float("convert_uint4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat((float4)sat_input);
@@ -3218,10 +3218,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_float("convert_uint4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rte((float4)sat_input);
@@ -3239,10 +3239,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_float("convert_uint4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rtz((float4)sat_input);
@@ -3260,10 +3260,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_float("convert_uint4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rtp((float4)sat_input);
@@ -3281,10 +3281,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_float("convert_uint4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rtn((float4)sat_input);
@@ -3313,10 +3313,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_float("convert_long4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat((float4)sat_input);
@@ -3334,10 +3334,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_float("convert_long4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rte((float4)sat_input);
@@ -3355,10 +3355,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_float("convert_long4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rtz((float4)sat_input);
@@ -3376,10 +3376,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_float("convert_long4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rtp((float4)sat_input);
@@ -3397,10 +3397,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_float("convert_long4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rtn((float4)sat_input);
@@ -3431,10 +3431,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_float("convert_ulong4(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat((float4)sat_input);
@@ -3452,10 +3452,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_float("convert_ulong4_rte(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rte((float4)sat_input);
@@ -3473,10 +3473,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_float("convert_ulong4_rtz(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rtz((float4)sat_input);
@@ -3494,10 +3494,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_float("convert_ulong4_rtp(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rtp((float4)sat_input);
@@ -3515,10 +3515,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_float("convert_ulong4_rtn(float4)", i, &float_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rtn((float4)sat_input);
@@ -3549,10 +3549,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_double("convert_char4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat((double4)sat_input);
@@ -3570,10 +3570,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_double("convert_char4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rte((double4)sat_input);
@@ -3591,10 +3591,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_double("convert_char4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rtz((double4)sat_input);
@@ -3612,10 +3612,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_double("convert_char4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rtp((double4)sat_input);
@@ -3633,10 +3633,10 @@ kernel void test_convert_type_4()
     }
     compare_char_elements_double("convert_char4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (char4)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char4)max_expected;
     }
     actual.value = convert_char4_sat_rtn((double4)sat_input);
@@ -3667,10 +3667,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_double("convert_uchar4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat((double4)sat_input);
@@ -3688,10 +3688,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_double("convert_uchar4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rte((double4)sat_input);
@@ -3709,10 +3709,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_double("convert_uchar4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rtz((double4)sat_input);
@@ -3730,10 +3730,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_double("convert_uchar4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rtp((double4)sat_input);
@@ -3751,10 +3751,10 @@ kernel void test_convert_type_4()
     }
     compare_uchar_elements_double("convert_uchar4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uchar4)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar4)max_expected;
     }
     actual.value = convert_uchar4_sat_rtn((double4)sat_input);
@@ -3785,10 +3785,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_double("convert_short4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat((double4)sat_input);
@@ -3806,10 +3806,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_double("convert_short4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rte((double4)sat_input);
@@ -3827,10 +3827,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_double("convert_short4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rtz((double4)sat_input);
@@ -3848,10 +3848,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_double("convert_short4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rtp((double4)sat_input);
@@ -3869,10 +3869,10 @@ kernel void test_convert_type_4()
     }
     compare_short_elements_double("convert_short4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (short4)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short4)max_expected;
     }
     actual.value = convert_short4_sat_rtn((double4)sat_input);
@@ -3903,10 +3903,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_double("convert_ushort4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat((double4)sat_input);
@@ -3924,10 +3924,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_double("convert_ushort4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rte((double4)sat_input);
@@ -3945,10 +3945,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_double("convert_ushort4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rtz((double4)sat_input);
@@ -3966,10 +3966,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_double("convert_ushort4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rtp((double4)sat_input);
@@ -3987,10 +3987,10 @@ kernel void test_convert_type_4()
     }
     compare_ushort_elements_double("convert_ushort4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ushort4)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort4)max_expected;
     }
     actual.value = convert_ushort4_sat_rtn((double4)sat_input);
@@ -4021,10 +4021,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_double("convert_int4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat((double4)sat_input);
@@ -4042,10 +4042,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_double("convert_int4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rte((double4)sat_input);
@@ -4063,10 +4063,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_double("convert_int4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rtz((double4)sat_input);
@@ -4084,10 +4084,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_double("convert_int4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rtp((double4)sat_input);
@@ -4105,10 +4105,10 @@ kernel void test_convert_type_4()
     }
     compare_int_elements_double("convert_int4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (int4)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int4)max_expected;
     }
     actual.value = convert_int4_sat_rtn((double4)sat_input);
@@ -4139,10 +4139,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_double("convert_uint4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat((double4)sat_input);
@@ -4160,10 +4160,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_double("convert_uint4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rte((double4)sat_input);
@@ -4181,10 +4181,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_double("convert_uint4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rtz((double4)sat_input);
@@ -4202,10 +4202,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_double("convert_uint4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rtp((double4)sat_input);
@@ -4223,10 +4223,10 @@ kernel void test_convert_type_4()
     }
     compare_uint_elements_double("convert_uint4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (uint4)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint4)max_expected;
     }
     actual.value = convert_uint4_sat_rtn((double4)sat_input);
@@ -4259,10 +4259,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_double("convert_long4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat((double4)sat_input);
@@ -4280,10 +4280,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_double("convert_long4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rte((double4)sat_input);
@@ -4301,10 +4301,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_double("convert_long4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rtz((double4)sat_input);
@@ -4322,10 +4322,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_double("convert_long4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rtp((double4)sat_input);
@@ -4343,10 +4343,10 @@ kernel void test_convert_type_4()
     }
     compare_long_elements_double("convert_long4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (long4)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long4)max_expected;
     }
     actual.value = convert_long4_sat_rtn((double4)sat_input);
@@ -4381,10 +4381,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_double("convert_ulong4(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat((double4)sat_input);
@@ -4402,10 +4402,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_double("convert_ulong4_rte(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rte((double4)sat_input);
@@ -4423,10 +4423,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_double("convert_ulong4_rtz(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rtz((double4)sat_input);
@@ -4444,10 +4444,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_double("convert_ulong4_rtp(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rtp((double4)sat_input);
@@ -4465,10 +4465,10 @@ kernel void test_convert_type_4()
     }
     compare_ulong_elements_double("convert_ulong4_rtn(double4)", i, &double_values[i], 0, expected.raw, actual.raw, 4);
     expected.value = (ulong4)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong4)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong4)max_expected;
     }
     actual.value = convert_ulong4_sat_rtn((double4)sat_input);
diff --git a/tests/kernel/test_convert_type_8.cl b/tests/kernel/test_convert_type_8.cl
index 8910043..7fad116 100644
--- a/tests/kernel/test_convert_type_8.cl
+++ b/tests/kernel/test_convert_type_8.cl
@@ -2627,10 +2627,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_float("convert_char8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat((float8)sat_input);
@@ -2648,10 +2648,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_float("convert_char8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rte((float8)sat_input);
@@ -2669,10 +2669,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_float("convert_char8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rtz((float8)sat_input);
@@ -2690,10 +2690,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_float("convert_char8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rtp((float8)sat_input);
@@ -2711,10 +2711,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_float("convert_char8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rtn((float8)sat_input);
@@ -2741,10 +2741,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_float("convert_uchar8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat((float8)sat_input);
@@ -2762,10 +2762,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_float("convert_uchar8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rte((float8)sat_input);
@@ -2783,10 +2783,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_float("convert_uchar8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rtz((float8)sat_input);
@@ -2804,10 +2804,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_float("convert_uchar8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rtp((float8)sat_input);
@@ -2825,10 +2825,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_float("convert_uchar8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rtn((float8)sat_input);
@@ -2855,10 +2855,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_float("convert_short8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat((float8)sat_input);
@@ -2876,10 +2876,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_float("convert_short8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rte((float8)sat_input);
@@ -2897,10 +2897,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_float("convert_short8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rtz((float8)sat_input);
@@ -2918,10 +2918,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_float("convert_short8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rtp((float8)sat_input);
@@ -2939,10 +2939,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_float("convert_short8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rtn((float8)sat_input);
@@ -2969,10 +2969,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_float("convert_ushort8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat((float8)sat_input);
@@ -2990,10 +2990,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_float("convert_ushort8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rte((float8)sat_input);
@@ -3011,10 +3011,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_float("convert_ushort8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rtz((float8)sat_input);
@@ -3032,10 +3032,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_float("convert_ushort8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rtp((float8)sat_input);
@@ -3053,10 +3053,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_float("convert_ushort8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rtn((float8)sat_input);
@@ -3083,10 +3083,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_float("convert_int8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat((float8)sat_input);
@@ -3104,10 +3104,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_float("convert_int8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rte((float8)sat_input);
@@ -3125,10 +3125,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_float("convert_int8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rtz((float8)sat_input);
@@ -3146,10 +3146,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_float("convert_int8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rtp((float8)sat_input);
@@ -3167,10 +3167,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_float("convert_int8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rtn((float8)sat_input);
@@ -3197,10 +3197,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_float("convert_uint8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat((float8)sat_input);
@@ -3218,10 +3218,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_float("convert_uint8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rte((float8)sat_input);
@@ -3239,10 +3239,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_float("convert_uint8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rtz((float8)sat_input);
@@ -3260,10 +3260,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_float("convert_uint8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rtp((float8)sat_input);
@@ -3281,10 +3281,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_float("convert_uint8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rtn((float8)sat_input);
@@ -3313,10 +3313,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_float("convert_long8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat((float8)sat_input);
@@ -3334,10 +3334,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_float("convert_long8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rte((float8)sat_input);
@@ -3355,10 +3355,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_float("convert_long8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rtz((float8)sat_input);
@@ -3376,10 +3376,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_float("convert_long8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rtp((float8)sat_input);
@@ -3397,10 +3397,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_float("convert_long8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rtn((float8)sat_input);
@@ -3431,10 +3431,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_float("convert_ulong8(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat((float8)sat_input);
@@ -3452,10 +3452,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_float("convert_ulong8_rte(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rte((float8)sat_input);
@@ -3473,10 +3473,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_float("convert_ulong8_rtz(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rtz((float8)sat_input);
@@ -3494,10 +3494,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_float("convert_ulong8_rtp(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rtp((float8)sat_input);
@@ -3515,10 +3515,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_float("convert_ulong8_rtn(float8)", i, &float_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (float)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (float)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rtn((float8)sat_input);
@@ -3549,10 +3549,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_double("convert_char8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat((double8)sat_input);
@@ -3570,10 +3570,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_double("convert_char8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rte((double8)sat_input);
@@ -3591,10 +3591,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_double("convert_char8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rtz((double8)sat_input);
@@ -3612,10 +3612,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_double("convert_char8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rtp((double8)sat_input);
@@ -3633,10 +3633,10 @@ kernel void test_convert_type_8()
     }
     compare_char_elements_double("convert_char8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (char8)convert_char_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (char8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (char8)max_expected;
     }
     actual.value = convert_char8_sat_rtn((double8)sat_input);
@@ -3667,10 +3667,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_double("convert_uchar8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat((double8)sat_input);
@@ -3688,10 +3688,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_double("convert_uchar8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rte((double8)sat_input);
@@ -3709,10 +3709,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_double("convert_uchar8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rtz((double8)sat_input);
@@ -3730,10 +3730,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_double("convert_uchar8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rtp((double8)sat_input);
@@ -3751,10 +3751,10 @@ kernel void test_convert_type_8()
     }
     compare_uchar_elements_double("convert_uchar8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uchar8)convert_uchar_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uchar8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uchar8)max_expected;
     }
     actual.value = convert_uchar8_sat_rtn((double8)sat_input);
@@ -3785,10 +3785,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_double("convert_short8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat((double8)sat_input);
@@ -3806,10 +3806,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_double("convert_short8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rte((double8)sat_input);
@@ -3827,10 +3827,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_double("convert_short8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rtz((double8)sat_input);
@@ -3848,10 +3848,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_double("convert_short8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rtp((double8)sat_input);
@@ -3869,10 +3869,10 @@ kernel void test_convert_type_8()
     }
     compare_short_elements_double("convert_short8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (short8)convert_short_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (short8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (short8)max_expected;
     }
     actual.value = convert_short8_sat_rtn((double8)sat_input);
@@ -3903,10 +3903,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_double("convert_ushort8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat((double8)sat_input);
@@ -3924,10 +3924,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_double("convert_ushort8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rte((double8)sat_input);
@@ -3945,10 +3945,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_double("convert_ushort8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rtz((double8)sat_input);
@@ -3966,10 +3966,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_double("convert_ushort8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rtp((double8)sat_input);
@@ -3987,10 +3987,10 @@ kernel void test_convert_type_8()
     }
     compare_ushort_elements_double("convert_ushort8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ushort8)convert_ushort_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ushort8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ushort8)max_expected;
     }
     actual.value = convert_ushort8_sat_rtn((double8)sat_input);
@@ -4021,10 +4021,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_double("convert_int8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat((double8)sat_input);
@@ -4042,10 +4042,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_double("convert_int8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rte((double8)sat_input);
@@ -4063,10 +4063,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_double("convert_int8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rtz((double8)sat_input);
@@ -4084,10 +4084,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_double("convert_int8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rtp((double8)sat_input);
@@ -4105,10 +4105,10 @@ kernel void test_convert_type_8()
     }
     compare_int_elements_double("convert_int8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (int8)convert_int_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (int8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (int8)max_expected;
     }
     actual.value = convert_int8_sat_rtn((double8)sat_input);
@@ -4139,10 +4139,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_double("convert_uint8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat((double8)sat_input);
@@ -4160,10 +4160,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_double("convert_uint8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rte((double8)sat_input);
@@ -4181,10 +4181,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_double("convert_uint8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rtz((double8)sat_input);
@@ -4202,10 +4202,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_double("convert_uint8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rtp((double8)sat_input);
@@ -4223,10 +4223,10 @@ kernel void test_convert_type_8()
     }
     compare_uint_elements_double("convert_uint8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (uint8)convert_uint_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (uint8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (uint8)max_expected;
     }
     actual.value = convert_uint8_sat_rtn((double8)sat_input);
@@ -4259,10 +4259,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_double("convert_long8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat((double8)sat_input);
@@ -4280,10 +4280,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_double("convert_long8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rte((double8)sat_input);
@@ -4301,10 +4301,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_double("convert_long8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rtz((double8)sat_input);
@@ -4322,10 +4322,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_double("convert_long8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rtp((double8)sat_input);
@@ -4343,10 +4343,10 @@ kernel void test_convert_type_8()
     }
     compare_long_elements_double("convert_long8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (long8)convert_long_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (long8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (long8)max_expected;
     }
     actual.value = convert_long8_sat_rtn((double8)sat_input);
@@ -4381,10 +4381,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_double("convert_ulong8(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat((double8)sat_input);
@@ -4402,10 +4402,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_double("convert_ulong8_rte(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rte(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rte((double8)sat_input);
@@ -4423,10 +4423,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_double("convert_ulong8_rtz(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rtz(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rtz((double8)sat_input);
@@ -4444,10 +4444,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_double("convert_ulong8_rtp(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rtp(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rtp((double8)sat_input);
@@ -4465,10 +4465,10 @@ kernel void test_convert_type_8()
     }
     compare_ulong_elements_double("convert_ulong8_rtn(double8)", i, &double_values[i], 0, expected.raw, actual.raw, 8);
     expected.value = (ulong8)convert_ulong_rtn(sat_input);
-    if (sat_input < min_expected) {
+    if (sat_input <= (double)min_expected) {
        expected.value = (ulong8)min_expected;
     }
-    else if (sat_input > max_expected) {
+    else if (sat_input >= (double)max_expected) {
        expected.value = (ulong8)max_expected;
     }
     actual.value = convert_ulong8_sat_rtn((double8)sat_input);
diff --git a/tests/kernel/test_shuffle.cc b/tests/kernel/test_shuffle.cc
index 4ee8a0e..cdd3ba1 100644
--- a/tests/kernel/test_shuffle.cc
+++ b/tests/kernel/test_shuffle.cc
@@ -28,7 +28,7 @@ cl_command_queue queue;
 
 #define ERRCHECK()  if (check_cl_error(errcode, __LINE__, __FUNCTION__)) abort();
 
-static const unsigned vecelts[5]={2,3,4,8,16};
+static const unsigned vecelts[] = {2,4,8,16};
 static const int stimuli[] = {4, 2, 69, 4, 5, 0, 45, 16, 4, 6, 1, 18, 28, 14,
                  22, 16, 8, 2, 0, 31, 42, 11, 62, 88, 99, 23, 13};
 
@@ -71,8 +71,8 @@ private:
         mask_type = "UNKNOWN_MASK";
     }
 
-    for(unsigned n_loop=0; n_loop<5; n_loop++) {
-        for(unsigned m_loop=0; m_loop<5; m_loop++) {
+    for(unsigned n_loop=0; n_loop<4; n_loop++) {
+        for(unsigned m_loop=0; m_loop<4; m_loop++) {
 
             n = vecelts[n_loop];
             m = vecelts[m_loop];
@@ -296,8 +296,8 @@ public:
     ERRCHECK()
 
     unsigned errors = 0;
-    for(unsigned n_loop=0; n_loop<5; n_loop++) {
-          for(unsigned m_loop=0; m_loop<5; m_loop++) {
+    for(unsigned n_loop=0; n_loop<4; n_loop++) {
+          for(unsigned m_loop=0; m_loop<4; m_loop++) {
               unsigned m = vecelts[m_loop];
               for(unsigned i=0; i<m; i++) {
                 in2[i]=(D)(i+m);
@@ -336,6 +336,11 @@ int main( int argc, char *argv[])
 
 	poclu_get_any_device( &ctx, &did, &queue);
 
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
 	/* Loop over input (m) and output (n) vector lengths.
 	 * The big if-else is needed to pass the string
 	 * representation to runtest.
@@ -395,6 +400,12 @@ int main( int argc, char *argv[])
 
 	   }
 
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic pop
+#endif
+	clReleaseCommandQueue(queue);
+	clReleaseContext(ctx);
+
 	if( num_errors == 0)
 		std::cout << "OK" << std::endl;
 	return num_errors;
diff --git a/tests/regression/CMakeLists.txt b/tests/regression/CMakeLists.txt
index ad9e922..3283bae 100644
--- a/tests/regression/CMakeLists.txt
+++ b/tests/regression/CMakeLists.txt
@@ -1,7 +1,7 @@
 #=============================================================================
 #   CMake build system files
 #
-#   Copyright (c) 2014 pocl developers
+#   Copyright (c) 2014-2017 pocl developers
 #
 #   Permission is hereby granted, free of charge, to any person obtaining a copy
 #   of this software and associated documentation files (the "Software"), to deal
@@ -23,10 +23,12 @@
 #
 #=============================================================================
 
+
 # Mac OS X currently can't digest cl2.hpp, which all reg tests include
 if(NOT APPLE)
 
 set(C_PROGRAMS_TO_BUILD test_assign_loop_variable_to_privvar_makes_it_local
+     test_program_from_binary_with_local_1_1_1
      test_assign_loop_variable_to_privvar_makes_it_local_2)
 foreach(PROG ${C_PROGRAMS_TO_BUILD})
   if(MSVC)
@@ -43,8 +45,8 @@ set(PROGRAMS_TO_BUILD test_barrier_between_for_loops test_early_return
   test_simple_for_with_a_barrier test_structs_as_args test_vectors_as_args
   test_barrier_before_return test_infinite_loop test_constant_array
   test_undominated_variable test_setargs test_null_arg
-  test_fors_with_var_iteration_counts test_issue_231 test_issue_445)
-
+  test_fors_with_var_iteration_counts test_issue_231 test_issue_445
+  test_autolocals_in_constexprs test_issue_553 test_issue_577)
 
 if (MSVC)
   add_compile_options(${OPENCL_CFLAGS})
@@ -63,7 +65,11 @@ endforeach()
 
 ######################################################################
 
-#AT_SETUP([phi nodes not replicated (repl) - lp:927573])
+add_test_pocl(NAME "regression/test_issue_231" COMMAND "test_issue_231")
+add_test_pocl(NAME "regression/test_issue_445" COMMAND "test_issue_445")
+add_test_pocl(NAME "regression/test_issue_553" COMMAND "test_issue_553")
+add_test_pocl(NAME "regression/test_issue_577" COMMAND "test_issue_577")
+
 
 # repl
 
@@ -95,6 +101,9 @@ add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_m
 add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_REPL"
               COMMAND "test_assign_loop_variable_to_privvar_makes_it_local_2")
 
+add_test_pocl(NAME "regression/test_program_from_binary_with_local_1_1_1_REPL"
+              COMMAND "test_program_from_binary_with_local_1_1_1")
+
 set_tests_properties("regression/phi_nodes_not_replicated_REPL"
   "regression/issues_with_local_pointers_REPL"
   "regression/barrier_between_two_for_loops_REPL"
@@ -108,6 +117,7 @@ set_tests_properties("regression/phi_nodes_not_replicated_REPL"
   "regression/undominated_variable_from_conditional_barrier_handling_REPL"
   "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_REPL"
   "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_REPL"
+  "regression/test_program_from_binary_with_local_1_1_1_REPL"
   PROPERTIES
     ENVIRONMENT "POCL_WORK_GROUP_METHOD=workitemrepl"
     COST 1.5
@@ -146,6 +156,9 @@ add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_m
 add_test_pocl(NAME "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_LOOPS"
               COMMAND "test_assign_loop_variable_to_privvar_makes_it_local_2")
 
+add_test_pocl(NAME "regression/test_program_from_binary_with_local_1_1_1_LOOPS"
+              COMMAND "test_program_from_binary_with_local_1_1_1")
+
 set_tests_properties("regression/phi_nodes_not_replicated_LOOPS"
   "regression/issues_with_local_pointers_LOOPS"
   "regression/barrier_between_two_for_loops_LOOPS"
@@ -159,6 +172,7 @@ set_tests_properties("regression/phi_nodes_not_replicated_LOOPS"
   "regression/undominated_variable_from_conditional_barrier_handling_LOOPS"
   "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_LOOPS"
   "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_2_LOOPS"
+  "regression/test_program_from_binary_with_local_1_1_1_LOOPS"
   PROPERTIES
     ENVIRONMENT "POCL_WORK_GROUP_METHOD=workitemloops"
     COST 1.5
@@ -177,7 +191,8 @@ add_test_pocl(NAME "regression/passing_a_constant_array_as_an_arg" COMMAND "test
 
 add_test_pocl(NAME "regression/case_with_multiple_variable_length_loops_and_a_barrier_in_one" COMMAND "test_fors_with_var_iteration_counts")
 
-# these 2 will fail
+add_test_pocl(NAME "regression/autolocals_in_constexprs" COMMAND "test_autolocals_in_constexprs")
+
 add_test_pocl(NAME "regression/struct_kernel_arguments" COMMAND "test_structs_as_args")
 
 add_test_pocl(NAME "regression/vector_kernel_arguments" COMMAND "test_vectors_as_args")
@@ -187,6 +202,7 @@ set_tests_properties("regression/setting_a_buffer_argument_to_NULL_causes_a_segf
   "regression/passing_a_constant_array_as_an_arg"
   "regression/case_with_multiple_variable_length_loops_and_a_barrier_in_one"
   "regression/struct_kernel_arguments" "regression/vector_kernel_arguments"
+  "regression/autolocals_in_constexprs"
   PROPERTIES
     COST 1.5
     PROCESSORS 1
@@ -204,12 +220,38 @@ set_tests_properties("regression/barrier_between_two_for_loops_LOOPS"
   PROPERTIES
     LABELS "internal;regression;tce")
 
-# The vector/struct kernel arguments are known to be flaky and
-# work by luck sometimes. Disable them for now.
-#if((LLVM_CXXFLAGS MATCHES "_DEBUG") OR (NOT LLVM_CXXFLAGS MATCHES "DNDEBUG"))
-#  set_tests_properties("regression/vector_kernel_arguments"
-#    PROPERTIES  WILL_FAIL 1)
-#endif()
+# Label tests that work with CUDA backend
+set_property(TEST
+  "regression/phi_nodes_not_replicated_REPL"
+  "regression/issues_with_local_pointers_REPL"
+  "regression/barrier_between_two_for_loops_REPL"
+  "regression/simple_for-loop_with_a_barrier_inside_REPL"
+  "regression/for-loop_with_computation_after_the_brexit_REPL"
+  "regression/for-loop_with_a_variable_iteration_count_REPL"
+  "regression/early_return_before_a_barrier_region_REPL"
+  "regression/id-dependent_computation_before_kernel_exit_REPL"
+  "regression/barrier_just_before_return_REPL"
+  "regression/undominated_variable_from_conditional_barrier_handling_REPL"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_REPL"
+  "regression/test_program_from_binary_with_local_1_1_1_REPL"
+  "regression/phi_nodes_not_replicated_LOOPS"
+  "regression/issues_with_local_pointers_LOOPS"
+  "regression/barrier_between_two_for_loops_LOOPS"
+  "regression/simple_for-loop_with_a_barrier_inside_LOOPS"
+  "regression/for-loop_with_computation_after_the_brexit_LOOPS"
+  "regression/for-loop_with_a_variable_iteration_count_LOOPS"
+  "regression/early_return_before_a_barrier_region_LOOPS"
+  "regression/id-dependent_computation_before_kernel_exit_LOOPS"
+  "regression/barrier_just_before_return_LOOPS"
+  "regression/undominated_variable_from_conditional_barrier_handling_LOOPS"
+  "regression/assigning_a_loop_iterator_variable_to_a_private_makes_it_local_LOOPS"
+  "regression/test_program_from_binary_with_local_1_1_1_LOOPS"
+  "regression/setting_a_buffer_argument_to_NULL_causes_a_segfault"
+  "regression/clSetKernelArg_overwriting_the_previous_kernel's_args"
+  "regression/case_with_multiple_variable_length_loops_and_a_barrier_in_one"
+  "regression/vector_kernel_arguments"
+  "regression/autolocals_in_constexprs"
+  APPEND PROPERTY LABELS "cuda")
 
 if (NOT CLANG_IS_PATCHED_FOR_SPIR_CC)
 set_tests_properties("regression/struct_kernel_arguments"
diff --git a/tests/regression/test_autolocals_in_constexprs.cpp b/tests/regression/test_autolocals_in_constexprs.cpp
new file mode 100644
index 0000000..de5c421
--- /dev/null
+++ b/tests/regression/test_autolocals_in_constexprs.cpp
@@ -0,0 +1,114 @@
+/* AutomaticLocals pass might break the IR if it promotes a local used in a
+   constant expression to an argument (which is no longer constant)
+   (GitHub issue #467).
+
+   Copyright (c) 2017 pocl developers
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+// Enable OpenCL C++ exceptions
+#define CL_HPP_ENABLE_EXCEPTIONS
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+
+#include "poclu.h"
+
+static char
+kernelSourceCode[] =
+"kernel void test_kernel (global ulong *output)\n"
+"{\n"
+"   local char  l_int8[3]; \n"
+"   local int   l_int32[3]; \n"
+"   local float l_float[3]; \n"
+"   output[0] = (ulong)l_int8;\n"
+"   output[1] = (ulong)l_int32;\n"
+"   output[2] = (ulong)l_float;\n"
+"}\n";
+
+int
+main(void)
+{
+  uint64_t A[3];
+
+  try {
+    std::vector<cl::Platform> platformList;
+
+    // Pick platform
+    cl::Platform::get(&platformList);
+
+    // Pick first platform
+    cl_context_properties cprops[] = {
+      CL_CONTEXT_PLATFORM, (cl_context_properties)(platformList[0])(), 0};
+    cl::Context context(CL_DEVICE_TYPE_CPU|CL_DEVICE_TYPE_GPU, cprops);
+
+    // Query the set of devices attched to the context
+    std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
+
+    // Create and program from source
+    cl::Program::Sources sources({kernelSourceCode});
+    cl::Program program(context, sources);
+
+    cl_device_id dev_id = devices.at(0)();
+
+    for (int i = 0; i < 3; ++i)
+      A[i] = 0;
+
+    // Build program
+    program.build(devices);
+
+    cl::Buffer aBuffer = cl::Buffer(
+        context,
+        CL_MEM_COPY_HOST_PTR,
+        3 * sizeof(uint64_t),
+        (void *) &A[0]);
+
+    // Create kernel object
+    cl::Kernel kernel(program, "test_kernel");
+
+    // Set kernel args
+    kernel.setArg(0, aBuffer);
+
+    // Create command queue
+    cl::CommandQueue queue(context, devices[0], 0);
+
+    // Do the work
+    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(1));
+    queue.finish();
+
+    // We don't actually care about the result.
+  }
+  catch (cl::Error err) {
+    std::cerr
+      << "ERROR: "
+      << err.what()
+      << "("
+      << err.err()
+      << ")"
+      << std::endl;
+    return EXIT_FAILURE;
+  }
+
+  return EXIT_SUCCESS;
+}
diff --git a/tests/regression/test_fors_with_var_iteration_counts.cpp b/tests/regression/test_fors_with_var_iteration_counts.cpp
index 9b36f27..974a13d 100644
--- a/tests/regression/test_fors_with_var_iteration_counts.cpp
+++ b/tests/regression/test_fors_with_var_iteration_counts.cpp
@@ -128,7 +128,7 @@ main(void)
 
         // Map cBuffer to host pointer. This enforces a sync with 
         // the host backing space, remember we choose GPU device.
-        int * output = (int *) queue.enqueueMapBuffer(
+        queue.enqueueMapBuffer(
             cBuffer,
             CL_TRUE, // block 
             CL_MAP_READ,
diff --git a/tests/regression/test_issue_231.cpp b/tests/regression/test_issue_231.cpp
index b288e1a..f75a597 100644
--- a/tests/regression/test_issue_231.cpp
+++ b/tests/regression/test_issue_231.cpp
@@ -5,7 +5,6 @@
 #define CL_HPP_ENABLE_EXCEPTIONS
 #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 #define CL_HPP_TARGET_OPENCL_VERSION 120
-#define CL_HPP_CL_1_2_DEFAULT_BUILD
 #include <CL/cl2.hpp>
 #include <iostream>
 
@@ -83,6 +82,11 @@ int main(int argc, char *argv[])
   cl::CommandQueue queue = cl::CommandQueue::getDefault();
   cl::Program program(SOURCE, true);
 
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
   auto kernel = cl::KernelFunctor<cl::Buffer, cl::Buffer, cl::Buffer, cl_int, cl_int, cl::Buffer>(program, "scan_scan_intervals_lev1");
 
   cl_int i = 0;
@@ -90,5 +94,9 @@ int main(int argc, char *argv[])
   kernel(cl::EnqueueArgs(queue, cl::NDRange(16), cl::NDRange(16)),
          buffer, buffer, buffer, i, i, buffer);
 
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic pop
+#endif
+
   queue.finish();
 }
diff --git a/tests/regression/test_issue_445.cpp b/tests/regression/test_issue_445.cpp
index 8b39f2b..aed5aa7 100644
--- a/tests/regression/test_issue_445.cpp
+++ b/tests/regression/test_issue_445.cpp
@@ -6,7 +6,6 @@
 #define CL_HPP_ENABLE_EXCEPTIONS
 #define CL_HPP_MINIMUM_OPENCL_VERSION 120
 #define CL_HPP_TARGET_OPENCL_VERSION 120
-#define CL_HPP_CL_1_2_DEFAULT_BUILD
 #include <CL/cl2.hpp>
 #include <iostream>
 
@@ -29,7 +28,7 @@ private_local_array(__global int *__restrict__ out)
 }
 )CLC";
 
-int main(int, char *)
+int main(int, char **)
 {
   try {
     int N = 9;
diff --git a/tests/regression/test_issue_553.cpp b/tests/regression/test_issue_553.cpp
new file mode 100644
index 0000000..bee52e3
--- /dev/null
+++ b/tests/regression/test_issue_553.cpp
@@ -0,0 +1,75 @@
+
+// See https://github.com/pocl/pocl/issues/553
+
+#define CL_HPP_ENABLE_EXCEPTIONS
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+#include <iostream>
+
+using namespace std;
+
+const char *SOURCE = R"RAW(
+// Expected output:
+//  outer=A inner=B
+//  + outer=A inner=B
+// for each value of A and B.
+// However I see three copies of the second line (starting with +).
+// Commenting out any one line marked with YYYY bring it down to two copies,
+// and commenting out any one line marked with XXXX gives the expected output.
+
+__kernel void pocltest(int xarg1, int xarg2) {
+  int outerend = 1;
+  int innerend = 1;
+  outerend = 2; // YYYY
+  innerend = 2; // YYYY
+  int outer = 0;
+  int inner = 0;
+  int arg1 = 1;
+  int arg2 = 1;
+  arg1 = xarg1; // XXXX
+  arg2 = xarg2; // XXXX
+  for (outer = 0; outer < outerend; outer++) // XXXX
+  {
+    for (inner = 0; inner < innerend; inner++) // XXXX
+    {
+      //barrier(CLK_LOCAL_MEM_FENCE);
+	    printf("outer=%d inner=%d lid=%d\n", outer, inner, get_local_id(0));
+	    if (arg2 > arg1) // XXXX
+	    {
+        barrier(CLK_LOCAL_MEM_FENCE); // XXXX
+	    }
+	    if (arg1 > 0) // XXXX
+	    {
+        barrier(CLK_LOCAL_MEM_FENCE); // XXXX
+	    }
+	    printf("+ outer=%d inner=%d lid=%d\n", outer, inner, get_local_id(0));
+      //barrier(CLK_LOCAL_MEM_FENCE); /* This barrier also fixes it.  */
+    }
+  }
+}
+)RAW";
+
+int main(int argc, char *argv[])
+{
+  cl::Device device = cl::Device::getDefault();
+  cl::CommandQueue queue = cl::CommandQueue::getDefault();
+  cl::Program program(SOURCE, true);
+
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wignored-attributes"
+#endif
+
+  auto kernel = cl::KernelFunctor<cl_int, cl_int>(program, "pocltest");
+
+  cl_int i = 0;
+  cl::Buffer buffer;
+  kernel(cl::EnqueueArgs(queue, cl::NDRange(2), cl::NDRange(2)), 1, 2);
+
+#if (__GNUC__ > 5)
+#pragma GCC diagnostic pop
+#endif
+
+  queue.finish();
+}
diff --git a/tests/regression/test_issue_577.cpp b/tests/regression/test_issue_577.cpp
new file mode 100644
index 0000000..d365a01
--- /dev/null
+++ b/tests/regression/test_issue_577.cpp
@@ -0,0 +1,41 @@
+
+// Trying to build a faulty program twice results in NULL deref
+// See https://github.com/pocl/pocl/issues/577
+// should print "BUILD ERROR" twice then "OK" once.
+
+#define CL_HPP_ENABLE_EXCEPTIONS
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#include <CL/cl2.hpp>
+#include <iostream>
+
+const char *SOURCE = R"RAW(
+
+  __kernel void foo(__global int *input) {
+    !@#$%^&*();
+  }
+
+)RAW";
+
+int main(int argc, char *argv[]) {
+  cl_int err;
+  unsigned error_count = 0;
+  cl::Program program(SOURCE, false, &err);
+
+  for (unsigned i = 0; i < 2; i++) {
+    try {
+      program.compile();
+    } catch (cl::BuildError &e) {
+      std::cout << "BUILD ERROR\n";
+      error_count++;
+    }
+  }
+
+  if (error_count == 2) {
+    std::cout << "OK\n";
+    return 0;
+  } else {
+    std::cout << "FAIL\n";
+    return 1;
+  }
+}
diff --git a/tests/regression/test_locals.cpp b/tests/regression/test_locals.cpp
index 67167dd..09d31f5 100644
--- a/tests/regression/test_locals.cpp
+++ b/tests/regression/test_locals.cpp
@@ -96,15 +96,12 @@ main(void)
             BUFFER_SIZE * sizeof(float), 
             (void *) &A[0]);
 
-        cl::Buffer localBuffer = cl::Buffer(
-            context, 0, BUFFER_SIZE * sizeof(int), NULL);
-
         // Create kernel object
         cl::Kernel kernel(program, "test_kernel");
 
         // Set kernel args
         kernel.setArg(0, aBuffer);
-        kernel.setArg(1, localBuffer);
+        kernel.setArg(1, (BUFFER_SIZE * sizeof(int)), NULL);
         kernel.setArg(2, scalar);
 
         // Create command queue
diff --git a/tests/regression/test_program_from_binary_with_local_1_1_1.c b/tests/regression/test_program_from_binary_with_local_1_1_1.c
new file mode 100644
index 0000000..6a195e6
--- /dev/null
+++ b/tests/regression/test_program_from_binary_with_local_1_1_1.c
@@ -0,0 +1,149 @@
+/* Tests a case where a program created from binary (with a loop) is run with
+   a local size of (1, 1, 1).
+
+   Copyright (c) 2017 pocl developers
+
+   Permission is hereby granted, free of charge, to any person obtaining a copy
+   of this software and associated documentation files (the "Software"), to deal
+   in the Software without restriction, including without limitation the rights
+   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+   copies of the Software, and to permit persons to whom the Software is
+   furnished to do so, subject to the following conditions:
+
+   The above copyright notice and this permission notice shall be included in
+   all copies or substantial portions of the Software.
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+   THE SOFTWARE.
+*/
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <CL/opencl.h>
+
+#define NB_WORK_GROUP 32
+#define VEC_SIZE 32
+
+const char *kernelSource =
+  "__kernel void test(__global unsigned * restrict buffer,          "
+  "                   __local unsigned * restrict local_input,      "
+  "                   const unsigned vec_size)                      "
+  "{                                                                "
+  "  unsigned i, j;                                                 "
+  "  size_t gid = get_global_id(0);                                 "
+  "  size_t lid = get_local_id(0);                                  "
+  "  size_t lsize = get_local_size(0);                              "
+  "  event_t event_read, event_write;                               "
+  "  event_read = async_work_group_copy(local_input, &buffer[gid*vec_size*lsize], vec_size*lsize, 0);"
+  "  for (i=0; i<vec_size; i++)                                     "
+  "    {                                                            "
+  "      if (i == 0)                                                "
+  "        wait_group_events(1, &event_read);                       "
+  "      local_input[i*lsize+lid]++;                                "
+  "    }                                                            "
+  "  event_write = async_work_group_copy(&buffer[gid*vec_size*lsize], local_input, vec_size*lsize, event_write);"
+  "  wait_group_events(1, &event_write);                            "
+  "}                                                                ";
+
+int main ()
+{
+	cl_platform_id platform;
+  cl_device_id device;
+  cl_context context;
+  cl_command_queue queue;
+  cl_program program_source, program_binary;
+  cl_kernel kernel;
+  cl_mem buffer;
+  cl_uint vec_size = VEC_SIZE;
+  cl_uint input_buffer[NB_WORK_GROUP * VEC_SIZE] = {0};
+  cl_uint output_buffer[NB_WORK_GROUP * VEC_SIZE] = {0};
+  cl_int err;
+
+  size_t global_size = NB_WORK_GROUP;
+  size_t local_size = 1;
+  size_t sizeof_buffer = global_size * vec_size * sizeof(unsigned);
+  size_t binary_size;
+
+  char *binary;
+
+  unsigned k, i;
+
+  for (k=0; k<global_size; k++)
+    {
+      for (i=0; i<vec_size; i++)
+        {
+          input_buffer[k*vec_size+i]=k*vec_size+i;
+        }
+    }
+
+  err = clGetPlatformIDs(1, &platform, NULL);
+  assert(!err);
+  err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);
+  assert(!err);
+  context = clCreateContext(0, 1, &device, NULL, NULL, &err);
+  assert(!err);
+  queue = clCreateCommandQueue(context, device, 0, &err);
+  assert(!err);
+
+  program_source = clCreateProgramWithSource(context, 1, &kernelSource, NULL, &err);
+  assert(!err);
+  err = clBuildProgram(program_source, 0, NULL, NULL, NULL, NULL);
+  assert(!err);
+  err = clGetProgramInfo(program_source, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL);
+  assert(!err);
+  binary = (char *)malloc(sizeof(char)*binary_size);
+  assert(binary);
+  err = clGetProgramInfo(program_source, CL_PROGRAM_BINARIES, sizeof(char*), &binary, NULL);
+  assert(!err);
+
+  program_binary = clCreateProgramWithBinary(context, 1, &device, &binary_size,
+                                             (const unsigned char **)&binary, NULL, &err);
+  assert(!err);
+  err = clBuildProgram(program_binary, 0, NULL, NULL, NULL, NULL);
+  assert(!err);
+  kernel = clCreateKernel(program_binary, "test", &err);
+  assert(!err);
+  buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof_buffer, &input_buffer, &err);
+  assert(!err);
+  err = clSetKernelArg(kernel, 0, sizeof (cl_mem), &buffer);
+  assert(!err);
+  err = clSetKernelArg(kernel, 1, sizeof (unsigned) * vec_size, NULL);
+  assert(!err);
+  err = clSetKernelArg(kernel, 2, sizeof (cl_uint), &vec_size);
+  assert(!err);
+  err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
+  assert(!err);
+  clFinish(queue);
+  err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof_buffer, output_buffer, 0, NULL, NULL);
+  assert(!err);
+
+  for (k=0; k<global_size; k++)
+    {
+      for (i=0; i<vec_size; i++)
+        {
+          unsigned expected = (input_buffer[k*vec_size+i]+1);
+          if (output_buffer[k*vec_size+i] != expected)
+            {
+              printf("Error at %u %u : %u != %u\n", k, i,
+                     output_buffer[k*vec_size+i], expected);
+              return 1;
+            }
+        }
+    }
+
+  clReleaseMemObject(buffer);
+  clReleaseKernel(kernel);
+  clReleaseProgram(program_source);
+  clReleaseProgram(program_binary);
+  clReleaseCommandQueue(queue);
+  clReleaseContext(context);
+  clReleaseDevice(device);
+
+  return 0;
+}
diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt
index 7336775..c68fe32 100644
--- a/tests/runtime/CMakeLists.txt
+++ b/tests/runtime/CMakeLists.txt
@@ -33,15 +33,6 @@ set(PROGRAMS_TO_BUILD test_clFinish test_clGetDeviceInfo test_clGetEventInfo
   test_enqueue_kernel_from_binary test_user_event
   test_clSetMemObjectDestructorCallback)
 
-#EXTRA_DIST= \
-# test_kernel_src_in_pwd.h \
-# test_clCreateKernelsInProgram.cl \
-# test_data/test_kernel_src_in_another_dir.h
-
-#AM_LDFLAGS = @OPENCL_LIBS@ ../../lib/poclu/libpoclu.la
-# POCLU_LINK_OPTIONS
-
-#AM_CPPFLAGS = -I$(top_srcdir)/fix-include -I$(top_srcdir)/include @OPENCL_CFLAGS@
 add_compile_options(${OPENCL_CFLAGS})
 
 foreach(PROG ${PROGRAMS_TO_BUILD})
@@ -75,8 +66,7 @@ add_test_pocl(NAME "runtime/clFinish" COMMAND "test_clFinish")
 
 add_test_pocl(NAME "runtime/test_event_cycle" COMMAND "test_event_cycle")
 
-# currently fails, see commit 13e5bc89a6b7675efbc
-#add_test("runtime/test_link_error" "test_link_error")
+add_test_pocl(NAME "runtime/test_link_error" COMMAND "test_link_error")
 
 add_test_pocl(NAME "runtime/test_read-copy-write-buffer" COMMAND "test_read-copy-write-buffer")
 
@@ -110,10 +100,10 @@ set_tests_properties( "runtime/clGetDeviceInfo" "runtime/clEnqueueNativeKernel"
   "runtime/clGetSupportedImageFormats" "runtime/clCreateKernelsInProgram"
   "runtime/clCreateKernel" "runtime/clGetKernelArgInfo"
   "runtime/test_kernel_cache_includes" "runtime/test_event_cycle"
-  "runtime/test_read-copy-write-buffer" "runtime/test_buffer-image-copy" #"runtime/test_link_error"
+  "runtime/test_read-copy-write-buffer" "runtime/test_buffer-image-copy"
   "runtime/test_event_free" "runtime/clCreateSubDevices"
   "runtime/test_enqueue_kernel_from_binary" "runtime/test_user_event"
-  "runtime/clSetMemObjectDestructorCallback"
+  "runtime/clSetMemObjectDestructorCallback" "runtime/test_link_error"
   PROPERTIES
     COST 2.0
     PROCESSORS 1
@@ -135,3 +125,21 @@ set_tests_properties("runtime/clFinish"
 set_tests_properties("runtime/test_kernel_cache_includes"
   PROPERTIES PASS_REGULAR_EXPRESSION
   "function 1.*first include.*function 2.*second include")
+
+# Label tests that work with CUDA backend
+set_property(TEST
+  "runtime/clGetDeviceInfo"
+  "runtime/clGetEventInfo"
+  "runtime/clCreateProgramWithBinary"
+  "runtime/test_kernel_cache_includes"
+  "runtime/clFinish"
+  "runtime/test_read-copy-write-buffer"
+  "runtime/test_buffer-image-copy"
+  "runtime/clSetEventCallback"
+  "runtime/clGetSupportedImageFormats"
+  "runtime/clCreateKernelsInProgram"
+  "runtime/test_event_cycle"
+  "runtime/test_user_event"
+  "runtime/clSetMemObjectDestructorCallback"
+  "runtime/test_enqueue_kernel_from_binary"
+  APPEND PROPERTY LABELS "cuda")
diff --git a/tests/runtime/test_buffer-image-copy.c b/tests/runtime/test_buffer-image-copy.c
index 32cce26..ae13a9c 100644
--- a/tests/runtime/test_buffer-image-copy.c
+++ b/tests/runtime/test_buffer-image-copy.c
@@ -186,13 +186,15 @@ main(void)
 
       TEST_ASSERT(memcmp(buf_map, host_buf, buf_size) == 0);
 
-      CHECK_CL_ERROR(clEnqueueUnmapMemObject(queue, buf, buf_map, 0, NULL, NULL));
-      CHECK_CL_ERROR(clFinish(queue));
+      CHECK_CL_ERROR (
+          clEnqueueUnmapMemObject (queue, buf, buf_map, 0, NULL, NULL));
+      CHECK_CL_ERROR (clFinish (queue));
 
       free(host_buf);
-      CHECK_CL_ERROR(clReleaseMemObject(img));
-      CHECK_CL_ERROR(clReleaseMemObject(buf));
-      CHECK_CL_ERROR(clReleaseCommandQueue(queue));
+      CHECK_CL_ERROR (clReleaseMemObject (img));
+      CHECK_CL_ERROR (clReleaseMemObject (buf));
+      CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+      CHECK_CL_ERROR (clReleaseContext (context));
     }
   }
   return EXIT_SUCCESS;
diff --git a/tests/runtime/test_clBuildProgram.c b/tests/runtime/test_clBuildProgram.c
index e90bcbb..551b98e 100644
--- a/tests/runtime/test_clBuildProgram.c
+++ b/tests/runtime/test_clBuildProgram.c
@@ -51,6 +51,10 @@ static const char invalid_kernel[] =
 static const char warning_kernel[] =
   "kernel void test_kernel(int j, k) { return; }\n";
 
+static const char missing_symbol_kernel[] = "kernel void test_kernel() { "
+                                            "one_does_not_simply_walk_into_"
+                                            "mordor(); }\n";
+
 /* kernel can have any name, except main() starting from OpenCL 2.0 */
 static const char valid_kernel[] =
   "kernel void init(global int *arg) { return; }\n";
@@ -65,9 +69,9 @@ void buildprogram_callback(cl_program program, void *user_data)
   fprintf(stderr, "cl_program callback (via pfn_notify)\n");
 
   if (user_data == (void*)FAKE_PTR)
-    fprintf(stderr, "OK\n");
+    fprintf (stderr, "build callback successful\n");
   else
-    fprintf(stderr, "FAIL\n");
+    fprintf (stderr, "build callback FAILED\n");
 }
 
 
@@ -137,12 +141,17 @@ main(void){
               size_t log_size = 0;
               CHECK_CL_ERROR(clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,
                       0, NULL, &log_size));
-              char *log = malloc(log_size);
-              CHECK_CL_ERROR(clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,
-                      log_size, log, NULL));
-              log[log_size] = '\0';
-              fprintf(stderr, "preprocess failure log[%u]: %s\n", i, log);
-              free(log);
+              if (log_size)
+                {
+                  char *log = malloc (log_size + 1);
+                  fprintf (stderr, "log: %p\n", log);
+                  CHECK_CL_ERROR (clGetProgramBuildInfo (program, devices[i],
+                                                         CL_PROGRAM_BUILD_LOG,
+                                                         log_size, log, NULL));
+                  log[log_size] = '\0';
+                  fprintf (stderr, "preprocess failure log[%u]: %s\n", i, log);
+                  free (log);
+                }
       }
       /*Lets not release the program as we need it in the next test case*/
       /*CHECK_CL_ERROR(clReleaseProgram(program));*/
@@ -155,16 +164,19 @@ main(void){
 
       for (i = 0; i < num_devices; ++i) {
           size_t log_size = 0;
-          err = clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,
-              0, NULL, &log_size);
-          CHECK_OPENCL_ERROR_IN("get build log size");
-          char *log = malloc(log_size);
-          err = clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,
-              log_size, log, NULL);
-          CHECK_OPENCL_ERROR_IN("get build log");
-          log[log_size] = '\0';
-          fprintf(stderr, "preprocess failure log[%u]: %s\n", i, log);
-          free(log);
+          CHECK_CL_ERROR (clGetProgramBuildInfo (
+              program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size));
+          if (log_size)
+            {
+              char *log = malloc (log_size + 1);
+              err = clGetProgramBuildInfo (program, devices[i],
+                                           CL_PROGRAM_BUILD_LOG, log_size, log,
+                                           NULL);
+              CHECK_OPENCL_ERROR_IN ("get build log");
+              log[log_size] = '\0';
+              fprintf (stderr, "preprocess failure log[%u]: %s\n", i, log);
+              free (log);
+            }
       }
 
       CHECK_CL_ERROR(clReleaseProgram(program));
@@ -260,19 +272,22 @@ main(void){
 
       for (i = 0; i < num_devices; ++i) {
           size_t log_size = 0;
-          err = clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,
-                                      0, NULL, &log_size);
-          CHECK_OPENCL_ERROR_IN("get build log size");
-          char *log = malloc(log_size);
-          err = clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG,
-                                      log_size, log, NULL);
-          CHECK_OPENCL_ERROR_IN("get build log");
-          log[log_size] = '\0';
-          /*As this build option deprecated after OCL1.0 we should see a warning here*/
-          fprintf(stderr, "Deprecated -cl-strict-aliasing log[%u]: %s\n", i, log);
-
-          free(log);
-
+          CHECK_CL_ERROR (clGetProgramBuildInfo (
+              program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size));
+          if (log_size)
+            {
+              char *log = malloc (log_size + 1);
+              err = clGetProgramBuildInfo (program, devices[i],
+                                           CL_PROGRAM_BUILD_LOG,
+                                           log_size, log, NULL);
+              CHECK_OPENCL_ERROR_IN ("get build log");
+              log[log_size] = '\0';
+              /*As this build option deprecated after OCL1.0 we should see a
+               * warning here*/
+              fprintf (stderr, "Deprecated -cl-strict-aliasing log[%u]: %s\n",
+                       i, log);
+              free (log);
+            }
           cl_program_binary_type bin_type = 0;
           err = clGetProgramBuildInfo(program, devices[i],
                                       CL_PROGRAM_BINARY_TYPE,
@@ -400,6 +415,8 @@ main(void){
     CHECK_CL_ERROR(clBuildProgram(program, num_devices, devices, NULL, NULL, NULL));
 
     CHECK_CL_ERROR(clReleaseProgram(program));
+
+    free (macro_kernel);
   }
 
   /* TEST 12: warning into error */
@@ -432,5 +449,28 @@ main(void){
       CHECK_CL_ERROR(clReleaseProgram(program));
   }
 
+#if !(defined(LLVM_3_6) || defined(LLVM_3_7) ||  defined(LLVM_3_8))
+  /* TEST 13: missing symbols: kernel referring nonexistent function */
+  {
+    size_t kernel_size = strlen (missing_symbol_kernel);
+    const char *kernel_buffer = missing_symbol_kernel;
+
+    program = clCreateProgramWithSource (
+        context, 1, (const char **)&kernel_buffer, &kernel_size, &err);
+    // clCreateProgramWithSource for invalid kernel failed
+    CHECK_OPENCL_ERROR_IN ("clCreateProgramWithSource");
+
+    err = clBuildProgram (program, num_devices, devices, NULL, NULL, NULL);
+    TEST_ASSERT (err == CL_BUILD_PROGRAM_FAILURE);
+
+    CHECK_CL_ERROR (clReleaseProgram (program));
+  }
+#endif
+
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
+  printf ("OK\n");
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_clCreateKernel.c b/tests/runtime/test_clCreateKernel.c
index be66953..1cf38cf 100644
--- a/tests/runtime/test_clCreateKernel.c
+++ b/tests/runtime/test_clCreateKernel.c
@@ -37,6 +37,13 @@ int main(int argc, char **argv)
   TEST_ASSERT(err == CL_INVALID_KERNEL_NAME);
   TEST_ASSERT(kernel == NULL);
 
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseContext (ctx));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
+  free ((void *)krn_src);
+
   printf("OK\n");
 
   return 0;
diff --git a/tests/runtime/test_clCreateKernelsInProgram.c b/tests/runtime/test_clCreateKernelsInProgram.c
index 7181252..f814d71 100644
--- a/tests/runtime/test_clCreateKernelsInProgram.c
+++ b/tests/runtime/test_clCreateKernelsInProgram.c
@@ -66,6 +66,16 @@ int main(int argc, char **argv)
   err = clFinish(queue);
   CHECK_OPENCL_ERROR_IN("clFinish");
 
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseKernel (kernels[0]));
+  CHECK_CL_ERROR (clReleaseKernel (kernels[1]));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseProgram (empty));
+  CHECK_CL_ERROR (clReleaseContext (ctx));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
+  free ((void *)krn_src);
+
   return EXIT_SUCCESS;
 }
 
diff --git a/tests/runtime/test_clCreateProgramWithBinary.c b/tests/runtime/test_clCreateProgramWithBinary.c
index 1f86e92..956d7df 100644
--- a/tests/runtime/test_clCreateProgramWithBinary.c
+++ b/tests/runtime/test_clCreateProgramWithBinary.c
@@ -178,8 +178,13 @@ main(void){
   if (binaries) 
     free(binaries);
   if (program)
-    clReleaseProgram(program);  
+    CHECK_CL_ERROR (clReleaseProgram (program));
   if (program_with_binary)
-    clReleaseProgram(program_with_binary);
+    CHECK_CL_ERROR (clReleaseProgram (program_with_binary));
+  if (context)
+    CHECK_CL_ERROR (clReleaseContext (context));
+
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
   return err == CL_SUCCESS ? EXIT_SUCCESS : EXIT_FAILURE;
 }
diff --git a/tests/runtime/test_clCreateSubDevices.c b/tests/runtime/test_clCreateSubDevices.c
index e18b10d..9e6bbd2 100644
--- a/tests/runtime/test_clCreateSubDevices.c
+++ b/tests/runtime/test_clCreateSubDevices.c
@@ -123,7 +123,7 @@ int main(int argc, char **argv)
     *eqdev = alldevs + 1,
     *countdev = alldevs + 3;
   cl_uint max_cus, max_subs;
-  cl_uint i;
+  cl_uint i, j;
 
   cl_int err = poclu_get_any_device(&ctx, &rootdev, &q);
   CHECK_OPENCL_ERROR_IN("poclu_get_any_device");
@@ -163,15 +163,16 @@ int main(int argc, char **argv)
     dev_pt_size, dev_pt, NULL);
   CHECK_OPENCL_ERROR_IN("CL_DEVICE_PARTITION_PROPERTIES");
 
-  dev_pt_size /= sizeof(*dev_pt); // number of partition types
+  j = dev_pt_size / sizeof (*dev_pt); // number of partition types
 
   // check that partition types EQUALLY and BY_COUNTS are supported
   int found = 0;
-  for (i = 0; i < dev_pt_size; ++i) {
-    if (dev_pt[i] == CL_DEVICE_PARTITION_EQUALLY ||
-        dev_pt[i] == CL_DEVICE_PARTITION_BY_COUNTS)
-      ++found;
-  }
+  for (i = 0; i < j; ++i)
+    {
+      if (dev_pt[i] == CL_DEVICE_PARTITION_EQUALLY
+          || dev_pt[i] == CL_DEVICE_PARTITION_BY_COUNTS)
+        ++found;
+    }
 
   TEST_ASSERT(found == 2);
 
@@ -196,6 +197,12 @@ int main(int argc, char **argv)
   err = clCreateSubDevices(rootdev, equal_splitter, 2, eqdev, NULL);
   CHECK_OPENCL_ERROR_IN("partition equally");
 
+  cl_uint refc;
+  err = clGetDeviceInfo (eqdev[0], CL_DEVICE_REFERENCE_COUNT, sizeof (refc),
+                         &refc, NULL);
+  CHECK_OPENCL_ERROR_IN ("get refcount");
+  TEST_ASSERT (refc == 1);
+
   /* First, check that the root device is untouched */
 
   err = clGetDeviceInfo(rootdev, CL_DEVICE_MAX_COMPUTE_UNITS,
@@ -372,6 +379,14 @@ int main(int argc, char **argv)
   TEST_ASSERT( test_context(ctx, prog_src_two, -1, NUMDEVS - 1, alldevs + 1)
     == CL_SUCCESS );
 
+  for (i = 0; i < NUMDEVS; i++)
+    clReleaseDevice (alldevs[i]);
+
+  CHECK_CL_ERROR (clUnloadCompiler ());
+  free (dev_pt);
+
+  printf ("OK\n");
+
   return 0;
 }
 
diff --git a/tests/runtime/test_clEnqueueNativeKernel.c b/tests/runtime/test_clEnqueueNativeKernel.c
index 25b9340..4540e3f 100644
--- a/tests/runtime/test_clEnqueueNativeKernel.c
+++ b/tests/runtime/test_clEnqueueNativeKernel.c
@@ -127,13 +127,13 @@ int main(int argc, char **argv) {
         printf("Fail to validate vector\n");
         goto error;
       }
-     
-  CHECK_CL_ERROR(clReleaseMemObject(d_a));
-  CHECK_CL_ERROR(clReleaseMemObject(d_b));
-  CHECK_CL_ERROR(clReleaseMemObject(d_c));
-  CHECK_CL_ERROR(clReleaseCommandQueue(queue));
-  CHECK_CL_ERROR(clReleaseContext(ctx));
- 
+
+  CHECK_CL_ERROR (clReleaseMemObject (d_a));
+  CHECK_CL_ERROR (clReleaseMemObject (d_b));
+  CHECK_CL_ERROR (clReleaseMemObject (d_c));
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseContext (ctx));
+
   free(h_a);
   free(h_b);
   free(h_c);
diff --git a/tests/runtime/test_clFinish.c b/tests/runtime/test_clFinish.c
index 65fe133..ab0fe13 100644
--- a/tests/runtime/test_clFinish.c
+++ b/tests/runtime/test_clFinish.c
@@ -29,7 +29,7 @@
 char kernelASourceCode[] = 
 "kernel \n"
 "void test_kernel(constant char* input) {\n"
-"    printf(\"%s\", input);\n"
+"    printf(\"%c\", *input);\n"
 "}\n";
 
 int main()
@@ -177,6 +177,30 @@ int main()
   CHECK_OPENCL_ERROR_IN("clEnqueueNDRangeKernel");
 
   clFinish(queueC);
+  /* TODO some checks */
+
+  CHECK_CL_ERROR (clReleaseEvent (eventA1));
+  CHECK_CL_ERROR (clReleaseEvent (eventB2));
+  CHECK_CL_ERROR (clReleaseEvent (eventA3));
+  CHECK_CL_ERROR (clReleaseEvent (eventB4));
+
+  CHECK_CL_ERROR (clReleaseKernel (kernelA));
+  CHECK_CL_ERROR (clReleaseKernel (kernelB));
+  CHECK_CL_ERROR (clReleaseKernel (kernelC));
+
+  CHECK_CL_ERROR (clReleaseProgram (program));
+
+  CHECK_CL_ERROR (clReleaseCommandQueue (queueA));
+  CHECK_CL_ERROR (clReleaseCommandQueue (queueB));
+  CHECK_CL_ERROR (clReleaseCommandQueue (queueC));
+
+  CHECK_CL_ERROR (clReleaseMemObject (inputBufferA));
+  CHECK_CL_ERROR (clReleaseMemObject (inputBufferB));
+  CHECK_CL_ERROR (clReleaseMemObject (inputBufferC));
+
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
   printf("\n");
   return EXIT_SUCCESS;
 
diff --git a/tests/runtime/test_clGetDeviceInfo.c b/tests/runtime/test_clGetDeviceInfo.c
index 4abb80d..f173fcb 100644
--- a/tests/runtime/test_clGetDeviceInfo.c
+++ b/tests/runtime/test_clGetDeviceInfo.c
@@ -49,5 +49,7 @@ main(void)
       TEST_ASSERT(max_mem_alloc_size >= min_max_mem_alloc_size);
     }
   }
+
+  printf ("OK\n");
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_clGetEventInfo.c b/tests/runtime/test_clGetEventInfo.c
index 02b6c36..978615b 100644
--- a/tests/runtime/test_clGetEventInfo.c
+++ b/tests/runtime/test_clGetEventInfo.c
@@ -56,7 +56,11 @@ main(void)
           CHECK_CL_ERROR(clReleaseMemObject(buf));
 
           CHECK_CL_ERROR(clReleaseCommandQueue(queue));
+
+          CHECK_CL_ERROR (clReleaseContext (context));
         }
     }
+
+  printf ("OK\n");
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_clGetKernelArgInfo.c b/tests/runtime/test_clGetKernelArgInfo.c
index 216b40b..2e9938c 100644
--- a/tests/runtime/test_clGetKernelArgInfo.c
+++ b/tests/runtime/test_clGetKernelArgInfo.c
@@ -27,6 +27,7 @@
 #include <stdlib.h>
 #include "poclu.h"
 #include "config.h"
+#include "pocl.h"
 
 char kernelSourceCode[] =
 "constant sampler_t samp =  CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
@@ -171,11 +172,42 @@ int test_program(cl_program program, int is_spir) {
   TEST_ASSERT((kernel_arg.type==CL_KERNEL_ARG_TYPE_NONE) &&
 	      "type qualifier of arg of test_kernel is not NONE");
 
-  err = clGetKernelArgInfo(test_kernel, 3, CL_KERNEL_ARG_TYPE_QUALIFIER,
-                            BUF_LEN, &kernel_arg.type, &retsize);
-  CHECK_OPENCL_ERROR_IN("clGetKernelArgInfo");
-  TEST_ASSERT((kernel_arg.type==CL_KERNEL_ARG_TYPE_CONST) &&
-	      "type qualifier of arg of test_kernel is not CONST");
+#ifndef LLVM_OLDER_THAN_5_0
+  if (!is_spir) {
+
+    /* Clang versions before 5 added the const MD also for non-pointer
+       types even though OpenCL specs mandates to put them only in
+       pointer args. This was fixed in Clang r299192 (see below).
+       TODO: update the SPIRs with 5.0+ Clang to drop the metadata. */
+
+    /*
+      r299192 | echuraev | 2017-03-31 13:14:52 +0300 (Fri, 31 Mar 2017) | 26 lines
+
+      [OpenCL] Do not generate "kernel_arg_type_qual" metadata for non-pointer args
+
+      Summary:
+      "kernel_arg_type_qual" metadata should contain const/volatile/restrict
+      tags only for pointer types to match the corresponding requirement of
+      the OpenCL specification.
+
+      OpenCL 2.0 spec 5.9.3 Kernel Object Queries:
+
+      CL_KERNEL_ARG_TYPE_VOLATILE is returned if the argument is a pointer
+      and the referenced type is declared with the volatile qualifier.
+      [...]
+      Similarly, CL_KERNEL_ARG_TYPE_CONST is returned if the argument is a
+      pointer and the referenced type is declared with the restrict or const
+      qualifier.
+      [...]
+      CL_KERNEL_ARG_TYPE_RESTRICT will be returned if the pointer type is
+      marked restrict.
+    */
+    err = clGetKernelArgInfo(test_kernel, 3, CL_KERNEL_ARG_TYPE_QUALIFIER,
+                             BUF_LEN, &kernel_arg.type, &retsize);
+    CHECK_OPENCL_ERROR_IN("clGetKernelArgInfo");
+    TEST_ASSERT((kernel_arg.type == CL_KERNEL_ARG_TYPE_NONE));
+  }
+#endif
 
   /* NAME tests */
   // constant char* msg, global volatile float* in, global float* out, const float j, local int* c
@@ -287,6 +319,9 @@ int test_program_nometa(cl_program program) {
                             BUF_LEN, &kernel_arg.string, &retsize);
   TEST_ASSERT(err == CL_KERNEL_ARG_INFO_NOT_AVAILABLE);
 
+  err = clReleaseKernel (test_kernel);
+  CHECK_OPENCL_ERROR_IN ("clReleaseKernel");
+
   return EXIT_SUCCESS;
 }
 
@@ -309,6 +344,8 @@ int spir_program(char * filename, cl_context ctx, cl_device_id did, cl_program*
 
   CHECK_CL_ERROR(clBuildProgram (*program, 1, &did, NULL, NULL, NULL));
 
+  free (program_buffer);
+
   return EXIT_SUCCESS;
 }
 
@@ -349,6 +386,15 @@ int main()
 
   CHECK_CL_ERROR(clReleaseProgram(program));
 
+  char extensions[1024];
+  err = clGetDeviceInfo(did, CL_DEVICE_EXTENSIONS, 1024, extensions, NULL);
+  CHECK_OPENCL_ERROR_IN("clGetDeviceInfo");
+  if (strstr(extensions, "cl_khr_spir") == NULL)
+    {
+      printf ("SPIR not supported, skipping SPIR arg info tests\n");
+      goto FINISH;
+    }
+
   /* SPIR program */
 
   printf("\nSPIR with metadata\n");
@@ -365,6 +411,11 @@ int main()
 
   CHECK_CL_ERROR(clReleaseProgram(program));
 
+FINISH:
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseContext (ctx));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
   printf("\nOK\n");
   return EXIT_SUCCESS;
 
diff --git a/tests/runtime/test_clGetSupportedImageFormats.c b/tests/runtime/test_clGetSupportedImageFormats.c
index cf320aa..06ae35b 100644
--- a/tests/runtime/test_clGetSupportedImageFormats.c
+++ b/tests/runtime/test_clGetSupportedImageFormats.c
@@ -41,5 +41,9 @@ main(void)
 
   TEST_ASSERT(num_entries != 0);
 
+  CHECK_CL_ERROR (clReleaseContext (context));
+
+  free (img_formats);
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_clSetEventCallback.c b/tests/runtime/test_clSetEventCallback.c
index 1fa4015..007269e 100644
--- a/tests/runtime/test_clSetEventCallback.c
+++ b/tests/runtime/test_clSetEventCallback.c
@@ -136,6 +136,18 @@ int main()
           return EXIT_FAILURE;
         }
     }
+
+  CHECK_CL_ERROR (clReleaseEvent (an_event));
+
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseMemObject (inputBuffer));
+
+  CHECK_CL_ERROR (clReleaseKernel (kernel));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
   return EXIT_SUCCESS;
 
 }
diff --git a/tests/runtime/test_clSetMemObjectDestructorCallback.c b/tests/runtime/test_clSetMemObjectDestructorCallback.c
index 88bd663..17326e9 100644
--- a/tests/runtime/test_clSetMemObjectDestructorCallback.c
+++ b/tests/runtime/test_clSetMemObjectDestructorCallback.c
@@ -42,8 +42,9 @@ main(void)
   err = clSetMemObjectDestructorCallback (mem, callback, (void*)FAKE_PTR);
   CHECK_OPENCL_ERROR_IN("clSetMemObjectDestructorCallback");
 
-  err = clReleaseMemObject (mem);
-  CHECK_OPENCL_ERROR_IN("clReleaseMemObject");
+  CHECK_CL_ERROR (clReleaseMemObject (mem));
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
 
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_enqueue_kernel_from_binary.c b/tests/runtime/test_enqueue_kernel_from_binary.c
index d23f1d6..54ab723 100644
--- a/tests/runtime/test_enqueue_kernel_from_binary.c
+++ b/tests/runtime/test_enqueue_kernel_from_binary.c
@@ -387,32 +387,40 @@ int main(void)
                  bb2[i]);
           return EXIT_FAILURE;
         }
-      if (static_wg_buf[i] != 2 + i + 1)
+      if (static_wg_buf[i] != 2 + (int)i + 1)
         {
           printf("static wg kernel failed at index %d (%d != %d)\n", i,
-                 static_wg_buf[i], 2 + i + 1);
+                 static_wg_buf[i], 2 + (int)i + 1);
           return EXIT_FAILURE;
         }
     }
 
-  CHECK_CL_ERROR(clReleaseMemObject(d_a));
-  CHECK_CL_ERROR(clReleaseMemObject(d_b));
-  CHECK_CL_ERROR(clReleaseMemObject(d_c1));
-  CHECK_CL_ERROR(clReleaseMemObject(d_c2));
-  CHECK_CL_ERROR(clReleaseMemObject(d_c3));
-  CHECK_CL_ERROR(clReleaseMemObject(barrier_buffer1));
-  CHECK_CL_ERROR(clReleaseMemObject(barrier_buffer2));
-  CHECK_CL_ERROR(clReleaseProgram(program1));
-  CHECK_CL_ERROR(clReleaseProgram(program2));
-  CHECK_CL_ERROR(clReleaseProgram(b_program1));
-  CHECK_CL_ERROR(clReleaseProgram(b_program2));
-  CHECK_CL_ERROR(clReleaseKernel(kernel1));
-  CHECK_CL_ERROR(clReleaseKernel(kernel2));
-  CHECK_CL_ERROR(clReleaseKernel(kernel3));
-  CHECK_CL_ERROR(clReleaseKernel(barrier_kernel1));
-  CHECK_CL_ERROR(clReleaseKernel(barrier_kernel2));
-  CHECK_CL_ERROR(clReleaseCommandQueue(queue));
-  CHECK_CL_ERROR(clReleaseContext(context));
+  CHECK_CL_ERROR (clReleaseMemObject (d_a));
+  CHECK_CL_ERROR (clReleaseMemObject (d_b));
+  CHECK_CL_ERROR (clReleaseMemObject (d_c1));
+  CHECK_CL_ERROR (clReleaseMemObject (d_c2));
+  CHECK_CL_ERROR (clReleaseMemObject (d_c3));
+  CHECK_CL_ERROR (clReleaseMemObject (barrier_buffer1));
+  CHECK_CL_ERROR (clReleaseMemObject (barrier_buffer2));
+  CHECK_CL_ERROR (clReleaseMemObject (static_wg_buffer));
+
+  CHECK_CL_ERROR (clReleaseKernel (kernel1));
+  CHECK_CL_ERROR (clReleaseKernel (kernel2));
+  CHECK_CL_ERROR (clReleaseKernel (kernel3));
+  CHECK_CL_ERROR (clReleaseKernel (barrier_kernel1));
+  CHECK_CL_ERROR (clReleaseKernel (barrier_kernel2));
+  CHECK_CL_ERROR (clReleaseKernel (static_wg_kernel));
+
+  CHECK_CL_ERROR (clReleaseProgram (program1));
+  CHECK_CL_ERROR (clReleaseProgram (program2));
+  CHECK_CL_ERROR (clReleaseProgram (b_program1));
+  CHECK_CL_ERROR (clReleaseProgram (b_program2));
+  CHECK_CL_ERROR (clReleaseProgram (static_wg_size_bin_program));
+  CHECK_CL_ERROR (clReleaseProgram (static_wg_size_program));
+
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseContext (context));
+  CHECK_CL_ERROR (clUnloadCompiler ());
 
   free(bb1);
   free(bb2);
@@ -423,6 +431,8 @@ int main(void)
   free(h_c3);
   free(binary);
   free(barrier_binary);
+  free (static_wg_binary);
+  free (static_wg_buf);
 
   return 0;
 }
diff --git a/tests/runtime/test_event_cycle.c b/tests/runtime/test_event_cycle.c
index 765b124..9cd25d9 100644
--- a/tests/runtime/test_event_cycle.c
+++ b/tests/runtime/test_event_cycle.c
@@ -125,12 +125,16 @@ main(void)
 
       free(host_buf2);
       free(host_buf1);
-      clReleaseEvent(buf2_event);
-      clReleaseEvent(bufcp_event);
-      clReleaseEvent(buf1_event);
-      clReleaseMemObject(buf2);
-      clReleaseMemObject(buf1);
-      clReleaseCommandQueue(queue);
+
+      CHECK_CL_ERROR (clReleaseEvent (buf2_event));
+      CHECK_CL_ERROR (clReleaseEvent (bufcp_event));
+      CHECK_CL_ERROR (clReleaseEvent (buf1_event));
+
+      CHECK_CL_ERROR (clReleaseMemObject (buf1));
+      CHECK_CL_ERROR (clReleaseMemObject (buf2));
+
+      CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+      CHECK_CL_ERROR (clReleaseContext (context));
     }
   }
   return EXIT_SUCCESS;
diff --git a/tests/runtime/test_event_free.c b/tests/runtime/test_event_free.c
index ac43ad5..a345c7b 100644
--- a/tests/runtime/test_event_free.c
+++ b/tests/runtime/test_event_free.c
@@ -44,7 +44,7 @@ int main(int argc, char **argv)
   CHECK_OPENCL_ERROR_IN("clCreateBuffer");
 
   cl_image_format img_fmt = {
-    .image_channel_order = CL_R,
+    .image_channel_order = CL_RGBA,
     .image_channel_data_type = CL_UNSIGNED_INT32 };
   cl_image_desc img_dsc = {
     .image_type = CL_MEM_OBJECT_IMAGE2D,
@@ -208,6 +208,13 @@ int main(int argc, char **argv)
     TEST_ASSERT(map_event == initial_value);
   }
 
+  clFinish (queue);
+  clReleaseMemObject (img);
+  clReleaseMemObject (buf);
+
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseContext (ctx));
+
   return EXIT_SUCCESS;
 
 }
diff --git a/tests/runtime/test_kernel_cache_includes.c b/tests/runtime/test_kernel_cache_includes.c
index 3416157..246f440 100644
--- a/tests/runtime/test_kernel_cache_includes.c
+++ b/tests/runtime/test_kernel_cache_includes.c
@@ -75,5 +75,15 @@ int main(int argc, char **argv)
   err = clFinish(queue);
   CHECK_OPENCL_ERROR_IN("clFinish 2");
 
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseKernel (kernel));
+  CHECK_CL_ERROR (clReleaseKernel (kernel2));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseProgram (program2));
+  CHECK_CL_ERROR (clReleaseContext (ctx));
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
+  free ((void *)krn_src);
+
   return 0;
 }
diff --git a/tests/runtime/test_kernel_src_in_pwd.h b/tests/runtime/test_kernel_src_in_pwd.h
index c688a53..9f404d2 100644
--- a/tests/runtime/test_kernel_src_in_pwd.h
+++ b/tests/runtime/test_kernel_src_in_pwd.h
@@ -2,7 +2,14 @@
 #error __FUNC__ macro did not propagate to the Program
 #endif
 
-void __FUNC__();
+#define STR(a) S (a)
+#define S(a) #a
+
+void
+__FUNC__ ()
+{
+  printf ("inside __FUNC__ (%s)\n", STR (__FUNC__));
+}
 
 kernel
 void test_kernel() {
diff --git a/tests/runtime/test_link_error.c b/tests/runtime/test_link_error.c
index abd09dc..dc3923a 100644
--- a/tests/runtime/test_link_error.c
+++ b/tests/runtime/test_link_error.c
@@ -61,5 +61,11 @@ main(void){
   err = clBuildProgram(program, 1, &did, NULL, NULL, NULL);
   TEST_ASSERT(err == CL_BUILD_PROGRAM_FAILURE);
 
+  CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+  CHECK_CL_ERROR (clReleaseProgram (program));
+  CHECK_CL_ERROR (clReleaseContext (context));
+
+  CHECK_CL_ERROR (clUnloadCompiler ());
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_read-copy-write-buffer.c b/tests/runtime/test_read-copy-write-buffer.c
index c0716f8..08d37f6 100644
--- a/tests/runtime/test_read-copy-write-buffer.c
+++ b/tests/runtime/test_read-copy-write-buffer.c
@@ -113,15 +113,21 @@ main(void)
             2, evts, evts + 2));
         CHECK_CL_ERROR(clFinish(queue));
 
+        CHECK_CL_ERROR (clReleaseEvent (evts[2]));
+        CHECK_CL_ERROR (clReleaseEvent (evts[1]));
+        CHECK_CL_ERROR (clReleaseEvent (evts[0]));
+
         TEST_ASSERT(memcmp(host_buf2, host_buf1, buf_size) == 0);
       }
 
       free(host_buf2);
       free(host_buf1);
-      CHECK_CL_ERROR(clReleaseMemObject(buf2));
-      CHECK_CL_ERROR(clReleaseMemObject(buf1));
-      CHECK_CL_ERROR(clReleaseCommandQueue(queue));
+      CHECK_CL_ERROR (clReleaseMemObject (buf2));
+      CHECK_CL_ERROR (clReleaseMemObject (buf1));
+      CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+      CHECK_CL_ERROR (clReleaseContext (context));
     }
   }
+
   return EXIT_SUCCESS;
 }
diff --git a/tests/runtime/test_user_event.c b/tests/runtime/test_user_event.c
index 549cc34..1c902b4 100644
--- a/tests/runtime/test_user_event.c
+++ b/tests/runtime/test_user_event.c
@@ -34,7 +34,7 @@ int main()
 {
   cl_int err;
   cl_event user_evt = NULL;
-  int i;
+  unsigned i;
 
   // An user event can be set to either complete or a negative value, indicating error;
   // additionally, no objects involved in a command that waits on the user event should
@@ -52,8 +52,8 @@ int main()
 	  cl_command_queue queue;
 	  cl_device_id device;
 
-	  CHECK_CL_ERROR(poclu_get_any_device(&context, &device, &queue));
-	  TEST_ASSERT( context );
+          CHECK_CL_ERROR (poclu_get_any_device (&context, &device, &queue));
+          TEST_ASSERT( context );
 	  TEST_ASSERT( device );
 	  TEST_ASSERT( queue );
 
@@ -76,9 +76,9 @@ int main()
 		  sizeof(endtime), &endtime, NULL);
 	  TEST_ASSERT(err == CL_PROFILING_INFO_NOT_AVAILABLE);
 
-	  CHECK_CL_ERROR(clReleaseEvent(user_evt));
-	  CHECK_CL_ERROR(clReleaseCommandQueue(queue));
-	  CHECK_CL_ERROR(clReleaseContext(context));
+          CHECK_CL_ERROR (clReleaseEvent (user_evt));
+          CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+          CHECK_CL_ERROR (clReleaseContext (context));
   }
 
   return EXIT_SUCCESS;
diff --git a/tests/runtime/test_version.c b/tests/runtime/test_version.c
index 8a23746..c41ef51 100644
--- a/tests/runtime/test_version.c
+++ b/tests/runtime/test_version.c
@@ -16,7 +16,6 @@ int main(void)
 	cl_device_id did;
 	cl_platform_id pid; 
 	cl_command_queue queue;
-	cl_int err;
 	size_t rvs;
 	char result[1024];
 	char *needle;
@@ -54,8 +53,11 @@ int main(void)
 		*needle=0;		
 	}
 	printf("%s\n", result);
-	
 
-	return 0;
+        CHECK_CL_ERROR (clReleaseCommandQueue (queue));
+        CHECK_CL_ERROR (clReleaseContext (context));
+        CHECK_CL_ERROR (clUnloadCompiler ());
+
+        return 0;
 }
 
diff --git a/tests/tce/fp16/host.cpp b/tests/tce/fp16/host.cpp
index 76befd9..e0be579 100644
--- a/tests/tce/fp16/host.cpp
+++ b/tests/tce/fp16/host.cpp
@@ -38,7 +38,6 @@ static char
 kernelSourceCode[] = 
 "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
 //"#pragma OPENCL EXTENSION cl_khr_fp16 : disable\n"
-"int putchar(int c);\n"
 "kernel \n"
 "void test_kernel(float a) {\n"
 "   half h1 = (half)(a);\n"
diff --git a/tests/tce/multi_AS_copy/host.cpp b/tests/tce/multi_AS_copy/host.cpp
index c587293..4ed4d16 100644
--- a/tests/tce/multi_AS_copy/host.cpp
+++ b/tests/tce/multi_AS_copy/host.cpp
@@ -45,13 +45,12 @@
 
 static char
 kernelSourceCode[] =
-"int putchar(int c);\n"
 "kernel \n"
 "void test_kernel(__global char *buffer, __global char *reference) {\n"
 "    int i;\n"
 "    for (i = 0; i < 64; ++i) {\n"
 "        if(buffer[i] != reference[i])\n"
-"            putchar(buffer[i] + 48);\n"
+"            printf(\"%c\", (buffer[i] + 48));\n"
 "    }\n"
 "}\n";
 
diff --git a/tests/tce/tcemc/host.cpp b/tests/tce/tcemc/host.cpp
index e5c5122..96cdd70 100644
--- a/tests/tce/tcemc/host.cpp
+++ b/tests/tce/tcemc/host.cpp
@@ -36,7 +36,6 @@
 
 static char
 kernelSourceCode[] = 
-"int putchar(int c);\n"
 "kernel \n"
 "void test_kernel(constant char *input,\n"
 "                 __global char *output,\n"
@@ -44,7 +43,7 @@ kernelSourceCode[] =
 "                 int b) {\n"
 "    constant char* pos = input; \n"
 "    while (*pos) {\n"
-"        putchar (*pos);\n"
+"        printf (\"%c\", *pos);\n"
 "        ++pos;\n"
 "    }\n"
 "#ifdef cl_TCE_ABSF\n"
diff --git a/tests/tce/ttasim/host.cpp b/tests/tce/ttasim/host.cpp
index ad8a0f1..bd96f58 100644
--- a/tests/tce/ttasim/host.cpp
+++ b/tests/tce/ttasim/host.cpp
@@ -36,7 +36,6 @@
 
 static char
 kernelSourceCode[] =
-"int putchar(int c);\n"
 "kernel \n"
 "void test_kernel(constant char *input,\n"
 "                 __global char *output,\n"
@@ -50,7 +49,7 @@ kernelSourceCode[] =
 "    auto_local_array[3] = 'G'; \n"
 "    auto_local_array[4] = '\\0'; \n"
 "    while (*pos) {\n"
-"        putchar (*pos);\n"
+"        printf (\"%c\", *pos);\n"
 "        ++pos;\n"
 "    }\n"
 "#ifdef cl_TCE_ABSF\n"
diff --git a/tests/testsuite-amd.at b/tests/testsuite-amd.at
deleted file mode 100644
index c5b0a3b..0000000
--- a/tests/testsuite-amd.at
+++ /dev/null
@@ -1,311 +0,0 @@
-m4_define([POAT_AMDSDK_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([amdsdk $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amd "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-AT_BANNER([AMD APP SDK tests])
-
-POAT_AMDSDK_SETUP([aesencryptdecrypt-repl], [long])
-#This fails, and on LLVM 3.3 it takes more than an hour (on modest PPC hardware) to detect it.
-AT_SKIP_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_AESEncryptDecrypt -sC $abs_top_builddir/examples/AMD | grep "Encryption Passed"], 0, 
-[Encryption Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([aesencryptdecrypt-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_AESEncryptDecrypt -sC $abs_top_builddir/examples/AMD | grep "Encryption Passed"], 0, 
-[Encryption Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([atomiccounters])
-# Expected Error: Device does not support cl_ext_atomic_counters_32 extension!
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_AtomicCounters -sC $abs_top_builddir/examples/AMD | grep "Encryption Passed"], 0, 
-[Encryption Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([bitonicsort])
-AT_CHECK_UNQUOTED([make test_BitonicSort -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binarysearch])
-AT_CHECK_UNQUOTED([make test_BinarySearch -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binomialoption-repl], [long])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "define LLVM_3_2" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binomialoption-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([blackscholes])
-AT_CHECK_UNQUOTED([make test_BlackScholes -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([blackscholesdp])
-AT_KEYWORDS([cl_amd_fp64])
-#this causes assert on LLVM 3.1
-AT_SKIP_IF([ grep "#define LLVM_3_1" $abs_top_builddir/config.h ])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv &&
-             (grep -q "define LLVM_3_2" $abs_top_builddir/config.h ||
-              grep -q "define LLVM_3_3" $abs_top_builddir/config.h )])
-AT_CHECK_UNQUOTED([make test_BlackScholesDP -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([boxfilter])
-AT_CHECK_UNQUOTED([make test_BoxFilter -sC $abs_top_builddir/examples/AMD | egrep "Passed|failed"], 0, 
-[Passed!
-Verifying results...Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([dct])
-#uninvestigated miscompilation. regression from 0.8
-AT_SKIP_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_CHECK_UNQUOTED([make test_DCT -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([devicefission])
-AT_CHECK_UNQUOTED([make test_DeviceFission -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([dwthaar1d])
-# 3.5 introduced the noduplicate attribute which, when
-# used with barrier(), fixes this.
-AT_XFAIL_IF([egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([make test_DwtHaar1D -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([fastwalshtransform])
-AT_CHECK_UNQUOTED([make test_FastWalshTransform -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([floydwarshall])
-AT_CHECK_UNQUOTED([make test_FloydWarshall -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([fluidsimulation2d])
-# error: can't convert between vector values of different size ('uint' (aka 'unsigned int') and 'int8')
-# It should be a legal implicit conversion according to 6.3 Operators. Some other error makes it
-# break with Intel OCL also.
-AT_XFAIL_IF(true)
-AT_KEYWORDS([cl_amd_fp64])
-AT_CHECK_UNQUOTED([make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([helloworld])
-AT_CHECK_UNQUOTED([make test_HelloWorld -sC $abs_top_builddir/examples/AMD | egrep "GdkknVnqkc|HelloWorld"], 0, 
-[GdkknVnqkc
-HelloWorld
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([histogram-repl], [long])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([histogram-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([imageoverlap])
-# doen't work because of image indexing, sdk 2.9 version works
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_ImageOverlap -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Verifying result - Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([ludecomposition])
-AT_KEYWORDS([cl_amd_fp64])
-#test uses doubles
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv &&
-             (grep -q "define LLVM_3_2" $abs_top_builddir/config.h ||
-              grep -q "define LLVM_3_3" $abs_top_builddir/config.h )])
-AT_CHECK_UNQUOTED([make test_LUDecomposition -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([mandelbrot])
-AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-# undefined symbol: _Z7std_fmaDv4_fS_S_ with VML
-AT_CHECK_UNQUOTED([make test_Mandelbrot -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixmul])
-#uninvestigated miscompilation. regression from 0.8
-AT_SKIP_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_CHECK_UNQUOTED([make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixmulimage])
-AT_CHECK_UNQUOTED([make test_MatrixMulImage -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixtranspose])
-AT_CHECK_UNQUOTED([make test_MatrixTranspose -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memorymodel-repl], [long])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memorymodel-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasian])
-AT_CHECK_UNQUOTED([make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasiandp])
-AT_KEYWORDS([cl_amd_fp64])
-# error: can't convert between vector values of different size ('double4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([nbody])
-AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_CHECK_UNQUOTED([make test_NBody -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([prefixsum])
-AT_CHECK_UNQUOTED([make test_PrefixSum -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([quasirandomsequence])
-AT_CHECK_UNQUOTED([make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([radixsort])
-AT_CHECK_UNQUOTED([make test_RadixSort -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-], ignore)     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([recursivegaussian])
-AT_CHECK_UNQUOTED([make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([reduction])
-AT_CHECK_UNQUOTED([make test_Reduction -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([scanlargearrays])
-# Fails with vectorization. With wiloops and no unrolling, the vectorization won't apply.
-AT_CHECK_UNQUOTED([make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simpleconvolution])
-AT_CHECK_UNQUOTED([make test_SimpleConvolution -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simpleimage])
-AT_CHECK_UNQUOTED([make test_SimpleImage -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Verifying 2D copy result - Passed!
-Verifying 3D copy result - Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([sobelfilter])
-AT_CHECK_UNQUOTED([make test_SobelFilter -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([template])
-AT_CHECK_UNQUOTED([make test_Template -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([templatec])
-AT_CHECK_UNQUOTED([make test_TemplateC -sC $abs_top_builddir/examples/AMD | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([transferoverlap])
-AT_CHECK_UNQUOTED([make test_TransferOverlap -sC $abs_top_builddir/examples/AMD | grep "Passed"], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([urng])
-AT_CHECK_UNQUOTED([make test_URNG -sC $abs_top_builddir/examples/AMD | grep Passed | cut -c -7], 0, 
-[Passed! 
-])     
-AT_CLEANUP
diff --git a/tests/testsuite-amdsdk2_9.at b/tests/testsuite-amdsdk2_9.at
deleted file mode 100644
index 26b5965..0000000
--- a/tests/testsuite-amdsdk2_9.at
+++ /dev/null
@@ -1,586 +0,0 @@
-m4_define([POAT_AMDSDK_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([amdsdk2.9 amdsdk long $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk2_9 "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-m4_define([POAT_AMDSDK_HSA_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([hsa amdsdk2.9 amdsdk long $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk2_9 "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-
-AT_BANNER([AMD APP SDK 2.9 tests])
-
-POAT_AMDSDK_SETUP([asyncdatatransfer])
-AT_CHECK_UNQUOTED([make test_AsyncDataTransfer -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed" | sed -e 's/^[ \t]*//'], 0, 
-[SyncKernel verification  : Passed!
-AsyncKernel verification : Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([atomiccounters])
-# Expected Error: Device does not support cl_ext_atomic_counters_32 extension!
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_AtomicCounters -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Encryption Passed"], 0, 
-[Encryption Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([basicdebug])
-# This tests debugging features by executing a kernel that writes
-# out of bounds of a local array. No point testing it here as the
-# result should be undefined (basic device crashes, pthread device
-# silently passes). It passes if the kernel's local array size is
-# increased so there is no out of bounds error.
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_BasicDebug -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([binarysearch])
-AT_CHECK_UNQUOTED([make test_BinarySearch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binomialoption-repl])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "define LLVM_3_2" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([binomialoption-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binomialoptionmultigpu])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/BinomialOptionMultiGPU/bin/x86_64/Release/BinomialOptionMultiGPU])
-AT_CHECK_UNQUOTED([make test_BinomialOptionMultiGPU -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([bitonicsort])
-AT_CHECK_UNQUOTED([make test_BitonicSort -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([blackscholes])
-AT_CHECK_UNQUOTED([make test_BlackScholes -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([blackscholesdp])
-AT_KEYWORDS([cl_amd_fp64])
-#this causes assert on LLVM 3.1
-AT_SKIP_IF([ grep "#define LLVM_3_1" $abs_top_builddir/config.h ])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-AT_CHECK_UNQUOTED([make test_BlackScholesDP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([boxfilter])
-AT_CHECK_UNQUOTED([make test_BoxFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
-[Passed!
-Verifying results...Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([boxfilterGL])
-# doesnt work
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BoxFilterGL -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
-[Passed!
-Verifying results...Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([bufferbandwidth])
-# Device does not support cl_khr_local_int32_base_atomics extension!
-# AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BufferBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
-[ Verification Passed!
- Verification Passed!
-Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([bufferImageInterop])
-# Error: Selected device doesn't support Buffer-Image
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BufferImageInterop -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
-[Passed!
-Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([concurrentkernel])
-AT_CHECK_UNQUOTED([make test_ConcurrentKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "Passed|failed"], 0, 
-[ Sequential Kernel verification : Passed!
- Concurrent Kernel verification : Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([constantbandwidth])
-AT_CHECK_UNQUOTED([make test_ConstantBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep Passed], 0, 
-[Passed!
-Passed!
-Passed!
-Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([cpluspluswrapper])
-AT_CHECK_UNQUOTED([make test_CplusplusWrapper -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([dct])
-AT_CHECK_UNQUOTED([make test_DCT -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([devicefission])
-AT_CHECK_UNQUOTED([make test_DeviceFission -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([devicefission11ext])
-# Expected Error: Device does not support cl_ext_device_fission extension!
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DeviceFission11Ext -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([dwthaar1d])
-# 3.5 introduced the noduplicate attribute which, when
-# used with barrier(), fixes this.
-AT_XFAIL_IF([egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([make test_DwtHaar1D -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([dwthaar1dcppkernel])
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DwtHaar1DCPPKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([eigenvalue])
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_EigenValue -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([fastwalshtransform])
-AT_CHECK_UNQUOTED([make test_FastWalshTransform -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([floydwarshall])
-AT_CHECK_UNQUOTED([make test_FloydWarshall -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([fft])
-# Build parameter clc++ not supported
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_FFT -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([fluidsimulation2d])
-# error: can't convert between vector values of different size ('uint' (aka 'unsigned int') and 'int8')
-# It should be a legal implicit conversion according to 6.3 Operators. Some other error makes it
-# break with Intel OCL also.
-AT_XFAIL_IF(true)
-AT_KEYWORDS([cl_amd_fp64])
-AT_CHECK_UNQUOTED([make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([gaussiannoise])
-AT_CHECK_UNQUOTED([make test_GaussianNoise -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | sed 's/ //g'], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([gaussiannoisegl])
-#doesnt work
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_GaussianNoiseGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed! 
-])
-AT_CLEANUP
-
-#Benchmark test
-#POAT_AMDSDK_SETUP([globalmemorybandwidth])
-#AT_CHECK_UNQUOTED([make test_GlobalMemoryBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-#[Passed!
-#])
-#AT_CLEANUP
-
-POAT_AMDSDK_SETUP([hdrtonemapping])
-AT_CHECK_UNQUOTED([make test_HDRToneMapping -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([helloworld])
-AT_CHECK_UNQUOTED([make test_HelloWorld -sC $abs_top_builddir/examples/AMDSDK2.9 | egrep "GdkknVnqkc|HelloWorld"], 0, 
-[GdkknVnqkc
-HelloWorld
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([histogram-repl])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([histogram-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([histogramatomic])
-AT_CHECK_UNQUOTED([make test_HistogramAtomics -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([imagebandwidth])
-# AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_ImageBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-
-POAT_AMDSDK_SETUP([imageoverlap])
-AT_CHECK_UNQUOTED([make test_ImageOverlap -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Verifying result - Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([introstatickcppkernel])
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_IntroStaticCPPKernel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Verifying result - Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([kernellauch])
-# Device does not support cl_khr_local_int32_base_atomics extension! 
-# works anyway
-AT_CHECK_UNQUOTED([make test_KernelLaunch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed!], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([kmeansautoclustering])
-# doesn't find opencl library for some reason
-AT_XFAIL_IF(true) 
-AT_CHECK_UNQUOTED([make test_KmeansAutoclustering -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-Benchmark test
-POAT_AMDSDK_SETUP([ldsbandwidth])
-AT_CHECK_UNQUOTED([make test_LDSBandwidth -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-Passed!
-Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([ludecomposition])
-AT_KEYWORDS([cl_amd_fp64])
-#test uses doubles
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-AT_CHECK_UNQUOTED([make test_LUDecomposition -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([mandelbrot])
-AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/Mandelbrot/bin/x86_64/Release/Mandelbrot])
-# undefined symbol: _Z7std_fmaDv4_fS_S_ with VML
-AT_CHECK_UNQUOTED([make test_Mandelbrot -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixmuldouble])
-AT_CHECK_UNQUOTED([make test_MatrixMulDouble -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixmulimage])
-AT_CHECK_UNQUOTED([make test_MatrixMulImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([matrixmultiplication])
-# pocl error: encountered unimplemented part of the OpenCL specs in clCreateImage2D.c:119
-#AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([matrixtranspose])
-AT_CHECK_UNQUOTED([make test_MatrixTranspose -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memorymodel-repl])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memorymodel-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memoryoptimizations])
-#Device does not support global_int32_base_atomics
-#AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_MemoryOptimizations -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([merzennetwister])
-#Build parameter clc++ is not supported
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_MersenneTwister -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasian])
-# kernel compilation fails due to 
-# error: can't convert between vector values of different size ('float4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-AT_XFAIL_IF([grep -q "define LLVM_3_7" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasiandp])
-AT_KEYWORDS([cl_amd_fp64])
-# error: can't convert between vector values of different size ('double4' and 'int')
-# It should be a legal implicit conversion according to 6.3 Operators. Works also with Intel OCL
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasianmultigpu])
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianMultiGPU -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([nbody])
-AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK2.9/AMD-APP-SDK-v2.9-RC-lnx64/samples/opencl/cl/NBody/bin/x86_64/Release/NBody])
-AT_CHECK_UNQUOTED([make test_NBody -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([prefixsum])
-AT_CHECK_UNQUOTED([make test_PrefixSum -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([quasirandomsequence])
-AT_CHECK_UNQUOTED([make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([radixsort])
-AT_CHECK_UNQUOTED([make test_RadixSort -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-], ignore)     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([recursivegaussian])
-AT_CHECK_UNQUOTED([make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([reduction])
-AT_CHECK_UNQUOTED([make test_Reduction -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([scanlargearrays])
-# Fails with vectorization. With wiloops and no unrolling, the vectorization won't apply.
-AT_CHECK_UNQUOTED([make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([simpleconvolution])
-AT_CHECK_UNQUOTED([make test_SimpleConvolution -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simplegl])
-# doesn't find opecl library
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimpleGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simpleimage])
-AT_CHECK_UNQUOTED([make test_SimpleImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Verifying 2D copy result - Passed!
-Verifying 3D copy result - Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([soaversusaos])
-#Build Options are : -x clc++ -D num1=4096 -D num2=4096 
-#Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_SoAversusAoS -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([sobelfilter])
-AT_CHECK_UNQUOTED([make test_SobelFilter -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([sobelfilterimage])
-# segfault
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SobelFilterImage -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([stringsearch])
-AT_CHECK_UNQUOTED([make test_StringSearch -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([template])
-AT_CHECK_UNQUOTED([make test_Template -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([transferoverlap])
-AT_CHECK_UNQUOTED([make test_TransferOverlap -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed"], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([transferoverlapcpp])
-# Expected Error: Device does not support cl_khr_local_int32_base_atomics extension! and segfault
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_TransferOverlapCPP -sC $abs_top_builddir/examples/AMDSDK2.9 | grep "Passed"], 0, 
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([unsharpmask])
-# doesn't find opencl library
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_UnsharpMask -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed], 0, 
-[Passed!
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([urng])
-AT_CHECK_UNQUOTED([make test_URNG -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7], 0, 
-[Passed! 
-])     
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([urngnoisegl])
-# Error: clGetPlatformIDs failed. Error code : CL_PLATFORM_NOT_FOUND_KHR
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_URNGNoiseGL -sC $abs_top_builddir/examples/AMDSDK2.9 | grep Passed | cut -c -7], 0, 
-[Passed! 
-])     
-AT_CLEANUP
diff --git a/tests/testsuite-amdsdk3_0.at b/tests/testsuite-amdsdk3_0.at
deleted file mode 100644
index d20a4a8..0000000
--- a/tests/testsuite-amdsdk3_0.at
+++ /dev/null
@@ -1,727 +0,0 @@
-m4_define([POAT_AMDSDK_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([amdsdk3.0 amdsdk long $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk3_0 "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-m4_define([POAT_AMDSDK_HSA_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([hsa amdsdk3.0 amdsdk long $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" amdsdk3_0 "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-##########################################################################
-
-AT_BANNER([AMD APP SDK 3.0 tests])
-
-POAT_AMDSDK_SETUP([asyncdatatransfer])
-# needs asynch properties implemented
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_AsyncDataTransfer -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Passed" | sed -e 's/^[ \t]*//'], 0,
-[SyncKernel verification  : Passed!
-AsyncKernel verification : Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([atomiccounters])
-# Expected Error: Device does not support cl_ext_atomic_counters_32 extension!
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_AtomicCounters -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Encryption Passed"], 0,
-[Encryption Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([basicdebug])
-# This tests debugging features by executing a kernel that writes
-# out of bounds of a local array. No point testing it here as the
-# result should be undefined (basic device crashes, pthread device
-# silently passes). It passes if the kernel's local array size is
-# increased so there is no out of bounds error.
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_BasicDebug -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([binarysearchdevicesideenqueue])
-# requires dev side queue
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BinarySearchDeviceSideEnqueue -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binomialoption-repl])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&
-             grep -q "define LLVM_3_2" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([binomialoption-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_BinomialOption -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([binomialoptionmultigpu])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK3.0/AMD-APP-SDK-v3.0-RC-lnx64/samples/opencl/cl/BinomialOptionMultiGPU/bin/x86_64/Release/BinomialOptionMultiGPU])
-AT_CHECK_UNQUOTED([make test_BinomialOptionMultiGPU -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([bitonicsort])
-AT_CHECK_UNQUOTED([make test_BitonicSort -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([blackscholes])
-AT_CHECK_UNQUOTED([make test_BlackScholes -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([blackscholesdp])
-AT_KEYWORDS([cl_amd_fp64])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-AT_CHECK_UNQUOTED([make test_BlackScholesDP -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([boxfilter])
-AT_CHECK_UNQUOTED([make test_BoxFilter -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[Passed!
-Verifying results...Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([boxfilterGL])
-# doesnt work
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BoxFilterGL -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[Passed!
-Verifying results...Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([bufferbandwidth])
-# freezes/takes forever
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_BufferBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[ Verification Passed!
- Verification Passed!
-Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([bufferImageInterop])
-# Error: Selected device doesn't support Buffer-Image
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BufferImageInterop -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[Passed!
-Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([builtinscan])
-# requires work_group_scan_inclusive_add, work_group_barrier & work_group_broadcast
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_BuiltInScan -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[OK
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([calcpie])
-AT_CHECK_UNQUOTED([make test_CalcPie -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([concurrentkernel])
-AT_CHECK_UNQUOTED([make test_ConcurrentKernel -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed|failed"], 0,
-[ Sequential Kernel verification : Passed!
- Concurrent Kernel verification : Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([constantbandwidth])
-AT_CHECK_UNQUOTED([make test_ConstantBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep Passed], 0,
-[Passed!
-Passed!
-Passed!
-Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([cpluspluswrapper])
-# insists on AMD platform
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_CplusplusWrapper -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([dct])
-AT_CHECK_UNQUOTED([make test_DCT -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([deviceenqueuebfs])
-# requires dev queue
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DeviceEnqueueBFS -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([devicefission])
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DeviceFission -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([devicefission11ext])
-# Expected Error: Device does not support cl_ext_device_fission extension!
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DeviceFission11Ext -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([dwthaar1d])
-# 3.5 introduced the noduplicate attribute which, when
-# used with barrier(), fixes this.
-AT_XFAIL_IF([egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([make test_DwtHaar1D -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([dwthaar1dcppkernel])
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_DwtHaar1DCPPKernel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([eigenvalue])
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_EigenValue -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([extractprimes])
-# requires dev queue
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_ExtractPrimes -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([fastwalshtransform])
-AT_CHECK_UNQUOTED([make test_FastWalshTransform -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([fft])
-# Build parameter clc++ not supported
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_FFT -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([finegrainsvm])
-# freezes with every device - requires async running queue
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_FineGrainSVM -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([finegrainsvmcas])
-AT_CHECK_UNQUOTED([make test_FineGrainSVMCAS -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-ignore, ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([floydwarshall])
-AT_CHECK_UNQUOTED([make test_FloydWarshall -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([fluidsimulation2d])
-# error: can't convert between vector values of different size ('uint' (aka 'unsigned int') and 'int8')
-# It should be a legal implicit conversion according to 6.3 Operators. Some other error makes it
-# break with Intel OCL also.
-AT_XFAIL_IF(true)
-AT_KEYWORDS([cl_amd_fp64])
-AT_CHECK_UNQUOTED([make test_FluidSimulation2D -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([gaussiannoise])
-AT_CHECK_UNQUOTED([make test_GaussianNoise -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed | sed 's/ //g'], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([gaussiannoisegl])
-#doesnt work
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_GaussianNoiseGL -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-#Benchmark test
-#POAT_AMDSDK_SETUP([globalmemorybandwidth])
-#AT_CHECK_UNQUOTED([make test_GlobalMemoryBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-#[Passed!
-#])
-#AT_CLEANUP
-
-POAT_AMDSDK_SETUP([hdrtonemapping])
-AT_CHECK_UNQUOTED([make test_HDRToneMapping -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([heatpde])
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_HeatPDE -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "Passed"], 0,
-[Passed])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([helloworld])
-AT_CHECK_UNQUOTED([make test_HelloWorld -sC $abs_top_builddir/examples/AMDSDK3.0 | egrep "GdkknVnqkc|HelloWorld"], 0,
-[GdkknVnqkc
-HelloWorld
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([histogram-repl])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_Histogram -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([histogram-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_Histogram -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([histogramatomics])
-AT_CHECK_UNQUOTED([make test_HistogramAtomics -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([imagebandwidth])
-# GPU not found. Exiting application
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_ImageBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([imagebinarization])
-# requires work_group_barrier
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_ImageBinarization -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([imageoverlap])
-AT_CHECK_UNQUOTED([make test_ImageOverlap -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Verifying result - Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([introstatickcppkernel])
-# Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS (-x clc++)
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_IntroStaticCPPKernel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Verifying result - Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([kernellauch])
-# GPU not found. Exiting application
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_KernelLaunch -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed!], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([kmeansautoclustering])
-# doesn't find opencl library for some reason
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_KmeansAutoclustering -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-Benchmark test
-POAT_AMDSDK_SETUP([ldsbandwidth])
-# requires dev queue
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_LDSBandwidth -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-Passed!
-Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([ludecomposition])
-AT_KEYWORDS([cl_amd_fp64])
-#test uses doubles
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-AT_CHECK_UNQUOTED([make test_LUDecomposition -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([mandelbrot])
-AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK3.0/AMD-APP-SDK-v3.0-RC-lnx64/samples/opencl/cl/Mandelbrot/bin/x86_64/Release/Mandelbrot])
-# undefined symbol: _Z7std_fmaDv4_fS_S_ with VML
-AT_CHECK_UNQUOTED([make test_Mandelbrot -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixmuldouble])
-AT_KEYWORDS([cl_amd_fp64])
-AT_CHECK_UNQUOTED([make test_MatrixMulDouble -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([matrixmulimage])
-AT_CHECK_UNQUOTED([make test_MatrixMulImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([matrixmultiplication])
-# pocl error: encountered unimplemented part of the OpenCL specs in clCreateImage2D.c:119
-#AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MatrixMultiplication -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([matrixtranspose])
-AT_CHECK_UNQUOTED([make test_MatrixTranspose -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memorymodel-repl])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=repl make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memorymodel-loops])
-AT_CHECK_UNQUOTED([POCL_WORK_GROUP_METHOD=loops make test_MemoryModel -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([memoryoptimizations])
-AT_CHECK_UNQUOTED([make test_MemoryOptimizations -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([mersennetwister])
-#Build parameter clc++ is not supported
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_MersenneTwister -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasian])
-AT_CHECK_UNQUOTED([make test_MonteCarloAsian -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-
-POAT_AMDSDK_SETUP([montecarloasiandp])
-# passes arguments via a struct
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianDP -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([montecarloasianmultigpu])
-AT_CHECK_UNQUOTED([make test_MonteCarloAsianMultiGPU -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([nbody])
-AT_SKIP_IF([ grep "undef HAVE_GLEW" $abs_top_builddir/config.h ])
-AT_SKIP_IF([ ! test -e $abs_top_builddir/examples/AMDSDK3.0/AMD-APP-SDK-v3.0-RC-lnx64/samples/opencl/cl/NBody/bin/x86_64/Release/NBody])
-AT_CHECK_UNQUOTED([make test_NBody -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([pipeproducerconsumerkernels])
-# no pipe support yet
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_PipeProducerConsumerKernels -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([prefixsum])
-AT_CHECK_UNQUOTED([make test_PrefixSum -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([quasirandomsequence])
-AT_CHECK_UNQUOTED([make test_QuasiRandomSequence -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([radixsort])
-AT_CHECK_UNQUOTED([make test_RadixSort -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-], ignore)
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([rangeminimumquery])
-# requires work_group_reduce_min
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_RangeMinimumQuery -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([recursivegaussian])
-AT_CHECK_UNQUOTED([make test_RecursiveGaussian -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([recursivegaussianprogramscope])
-# uses in-source global variable
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_RecursiveGaussian_ProgramScope -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([reduction])
-AT_CHECK_UNQUOTED([make test_Reduction -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([regiongrowingsegmentation])
-# requires dev queue
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_RegionGrowingSegmentation -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([scanlargearrays])
-# Fails with vectorization. With wiloops and no unrolling, the vectorization won't apply.
-AT_CHECK_UNQUOTED([make test_ScanLargeArrays -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([simpleconvolution])
-AT_CHECK_UNQUOTED([make test_SimpleConvolution -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Verifying non-Separable Convolution Kernel result - Passed!
-Verifying Separable Convolution Kernel result - Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simpledepthimage])
-#  *** ERROR ***  Requested image format is not supported
-# Error: clCreateImage failed.(oclImage) Error code : CL_IMAGE_FORMAT_NOT_SUPPORTED
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimpleDepthImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Verifying 2D copy result - Passed!
-Verifying 3D copy result - Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simplegenericaddressspace])
-# requires work_group_barrier
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimpleGenericAddressSpace -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simplegl])
-# doesn't find opecl library
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimpleGL -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simpleimage])
-AT_CHECK_UNQUOTED([make test_SimpleImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Verifying 2D copy result - Passed!
-Verifying 3D copy result - Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simplepipe])
-# pipe not implemented
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimplePipe -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[OK
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([simplespir])
-# Device side queue is unimplemented
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SimpleSPIR -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[OK
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([soaversusaos])
-#Build Options are : -x clc++ -D num1=4096 -D num2=4096
-#Error: clBuildProgram failed. Error code : CL_INVALID_BUILD_OPTIONS
-AT_SKIP_IF(true)
-AT_CHECK_UNQUOTED([make test_SoAversusAoS -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([sobelfilter])
-AT_CHECK_UNQUOTED([make test_SobelFilter -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([sobelfilterimage])
-# segfault
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SobelFilterImage -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([stringsearch])
-AT_CHECK_UNQUOTED([make test_StringSearch -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([svmatomicsbinarytreeinsert])
-
-AT_CHECK_UNQUOTED([make test_SVMAtomicsBinaryTreeInsert -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-AT_XFAIL_IF(true)
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([svmbinarytreesearch])
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_SVMBinaryTreeSearch -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-
-POAT_AMDSDK_SETUP([template])
-AT_CHECK_UNQUOTED([make test_Template -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([transferoverlap])
-AT_CHECK_UNQUOTED([make test_TransferOverlap -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Passed"], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([transferoverlapcpp])
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_TransferOverlapCPP -sC $abs_top_builddir/examples/AMDSDK3.0 | grep "Passed"], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([unsharpmask])
-# doesn't find opencl library
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_UnsharpMask -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_HSA_SETUP([urng])
-AT_CHECK_UNQUOTED([make test_URNG -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed | cut -c -7], 0,
-[Passed!
-])
-AT_CLEANUP
-
-POAT_AMDSDK_SETUP([urngnoisegl])
-# Error: clGetPlatformIDs failed. Error code : CL_PLATFORM_NOT_FOUND_KHR
-AT_XFAIL_IF(true)
-AT_CHECK_UNQUOTED([make test_URNGNoiseGL -sC $abs_top_builddir/examples/AMDSDK3.0 | grep Passed | cut -c -7], 0,
-[Passed!
-])
-AT_CLEANUP
diff --git a/tests/testsuite-cloverleaf.at b/tests/testsuite-cloverleaf.at
deleted file mode 100644
index 1dd4098..0000000
--- a/tests/testsuite-cloverleaf.at
+++ /dev/null
@@ -1,17 +0,0 @@
-m4_define([POAT_CLOVERLEAF_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([cloverleaf $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" CloverLeaf "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-AT_BANNER([CloverLeaf])
-
-POAT_CLOVERLEAF_SETUP([cloverleaf])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/CloverLeaf/CloverLeaf_OpenCL ;
-./clover_leaf 2>&1 |grep First | cut -c-6 
-], 0,
-[ First
-])
-AT_CLEANUP
diff --git a/tests/testsuite-halide.at b/tests/testsuite-halide.at
deleted file mode 100644
index 5e02de5..0000000
--- a/tests/testsuite-halide.at
+++ /dev/null
@@ -1,34 +0,0 @@
-m4_define([POAT_HALIDE_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([halide $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" Halide "*],[false],[:])])
-  # AMD APP SDK tests require ICD loader with
-  AT_SKIP_IF([! grep -q "#define BUILD_ICD" $abs_top_builddir/config.h])
-])
-
-AT_BANNER([Halide OpenCL examples])
-
-POAT_HALIDE_SETUP([tutorial12])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Halide/Halide/tutorial ; 
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ../bin/tutorial_lesson_12_using_the_gpu | cut -d ' ' -f 2], 0,
-[performance
-milliseconds
-performance
-milliseconds
-])
-AT_CLEANUP
-
-POAT_HALIDE_SETUP([bilateral_grid])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Halide/Halide/apps/bilateral_grid ; 
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./filter ../images/gray.png out.png 0.1], 0, ignore)
-AT_CLEANUP
-
-POAT_HALIDE_SETUP([interpolate])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Halide/Halide/apps/interpolate ; 
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./interpolate ../images/rgba.png out.png], 0, ignore)
-AT_CLEANUP
-
-POAT_HALIDE_SETUP([local_laplacian])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Halide/Halide/apps/local_laplacian ; 
-LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:../bin HL_TARGET=opencl ./process ../images/rgb.png 8 1 1 out.png], 0, ignore)
-AT_CLEANUP
diff --git a/tests/testsuite-opencv.at b/tests/testsuite-opencv.at
deleted file mode 100644
index fe30f4d..0000000
--- a/tests/testsuite-opencv.at
+++ /dev/null
@@ -1,327 +0,0 @@
-m4_define([POAT_OPENCV_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([opencv $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" OpenCV "*],[false],[:])])
-])
-
-AT_BANNER([OpenCV UMat tests])
-
-POAT_OPENCV_SETUP([UMat])
-AT_CHECK_UNQUOTED([make test_UMat -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Core_UMat])
-AT_CHECK_UNQUOTED([make test_Core_UMat -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Image2D])
-AT_CHECK_UNQUOTED([make test_Image2D -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatBasicTests])
-AT_CHECK_UNQUOTED([make test_UMat/UMatBasicTests -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatTestReshape])
-AT_CHECK_UNQUOTED([make test_UMat/UMatTestReshape -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatTestRoi])
-AT_CHECK_UNQUOTED([make test_UMat/UMatTestRoi -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatTestSizeOperations])
-AT_CHECK_UNQUOTED([make test_UMat/UMatTestSizeOperations -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatTestUMatOperations])
-AT_CHECK_UNQUOTED([make test_UMat/UMatTestUMatOperations -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-AT_BANNER(OpenCV Channels test)
-
-POAT_OPENCV_SETUP([OCL_Channels/Merge])
-AT_CHECK_UNQUOTED([make test_OCL_Channels/Merge -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([OCL_Channels/Split])
-AT_CHECK_UNQUOTED([make test_OCL_Channels/Split -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([OCL_Channels/MixChannels])
-AT_CHECK_UNQUOTED([make test_OCL_Channels/MixChannels -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([OCL_Channels/InsertChannel])
-AT_CHECK_UNQUOTED([make test_OCL_Channels/InsertChannels -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([OCL_Channels/ExtractChannel])
-AT_CHECK_UNQUOTED([make test_OCL_Channels/ExtractChannels -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-AT_BANNER(OpenCV Arithm tests)
-
-POAT_OPENCV_SETUP([Lut])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Lut -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Add])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Add -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Subtract])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Subtract -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Mul])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Mul -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Div])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Div -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Min])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Min -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Max])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Max -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([AddWeighted])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/AddWeighted -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Absdiff])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Absdiff -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([CartToPolar])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/CartToPolar -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([PolarToCart])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/PolarToCart -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Transpose])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Transpose -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Bitwise_and])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Bitwise_and -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Bitwise_or])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Bitwise_or -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Bitwise_xor])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Bitwise_xor -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Bitwise_not])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Bitwise_not -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Compare])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Compare -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Pow])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Pow -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([SetIdentity])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/SetIdentity -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Repeat])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Repeat -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([CountNonZero])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/CountNonZero -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Sum])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Sum -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([MeanStdDev])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/MeanStdDev -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Log])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Log -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Exp])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Exp -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Phase])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Phase -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Magnitude])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Magnitude -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Flip])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Flip -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([MinMaxIdx])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/MinMaxIdx -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([MinMaxIdx_Mask])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/MinMaxIdx_Mask -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Norm])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Norm -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatDot])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/UMatDot -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Sqrt])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Sqrt -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Normalize])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Normalize -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([InRange])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/InRange -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([ConvertScaleAbs])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/ConvertScaleAbs -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([ScaleAdd])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/ScaleAdd -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([PatchNaNs])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/PatchNaNs -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Psnr])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/Psnr -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([ReduceSum])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/ReduceSum -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([ReduceMax])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/ReduceMax -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([ReduceAvg])
-AT_CHECK_UNQUOTED([make test_OCL_Arithm/ReduceAvg -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-AT_Banner(OpenCV Core tests)
-
-POAT_OPENCV_SETUP([Gemm])
-AT_CHECK_UNQUOTED([make test_OCL_Core/Gemm -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([Dft])
-AT_CHECK_UNQUOTED([make test_OCL_Core/Dft -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-AT_BANNER(OpenCV ImgProc test)
-
-POAT_OPENCV_SETUP([MultiSpectrums])
-AT_CHECK_UNQUOTED([make test_OCL_OCL_ImgProc/MultiSpectrums -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-AT_BANNER(OpenCV MatrixOperation tests)
-
-POAT_OPENCV_SETUP([ConvertTo])
-AT_CHECK_UNQUOTED([make test_OCL_MatrixOperation/ConvertTo -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([CopyTo])
-AT_CHECK_UNQUOTED([make test_OCL_MatrixOperation/CopyTo -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([SetTo])
-AT_CHECK_UNQUOTED([make test_OCL_MatrixOperation/SetTo -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
-
-POAT_OPENCV_SETUP([UMatExpr])
-AT_CHECK_UNQUOTED([make test_OCL_MatrixOperation/UMatExpr -sC $abs_top_builddir/examples/OpenCV | grep "FAILED"], 1, 
-)
-AT_CLEANUP
diff --git a/tests/testsuite-parboil.at b/tests/testsuite-parboil.at
deleted file mode 100644
index c0326c0..0000000
--- a/tests/testsuite-parboil.at
+++ /dev/null
@@ -1,113 +0,0 @@
-
-dnl POAT_PB_SETUP(name, extra_keywords) 
-m4_define([POAT_PB_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([parboil $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" parboil "*],[false],[:])])
-])
-dnl POAT_PB_CHECK_BUILD(name, extra build cmd)
-m4_define([POAT_PB_CHECK_BUILD],[
-  AT_CHECK([make build-$1 -sC ${abs_top_builddir}/examples/Parboil 2>&1 | grep "Parboil parallel benchmark suite" | grep . $2 ], 0, 
-[Parboil parallel benchmark suite, version 0.2
-])
-])
-
-AT_BANNER([Parboil tests])
-
-POAT_PB_SETUP([spmv])
-#This fails when pocl has ICD enabled. 
-AT_XFAIL_IF([grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h])
-POAT_PB_CHECK_BUILD([spmv])
-AT_CHECK_UNQUOTED([make run-spmv -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-AT_CLEANUP
-
-POAT_PB_SETUP([stencil])
-POAT_PB_CHECK_BUILD([stencil])
-AT_CHECK_UNQUOTED([make run-stencil -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-AT_CLEANUP
-
-POAT_PB_SETUP([tpacf])
-# This probably is in infinite loop with wiloops.
-AT_SKIP_IF([true])
-POAT_PB_CHECK_BUILD([tpacf])
-AT_CHECK_UNQUOTED([make run-tpacf -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-# Result verification error.
-AT_XFAIL_IF(true)
-AT_CLEANUP
-
-POAT_PB_SETUP([cutcp])
-POAT_PB_CHECK_BUILD([cutcp])
-AT_CHECK_UNQUOTED([make run-cutcp -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-AT_CLEANUP
-
-POAT_PB_SETUP([mri-gridding], [long])
-# Takes forever to compile with the repl method.
-# Also tries to create an illegal work group size and has potentially
-# erroneous kernels?
-AT_SKIP_IF([true])
-POAT_PB_CHECK_BUILD([mri-gridding])
-AT_CHECK_UNQUOTED([make run-mri-gridding -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-#AT_XFAIL_IF(true)
-AT_CLEANUP
-
-POAT_PB_SETUP([sad])
-POAT_PB_CHECK_BUILD([sad])
-AT_CHECK_UNQUOTED([make run-sad -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-# Requires read_imageui
-AT_XFAIL_IF(true)
-AT_CLEANUP
-
-POAT_PB_SETUP([bfs], [bfs-parboil long])
-POAT_PB_CHECK_BUILD([bfs])
-AT_CHECK_UNQUOTED([make run-bfs -sC $abs_top_builddir/examples/Parboil 2>&1 | grep Pass], 0,
-[Pass
-])
-# LLVM 3.4's SROA crashes with this. Reported in http://llvm.org/bugs/show_bug.cgi?id=15907
-# However, this is fixed with a TCE-patched version of 3.4, so let's just
-# skip it for 3.4 for now.
-AT_SKIP_IF([grep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-#AT_XFAIL_IF([grep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-AT_CLEANUP
-
-POAT_PB_SETUP([histo])
-# Illegal kernels with array parameters to functions.
-AT_SKIP_IF([true])
-POAT_PB_CHECK_BUILD([histo])
-AT_CHECK_UNQUOTED([make run-histo -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[])
-AT_XFAIL_IF(true)
-AT_CLEANUP
-
-POAT_PB_SETUP([sgemm])
-POAT_PB_CHECK_BUILD([sgemm])
-AT_CHECK_UNQUOTED([make run-sgemm -sC $abs_top_builddir/examples/Parboil 2>&1| grep Pass], 0, 
-[Pass
-])
-AT_CLEANUP
-
-POAT_PB_SETUP([mri-q])
-POAT_PB_CHECK_BUILD([mri-q])
-AT_CHECK_UNQUOTED([make run-mri-q -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-])
-AT_CLEANUP
-
-POAT_PB_SETUP([lbm], [long])
-POAT_PB_CHECK_BUILD([lbm])
-AT_CHECK_UNQUOTED([make run-lbm -sC $abs_top_builddir/examples/Parboil | grep Pass], 0, 
-[Pass
-], ignore)
-#AT_XFAIL_IF(true)
-AT_CLEANUP
diff --git a/tests/testsuite-piglit.at b/tests/testsuite-piglit.at
deleted file mode 100644
index 39c8058..0000000
--- a/tests/testsuite-piglit.at
+++ /dev/null
@@ -1,9 +0,0 @@
-AT_BANNER([Piglit OpenCL tests])
-
-AT_SETUP([Piglit testsuite with LLVM 3.5])
-AT_KEYWORDS([piglit long])
-AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" piglit "*],[false],[:])])
-AT_SKIP_IF([! grep -q "#define LLVM_3_5" $abs_top_builddir/config.h])
-AT_CHECK([cd $abs_top_builddir/examples/piglit/; ./produce_results.sh ], ignore, ignore, ignore)
-AT_CHECK([cd $abs_top_builddir/examples/piglit/; LC_ALL=C comm -23 sorted_ref_llvm_3.5 sorted_result])
-AT_CLEANUP
diff --git a/tests/testsuite-regression.at b/tests/testsuite-regression.at
deleted file mode 100644
index 81112a0..0000000
--- a/tests/testsuite-regression.at
+++ /dev/null
@@ -1,270 +0,0 @@
-
-AT_BANNER([Regression tests])
-
-AT_SETUP([phi nodes not replicated (repl) - lp:927573])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_loop_phi_replication], 0)
-AT_CLEANUP
-
-AT_SETUP([phi nodes not replicated (loops) - lp:927573])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_loop_phi_replication], 0)
-AT_CLEANUP
-
-AT_SETUP([issues with local pointers (repl) - lp:918801])
-AT_KEYWORDS([regression locals])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_locals], 0)
-AT_CLEANUP
-
-AT_SETUP([issues with local pointers (loops) - lp:918801])
-AT_KEYWORDS([regression locals])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_locals], 0)
-AT_CLEANUP
-
-AT_SETUP([barrier between two for loops (repl)])
-AT_KEYWORDS([regression tce])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_barrier_between_for_loops], 0)
-AT_CLEANUP
-
-AT_SETUP([barrier between two for loops (loops)])
-AT_KEYWORDS([regression tce])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_barrier_between_for_loops], 0)
-AT_CLEANUP
-
-AT_SETUP([simple for-loop with a barrier inside (repl)])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_simple_for_with_a_barrier], 0)
-AT_CLEANUP
-
-AT_SETUP([simple for-loop with a barrier inside (loops)])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_simple_for_with_a_barrier], 0)
-AT_CLEANUP
-
-AT_SETUP([for-loop with computation after the brexit (repl) - lp:938123])
-AT_KEYWORDS([regression tce])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers], 0)
-AT_CLEANUP
-
-AT_SETUP([for-loop with computation after the brexit (loops) - lp:938123])
-AT_KEYWORDS([regression tce])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_multi_level_loops_with_barriers], 0)
-AT_CLEANUP
-
-AT_SETUP([for-loop with a variable iteration count (repl) - lp:938883])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_for_with_var_iteration_count], 0)
-AT_CLEANUP
-
-AT_SETUP([for-loop with a variable iteration count (loops) - lp:938883])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-#this broke on ppc in commit 525, for LLVM 3.1 
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&\
-  grep -q "define LLVM_3_1" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_for_with_var_iteration_count], 0)
-AT_CLEANUP
-
-AT_SETUP([early return before a barrier region (repl) - lp:940248])
-AT_KEYWORDS([regression early-return tce])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_early_return], 0)
-AT_CLEANUP
-
-AT_SETUP([early return before a barrier region (loops) - lp:940248])
-AT_KEYWORDS([regression early-return tce])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-#AT_SKIP_IF(true)
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_early_return], 0)
-AT_CLEANUP
-
-AT_SETUP([id-dependent computation before kernel exit (repl) - lp:940549])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_id_dependent_computation], 0)
-AT_CLEANUP
-
-AT_SETUP([id-dependent computation before kernel exit (loops) - lp:940549])
-AT_KEYWORDS([regression])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_id_dependent_computation], 0)
-AT_CLEANUP
-
-AT_SETUP([struct kernel arguments - lp:987905])
-AT_XFAIL_IF([true])
-# Skip for now as this passes and fails depending on the target (ABI).
-AT_SKIP_IF([true])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_KEYWORDS([regression struct])
-AT_CHECK([$abs_top_builddir/tests/regression/test_structs_as_args], 0)
-AT_CLEANUP
-
-AT_SETUP([vector kernel arguments - lp:987905])
-AT_SKIP_IF([true])
-# Skip for now as this passes and fails depending on the target (ABI).
-AT_KEYWORDS([regression vectorarg])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-# This test fails with an assert. If asserts are off, it succeeds.
-# It fails depending if the argument list generated to the kernel
-# function happens to be 1:1 with the clKernelSetArgs indices or not
-# and that depends on the CC/ABI of the target at hand.
-AT_XFAIL_IF([grep LLVM_CXX_FLAGS $abs_top_builddir/config.log | grep -q -v DNDEBUG])
-AT_XFAIL_IF([grep LLVM_CXX_FLAGS $abs_top_builddir/config.log | grep -q _DEBUG])
-AT_CHECK([$abs_top_builddir/tests/regression/test_vectors_as_args], 0)
-AT_CLEANUP
-
-AT_SETUP([barrier just before return (repl) - lp:1012030])
-AT_KEYWORDS([regression struct])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_barrier_before_return], 0)
-AT_CLEANUP
-
-AT_SETUP([barrier just before return (loops) - lp:1012030])
-AT_KEYWORDS([regression struct])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_barrier_before_return], 0)
-AT_CLEANUP
-
-AT_SETUP([infinite loop (repl) - lp:941558])
-# This used to be also tce test, but not anymore
-# (requires threading for tce driver)
-AT_KEYWORDS([regression infinite-loop])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_SKIP_IF([ env | grep -q POCL_IMPLICIT_FINISH])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_infinite_loop], 0)
-AT_CLEANUP
-
-AT_SETUP([infinite loop (loops) - lp:941558])
-AT_KEYWORDS([regression infinite-loop])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_SKIP_IF([ env | grep -q POCL_IMPLICIT_FINISH])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_infinite_loop], 0)
-AT_CLEANUP
-
-AT_SETUP([passing a constant array as an arg - lp:1032203])
-# This used to be also tce test, but not anymore
-# (requires threading for tce driver)
-AT_KEYWORDS([regression const-array])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([$abs_top_builddir/tests/regression/test_constant_array], 0)
-AT_CLEANUP
-
-AT_SETUP([undominated variable from conditional barrier handling (repl) - lp:1045835])
-AT_KEYWORDS([regression undominated])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_undominated_variable], 0)
-AT_CLEANUP
-
-AT_SETUP([undominated variable from conditional barrier handling (loops) - lp:1045835])
-AT_KEYWORDS([regression undominated])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-#this broke on ppc in commit 525, for LLVM 3.1
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc &&\
-  grep -q "define LLVM_3_1" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/regression/test_undominated_variable], 0)
-AT_CLEANUP
-
-AT_SETUP([clSetKernelArg overwriting the previous kernel's args - lp:1075134])
-AT_KEYWORDS([regression setkernelarg])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([$abs_top_builddir/tests/regression/test_setargs], 0)
-AT_CLEANUP
-
-AT_SETUP([setting a buffer argument to NULL causes a segfault - lp:1109030])
-AT_KEYWORDS([regression nullarg])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([$abs_top_builddir/tests/regression/test_null_arg], 0)
-AT_CLEANUP
-
-AT_SETUP([sizeof(uint)])
-AT_KEYWORDS([sizeof])
-AT_CHECK_UNQUOTED([$abs_top_builddir/tests/kernel/kernel test_sizeof], 0,
-[$(cat $abs_top_srcdir/tests/kernel/test_sizeof_expout.txt)
-])
-AT_CLEANUP
-
-AT_SETUP([block])
-AT_KEYWORDS([block])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([$abs_top_builddir/tests/kernel/kernel test_block], 0,
-[$(cat $abs_top_srcdir/tests/kernel/test_block_expout.txt)
-])
-# LLVM 3.4's crashes with an illegal const expr cast. Unresolved.
-AT_XFAIL_IF([grep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-AT_CLEANUP
-
-AT_SETUP([case with multiple variable length loops and a barrier in one])
-AT_KEYWORDS([regression varlengthloops])
-AT_SKIP_IF([! grep "#define HAVE_OPENCL_HPP" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/regression/test_fors_with_var_iteration_counts], 0)
-AT_CLEANUP
-
-AT_SETUP([assigning a loop iterator variable to a private makes it local - issue 94 (repl)])
-AT_KEYWORDS([regression looppriv])
-AT_DATA([expout],
-[Changed value at global_id: 67599, local_id 3, group_id 16899, to: 854
-Value is changed at global_id: 67599, local_id 3, group_id 16899, to: 854
-])
-AT_CHECK([POCL_WORK_GROUP_METHOD=repl $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([assigning a loop iterator variable to a private makes it local - issue 94 (loops)])
-AT_KEYWORDS([regression looppriv])
-AT_DATA([expout],
-[Changed value at global_id: 67599, local_id 3, group_id 16899, to: 854
-Value is changed at global_id: 67599, local_id 3, group_id 16899, to: 854
-])
-AT_CHECK([POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([assigning a loop iterator variable to a private makes it local 2 - issue 102 (repl)])
-AT_KEYWORDS([regression looppriv])
-AT_DATA([expout],
-[changing the value at global_id: 6, local_id 2, group_id 1, to: 3
-value is changed at global_id: 6, local_id 2, group_id 1, to: 3
-])
-AT_CHECK([POCL_WORK_GROUP_METHOD=repl $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([assigning a loop iterator variable to a private makes it local 2 - issue 102 (loops)])
-AT_KEYWORDS([regression looppriv])
-AT_DATA([expout],
-[changing the value at global_id: 6, local_id 2, group_id 1, to: 3
-value is changed at global_id: 6, local_id 2, group_id 1, to: 3
-])
-AT_CHECK([POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/regression/test_assign_loop_variable_to_privvar_makes_it_local_2], 0, expout)
-AT_CLEANUP
-
-# See: https://github.com/pocl/pocl/issues/195
-# Possibly still fails with multi-AS targets.
-AT_SETUP([local struct arrays produce illegal AS casts])
-AT_KEYWORDS([regression local_struct_array])
-# The HSA branch of LLVM 3.7 is probably outdated in comparison
-# to the upstream trunk and makes this case fail.
-AT_SKIP_IF([grep -q "#define BUILD_HSA" $abs_top_builddir/config.h])
-AT_CHECK_UNQUOTED([$abs_top_builddir/tests/kernel/kernel test_local_struct_array], 0,
-[Running test test_local_struct_array...
-OK
-])
-AT_XFAIL_IF([grep -q "#define LLVM_BUILT_WITH_ASSERTS" $abs_top_builddir/config.h && \
-! grep -q "#define LLVM_3_2" $abs_top_builddir/config.h && \
-! grep -q "#define LLVM_3_3" $abs_top_builddir/config.h])
-AT_CLEANUP
-
-
-# https://github.com/pocl/pocl/issues/231
-AT_SETUP([LoopVectorizer crash with Haswell and Broadwell - issue 231])
-AT_KEYWORDS([regression issue_231])
-AT_CHECK([$abs_top_builddir/tests/regression/test_issue_231], 0)
-AT_CLEANUP
diff --git a/tests/testsuite-rodinia.at b/tests/testsuite-rodinia.at
deleted file mode 100644
index 483c932..0000000
--- a/tests/testsuite-rodinia.at
+++ /dev/null
@@ -1,108 +0,0 @@
-
-dnl POAT_ROD_SETUP(name, extra_keywords) 
-m4_define([POAT_ROD_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([rodinia $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" rodinia "*],[false],[:])])
-  # Rodinia does not (yet) work with the (OCL-)ICD build.
-  AT_SKIP_IF([grep -q "#define HAVE_OCL_ICD 1" $abs_top_builddir/config.h])
-])
-dnl POAT_ROD_CHECK_BUILD(name, extra build cmd)
-m4_define([POAT_ROD_CHECK_BUILD],[
-  AT_CHECK([make build-$1 -sC ${abs_top_builddir}/examples/Rodinia 2>&1 | egrep -v "^make" $2 ], 0,
-[  [[Building Rodinia $1]]
-])
-])
-
-AT_BANNER([Rodinia 2.0.1 tests])
-
-POAT_ROD_SETUP([backprop],[long])
-POAT_ROD_CHECK_BUILD([backprop],[ | egrep -v "^backprop_ocl.cpp:"])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/backprop; ./run | grep .], 0,
-[Random number generator seed: 7
-Input layer size : 65536
-Starting training kernel
-num_devices = 1
-Performing GPU computation
-Finish the training for one iteration
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([bfs])
-POAT_ROD_CHECK_BUILD([bfs], [ | egrep -v "^CLHelper.h:|^In file included from bfs.cpp:" ])
-# Test passes NVIDIA-specific parameters to clBuildProgram which is now
-# detected as an unsupported parameter by pocl.
-AT_SKIP_IF([true])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/bfs; ./run 2>&1|grep -v "incomplete" | grep -v "argument unused" | grep .], 0,
-[Reading File
---cambine:passed:-)
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([cfd],[long])
-POAT_ROD_CHECK_BUILD([cfd],[| egrep -v "^euler3d.cpp:" ])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/cfd; ./run 2>&1| grep . | egrep -v "incomplete|device.name"], 0,
-[--cambine: nel=97046, nelr=97152
-Starting...
-Saving solution...
-Saved solution...
-Cleaning up...
-Done...
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([lud])
-POAT_ROD_CHECK_BUILD([lud],[ | egrep -v "^lud.cpp:|^../common/common.c:" ])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/lud/ocl; ./run | grep "Time con" | cut -c -18 | grep .], 0,
-[Time consumed(ms):
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([hotspot],[long])
-POAT_ROD_CHECK_BUILD([hotspot])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/hotspot; ./run | grep "time" | cut -d ' ' -f1 | grep .], 0,
-[Kernel
-Total
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([kmeans])
-POAT_ROD_CHECK_BUILD([kmeans],[ | egrep -v "^kmeans.cpp:|^read_input.c:" ])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/kmeans; ./run | grep "Number of" | grep .], 0,
-[Number of objects: 494020
-Number of features: 34
-Number of Iteration: 1
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([lavaMD])
-POAT_ROD_CHECK_BUILD([lavaMD],[ | egrep -v "^./kernel/kernel_gpu_opencl_wrapper.c:|^rm:" ])
-# This test case uses struct kernel arguments which currently do not
-# work correctly due to assuming the ABI maps variables 1:1 to kernel
-# arguments and at least AMD64 seems to sometimes split the struct
-# arguments to multiple scalar arguments. This (falsely) passes with
-# the pthread device and crashes with the basic device.
-AT_SKIP_IF([true])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/lavaMD; ./run | grep "Total time:"], 0,
-[Total time:
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([pathfinder])
-POAT_ROD_CHECK_BUILD([pathfinder], [ | egrep -v "^OpenCL.cpp:" ])
-# This started to fail at around 2014-12-03.
-AT_SKIP_IF([true])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/pathfinder; \
-POCL_MAX_WORK_GROUP_SIZE=2 ./run 2>&1 | grep -v "pocl warning:" ; \
-cat result.txt 2>&1 | egrep -v "DEVICE_|PROFILE|VERSION|NAME|EXTENSIONS" | grep .], 0,
-[$(cat $abs_top_srcdir/examples/Rodinia/pathfinder.stdout)
-])
-AT_CLEANUP
-
-POAT_ROD_SETUP([srad])
-POAT_ROD_CHECK_BUILD([srad], [ | egrep -v "^main.cpp:" ])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/Rodinia/rodinia_2.0.1/opencl/srad; \
-POCL_MAX_WORK_GROUP_SIZE=2 ./run 2>&1 | grep "Total time" | grep .], 0,
-[Total time:
-])
-AT_CLEANUP
diff --git a/tests/testsuite-runtime.at b/tests/testsuite-runtime.at
deleted file mode 100644
index bc2d9d4..0000000
--- a/tests/testsuite-runtime.at
+++ /dev/null
@@ -1,106 +0,0 @@
-
-AT_BANNER([Runtime library tests])
-
-AT_SETUP([clGetDeviceInfo])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clGetDeviceInfo])
-AT_CLEANUP
-
-AT_SETUP([clEnqueueNativeKernel])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clEnqueueNativeKernel])
-AT_CLEANUP
-
-AT_SETUP([clGetEventInfo])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clGetEventInfo])
-AT_CLEANUP
-
-AT_SETUP([read/copy/write buffer])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_read-copy-write-buffer])
-AT_CLEANUP
-
-AT_SETUP([event cycle])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_event_cycle])
-AT_CLEANUP
-
-AT_SETUP([event freeing])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_event_free])
-AT_CLEANUP
-
-AT_SETUP([clCreateProgramWithBinary])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clCreateProgramWithBinary])
-AT_CLEANUP
-
-#test_clBuildProgram tests include paths so it must be executed in the directory
-AT_SETUP([clBuildProgram])
-AT_KEYWORDS([runtime])
-AT_CHECK([cd $abs_top_srcdir/tests/runtime/; $abs_top_builddir/tests/runtime/test_clBuildProgram], 0, ignore, ignore)
-AT_CLEANUP
-
-#test_kernel_cache_includes tests include paths so it must be executed in the directory
-AT_SETUP([test_kernel_cache_includes])
-AT_KEYWORDS([runtime])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/tests/runtime/; $abs_top_builddir/tests/runtime/test_kernel_cache_includes], 0,
-[$(cat $abs_top_srcdir/tests/runtime/test_kernel_cache_includes_expout.txt)
-])
-AT_CLEANUP
-
-AT_SETUP([enqueue_kernel_created_with_binary])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_enqueue_kernel_from_binary], 0)
-AT_CLEANUP
-
-
-AT_SETUP([clBuildProgram link error])
-AT_XFAIL_IF(true)
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_link_error])
-AT_CLEANUP
-
-
-AT_SETUP([clFinish])
-AT_KEYWORDS([runtime])
-AT_CHECK_UNQUOTED([$abs_top_builddir/tests/runtime/test_clFinish | grep "ABABC"], 0, [ABABC
-])
-AT_CLEANUP
-
-AT_SETUP([clSetEventCallback])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clSetEventCallback], 0, ignore,
-ignore)
-AT_CLEANUP
-
-AT_SETUP([clGetSupportedImageFormats])
-AT_KEYWORDS([runtime])
-AT_CHECK([POCL_DEVICES="pthread pthread" $abs_top_builddir/tests/runtime/test_clGetSupportedImageFormats])
-AT_CLEANUP
-
-AT_SETUP([clCreateKernelsInProgram])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clCreateKernelsInProgram] , 0,
-[Hello
-World
-])
-AT_CLEANUP
-
-AT_SETUP([clCreateKernel])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clCreateKernel] , 0, [OK
-])
-AT_CLEANUP
-
-AT_SETUP([clGetKernelArgInfo])
-AT_XFAIL_IF([grep -q "#define LLVM_3_2" $abs_top_builddir/config.h])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clGetKernelArgInfo], 0, ignore, ignore)
-AT_CLEANUP
-
-AT_SETUP([clCreateSubDevices])
-AT_KEYWORDS([runtime])
-AT_CHECK([$abs_top_builddir/tests/runtime/test_clCreateSubDevices])
-AT_CLEANUP
diff --git a/tests/testsuite-samples.at b/tests/testsuite-samples.at
deleted file mode 100644
index 9061774..0000000
--- a/tests/testsuite-samples.at
+++ /dev/null
@@ -1,114 +0,0 @@
-
-m4_define([POAT_SAMPLES_SKIP], [
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" samples "*],[false],[:])])
-])
-
-AT_BANNER([OpenCL Programming Guide Samples])
-
-AT_SETUP([Building the sources against pocl])
-AT_KEYWORDS([booksamples buildsamples])
-POAT_SAMPLES_SKIP
-AT_DATA([expout],
-[Built target HelloWorld
-Built target OpenCLInfo
-Built target OpenCLConvolutionChap3
-Built target HelloBinaryWorld
-Built target SimpleBufferSubBuffer
-Built target ImageFilter2D
-Built target vecadd
-Built target histogram
-Built target Dijkstra
-Built target spmv
-])
-AT_CHECK([sed -i 's/#include "bmpLoader.hpp"/\/\/#include "bmpLoader.hpp/g' \
-$abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_12/Sinewave/sinewave.cpp;
-make -sC $abs_top_builddir/examples/opencl-book-samples clean build 2>&1 | grep 'Built target' | cut -c8-], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Run Chapter 2: Hello World])
-AT_KEYWORDS([booksamples helloworld])
-POAT_SAMPLES_SKIP
-
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_2/HelloWorld; ./HelloWorld | grep -v "Could not create GPU context, trying CPU"], 0, 
-[`cat $abs_top_srcdir/examples/opencl-book-samples/HelloWorld.stdout`
-])
-          
-AT_CLEANUP
-
-AT_SETUP([Run Chapter 3: OpenCLConvolution])
-AT_KEYWORDS([booksamples])
-POAT_SAMPLES_SKIP
-
-AT_CHECK_UNQUOTED([
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_3/OpenCLConvolution
-ln -sf $abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_3/OpenCLConvolution/Convolution.cl 
-./OpenCLConvolutionChap3
-], 0,
-[`cat $abs_top_srcdir/examples/opencl-book-samples/OpenCLConvolutionChap3.stdout`
-])
-          
-AT_CLEANUP
-
-AT_SETUP([Run Chapter 6: HelloBinaryWorld])
-AT_KEYWORDS([booksamples hellobinaryworld])
-POAT_SAMPLES_SKIP
-AT_CHECK_UNQUOTED([
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_6/HelloBinaryWorld
-rm -f HelloWorld.cl.bin
-./HelloBinaryWorld | grep -v "Could not create GPU"
-./HelloBinaryWorld | grep -v "Could not create GPU"
-], 0, 
-[`cat $abs_top_srcdir/examples/opencl-book-samples/HelloBinaryWorld.stdout`
-])
-          
-AT_CLEANUP
-
-AT_SETUP([Run Chapter 7: SimpleBufferSubBuffer])
-AT_KEYWORDS([booksamples simplebuffersubbuffer])
-POAT_SAMPLES_SKIP
-
-AT_CHECK_UNQUOTED([
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_7/SimpleBufferSubBuffer
-./SimpleBufferSubBuffer --useMap | egrep -v "_TYPE_GPU|16 17 18 19"
-], 0, 
-[`cat $abs_top_srcdir/examples/opencl-book-samples/SimpleBufferSubBuffer.stdout`
-], [ignore])
-          
-AT_CLEANUP
-
-AT_SETUP([Run Chapter 8: ImageFilter2D])
-AT_KEYWORDS([booksamples imagefilter2d])
-POAT_SAMPLES_SKIP
-AT_CHECK_UNQUOTED([ 
-cd ${abs_top_srcdir}/examples/opencl-book-samples/checkout/src/Chapter_8/ImageFilter2D ;
-cp ${abs_top_srcdir}/examples/opencl-book-samples/ImageFilter2D.cl ./
-sed "6s/float/constant float/" -i ImageFilter2D.cl ;
-sed '418cclFinish(commandQueue);' -i ImageFilter2D.cpp ;
-cd ${abs_top_builddir}/examples/opencl-book-samples/build/src/Chapter_8/ImageFilter2D ;
-make]
-, 0, [ignore], [ignore])
-
-AT_CHECK_UNQUOTED([
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_8/ImageFilter2D
-./ImageFilter2D $abs_top_srcdir/examples/opencl-book-samples/checkout/src/Chapter_19/oclFlow/data/minicooper/frame10.png output.png
-], 0, 
-[Could not create GPU context, trying CPU...
-
-Executed program succesfully.
-], [ignore])
-          
-AT_CLEANUP
-
-AT_SETUP([Run Chapter 12: VectorAdd (C++ bindings)])
-AT_KEYWORDS([booksamples])
-POAT_SAMPLES_SKIP
-
-AT_CHECK_UNQUOTED([
-cd $abs_top_builddir/examples/opencl-book-samples/build/src/Chapter_12/VectorAdd
-./vecadd
-], 0, 
-[`cat $abs_top_srcdir/examples/opencl-book-samples/VectorAdd.stdout`
-], [ignore])
-          
-AT_CLEANUP
-
diff --git a/tests/testsuite-tce.at b/tests/testsuite-tce.at
deleted file mode 100644
index ff60c0e..0000000
--- a/tests/testsuite-tce.at
+++ /dev/null
@@ -1,67 +0,0 @@
-
-AT_BANNER([TCE tests])
-
-AT_SETUP([A basic ttasim driver test])
-AT_KEYWORDS([tce tta ttasim])
-AT_SKIP_IF([! grep -c "define TCE_AVAILABLE" $abs_top_builddir/config.h])
-AT_CHECK([make -s -C $abs_top_builddir/tests/tce/ttasim run | egrep -v "^make"], 0, 
-[PING23456.000000 2000001OK
-])
-AT_CLEANUP
-
-AT_SETUP([Half-precision floats on ttasim (repl)])
-AT_XFAIL_IF([grep -q "#define LLVM_3_2" $abs_top_builddir/config.h])
-AT_KEYWORDS([tce tta ttasim half])
-AT_SKIP_IF([! grep -c "define TCE_AVAILABLE" $abs_top_builddir/config.h])
-AT_CHECK([POCL_WORK_GROUP_METHOD=repl make -s -C $abs_top_builddir/tests/tce/fp16 run | egrep -v "^make"], 0, 
-[through conversion: 42
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-])
-AT_CLEANUP
-
-AT_SETUP([Half-precision floats on ttasim (loopvec)])
-AT_KEYWORDS([tce tta ttasim half])
-AT_SKIP_IF([! grep -c "define TCEMC_AVAILABLE" $abs_top_builddir/config.h])
-AT_CHECK([make -s -C $abs_top_builddir/tests/tce/fp16 run | egrep -v "^make"], 0, 
-[through conversion: 42
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-2.500000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-32.000000
-])
-AT_CLEANUP
-
-AT_SETUP([A basic TCEMC test])
-AT_KEYWORDS([tce tta ttasim tcemc])
-AT_SKIP_IF([! grep -c "define TCEMC_AVAILABLE" $abs_top_builddir/config.h])
-AT_CHECK([make -s -C $abs_top_builddir/tests/tce/tcemc run | egrep -v "^make"], 0, 
-[PING23456.000000 2000001OK
-])
-AT_CLEANUP
diff --git a/tests/testsuite-vexcl.at b/tests/testsuite-vexcl.at
deleted file mode 100644
index 9faffbd..0000000
--- a/tests/testsuite-vexcl.at
+++ /dev/null
@@ -1,80 +0,0 @@
-dnl POAT_PB_SETUP(name, extra_keywords) 
-m4_define([POAT_PB_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([vexcl $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" vexcl "*],[false],[:])])
-])
-
-AT_BANNER([VexCL tests])
-
-POAT_PB_SETUP([fft])
-# Warnings ignored:
-# clang: warning: argument unused during compilation: '-cl-mad-enable'
-# clang: warning: argument unused during compilation: '-cl-fast-relaxed-math'
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./fft 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([generator])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./generator 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([multiple_objects])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./multiple_objects 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([multivector_arithmetics])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./multivector_arithmetics 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([multivector_create])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./multivector_create 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([random])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./random 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([spmv])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./spmv 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([stencil])
-# Crash.
-AT_XFAIL_IF([true])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./stencil 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([vector_arithmetics])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_arithmetics 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([vector_copy])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_copy 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
-POAT_PB_SETUP([vector_create])
-AT_CHECK_UNQUOTED([cd $abs_top_builddir/examples/VexCL/vexcl/tests ; ./vector_create 2>&1 | grep "No errors"], 0, 
-[*** No errors detected
-], ignore)
-AT_CLEANUP
-
diff --git a/tests/testsuite-viennacl.at b/tests/testsuite-viennacl.at
deleted file mode 100644
index 9b18409..0000000
--- a/tests/testsuite-viennacl.at
+++ /dev/null
@@ -1,201 +0,0 @@
-
-dnl POAT_VCL_SETUP(name, extra_keywords) 
-m4_define([POAT_VCL_SETUP],[
-  AT_SETUP([$1])
-  AT_KEYWORDS([viennacl $1 $2])
-  AT_SKIP_IF([AS_CASE([" $POAT_TESTSUITES "],[*" viennacl "*],[false],[:])])
-])
-dnl POAT_VCL_CHECK_RUN(name, sed_postprocess)
-m4_define([POAT_VCL_CHECK_RUN],[
-  cat $abs_top_srcdir/examples/ViennaCL/$(basename $1).stdout > expout
-  AT_CHECK_UNQUOTED(
-    [$abs_top_builddir/examples/ViennaCL/ViennaCL-1.5.1/build/$1 2>&1 | sed -e ''$2], 0,
-    [expout])
-])
-AT_BANNER([ViennaCL 1.5.1 tests])
-
-POAT_VCL_SETUP([fft])
-POAT_VCL_CHECK_RUN([examples/tutorial/fft])
-AT_CLEANUP
-
-POAT_VCL_SETUP([custom-context])
-POAT_VCL_CHECK_RUN([examples/tutorial/custom-context], ['1d;2d;3d;4d;8d;9d'])
-AT_CLEANUP
-
-POAT_VCL_SETUP([custom-kernels])
-POAT_VCL_CHECK_RUN([examples/tutorial/custom-kernels])
-AT_CLEANUP
-
-POAT_VCL_SETUP([scheduler])
-POAT_VCL_CHECK_RUN([examples/tutorial/scheduler],['1d;2d'])
-AT_CLEANUP
-
-#POAT_VCL_SETUP([blas1])
-##uninvestigated pocl issue
-#AT_XFAIL_IF([true])
-#POAT_VCL_CHECK_RUN([examples/tutorial/blas1])
-#AT_CLEANUP
-#
-POAT_VCL_SETUP([bandwidth-reduction], [long])
-POAT_VCL_CHECK_RUN([examples/tutorial/bandwidth-reduction])
-AT_CLEANUP
-
-#POAT_VCL_SETUP([blas3_prod_double-test-opencl])
-## ViennaCL borks with "Generator: Key not found in map"
-#AT_XFAIL_IF([true])
-#POAT_VCL_CHECK_RUN(tests/blas3_prod_double-test-opencl)
-#AT_CLEANUP
-#
-#POAT_VCL_SETUP([blas3_prod_float-test-opencl])
-## ViennaCL borks with "Generator: Key not found in map"
-#AT_XFAIL_IF([true])
-#POAT_VCL_CHECK_RUN(tests/blas3_prod_float-test-opencl)
-#AT_CLEANUP
-
-POAT_VCL_SETUP([blas3_solve_double-test-opencl], [long])
-#rouning/codegen/float mode errors:
-#- * lower_tag:       passed! 5.16861e-07
-#+ * lower_tag:       passed! 0
-AT_XFAIL_IF([true])
-POAT_VCL_CHECK_RUN(tests/blas3_solve_double-test-opencl)
-AT_CLEANUP
-
-#POAT_VCL_SETUP([blas3_solve_float-test-opencl], [long])
-#rouning/codegen/float mode errors:
-#- * lower_tag:       passed! 5.16861e-07
-#+ * lower_tag:       passed! 0
-#AT_XFAIL_IF([true])
-#POAT_VCL_CHECK_RUN(tests/blas3_solve_float-test-opencl)
-#AT_CLEANUP
-
-POAT_VCL_SETUP([external_linkage-opencl])
-# This is a buggy test as it relies on the destruction order
-# of global objects. Wrong order results in a crash due to
-# a dangling OpenCL object pointer.
-AT_SKIP_IF(true)
-POAT_VCL_CHECK_RUN([tests/external_linkage-opencl])
-AT_CLEANUP
-
-#POAT_VCL_SETUP([generator_blas2-test-opencl])
-## ViennaCL borks with "Generator: Key not found in map"
-#AT_XFAIL_IF([true])
-#POAT_VCL_CHECK_RUN(tests/generator_blas2-test-opencl)
-#AT_CLEANUP
-#
-#POAT_VCL_SETUP([generator_blas3-test-opencl])
-## ViennaCL borks with "Generator: Key not found in map"
-## also needs to skip a few lines
-#AT_XFAIL_IF([true])
-#POAT_VCL_CHECK_RUN(tests/generator_blas3-test-opencl)
-#AT_CLEANUP
-#
-POAT_VCL_SETUP([global_variables-test-opencl])
-POAT_VCL_CHECK_RUN(tests/global_variables-test-opencl)
-AT_CLEANUP
-
-POAT_VCL_SETUP([iterators-test-opencl])
-POAT_VCL_CHECK_RUN(tests/iterators-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-# 3.5 introduced the noduplicate attribute which, when
-# used with barrier(), fixes this.
-POAT_VCL_SETUP([matrix_col_double-test-opencl long])
-POAT_VCL_CHECK_RUN(tests/matrix_col_double-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_col_float-test-opencl long])
-POAT_VCL_CHECK_RUN(tests/matrix_col_float-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_col_int-test-opencl])
-#uninvestigated
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-POAT_VCL_CHECK_RUN(tests/matrix_col_int-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_row_double-test-opencl long])
-POAT_VCL_CHECK_RUN(tests/matrix_row_double-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_row_float-test-opencl long])
-POAT_VCL_CHECK_RUN(tests/matrix_row_float-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_row_int-test-opencl])
-#uninvestigated
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-POAT_VCL_CHECK_RUN(tests/matrix_row_int-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_vector_int-test-opencl])
-#uninvestigated
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q armv])
-POAT_VCL_CHECK_RUN(tests/matrix_vector_int-test-opencl)
-AT_CLEANUP
-
-#note: the reference check for this one is from pocl, as 
-#(to be unnamed) reference platform errored out
-POAT_VCL_SETUP([matrix_vector-test-opencl long])
-POAT_VCL_CHECK_RUN(tests/matrix_vector-test-opencl)
-AT_CLEANUP
-
-POAT_VCL_SETUP([nmf-test-opencl])
-# ViennaCL borks with "Generator: Key not found in map"
-#Note: uncomment a few other tests if fixing this
-AT_XFAIL_IF([true])
-POAT_VCL_CHECK_RUN(tests/nmf-test-opencl)
-AT_CLEANUP
-
-
-POAT_VCL_SETUP([scalar-test-opencl])
-POAT_VCL_CHECK_RUN(tests/scalar-test-opencl)
-AT_CLEANUP
-
-POAT_VCL_SETUP([structured-matrices-test-opencl])
-#float/double precision errors. Might not even be 
-#errors, the test seems to pass.
-#TODO, investigate, skip result passing with sed 
-AT_SKIP_IF([true])
-POAT_VCL_CHECK_RUN(tests/structured-matrices-test-opencl)
-AT_CLEANUP
-
-POAT_VCL_SETUP([vector_double-test-opencl])
-#fails with "Could not find a dominating alternative variable"
-#Note: uncomment a few other tests if fixing this
-AT_XFAIL_IF([egrep -q "#define LLVM_3_4" $abs_top_builddir/config.h])
-POAT_VCL_CHECK_RUN(tests/vector_double-test-opencl)
-AT_CLEANUP
-
-#POAT_VCL_SETUP([vector_float-test-opencl])
-##"Could not find a dominating alternative variable"
-#POAT_VCL_CHECK_RUN(tests/vector_float-test-opencl)
-#AT_CLEANUP
-#
-#POAT_VCL_SETUP([vector_int-test-opencl])
-##"Could not find a dominating alternative variable"
-#POAT_VCL_CHECK_RUN(tests/vector_int-test-opencl)
-#AT_CLEANUP
-#
-#POAT_VCL_SETUP([vector_multi_inner_prod-test-opencl])
-##"Could not find a dominating alternative variable"
-#POAT_VCL_CHECK_RUN(tests/vector_multi_inner_prod-test-opencl)
-#AT_CLEANUP
-#
-#POAT_VCL_SETUP([vector_uint-test-opencl])
-##"Could not find a dominating alternative variable"
-#POAT_VCL_CHECK_RUN(tests/vector_uint-test-opencl)
-#AT_CLEANUP
diff --git a/tests/testsuite-workgroup.at b/tests/testsuite-workgroup.at
deleted file mode 100644
index bad8fcd..0000000
--- a/tests/testsuite-workgroup.at
+++ /dev/null
@@ -1,160 +0,0 @@
-# Testsuite for the kernel compiler's work group function generation functionality.
-#
-# Makefile.am for pocl/lib/kernel.
-# 
-# Copyright (c) 2011 Universidad Rey Juan Carlos
-# Copyright (c) 2011- Pekka Jääskeläinen / Tampere University of Technology
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-AT_BANNER([Workgroup creation tests])
-
-AT_SETUP([unconditional barriers (full replication)])
-AT_KEYWORDS([workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/basic_barriers_2_2_2_2.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([unconditional barriers (loops)])
-AT_KEYWORDS([workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel basic_barriers.cl 2 2 2 2], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/basic_barriers_2_2_2_2.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([unbarriered for loops (full replication)])
-AT_KEYWORDS([workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1], 0, 
-[$(cat $abs_top_srcdir/tests/workgroup/forloops_2_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([unbarriered for loops (loops)])
-AT_KEYWORDS([workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel forloops.cl 2 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/forloops_2_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([barriered for loops (full replication)])
-AT_KEYWORDS([workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/loopbarriers_2_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([barriered for loops (loops)])
-AT_KEYWORDS([workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel loopbarriers.cl 2 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/loopbarriers_2_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([conditional barrier (full replication)])
-AT_KEYWORDS([condbar workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/cond_barriers_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([conditional barrier (loops)])
-AT_KEYWORDS([condbar workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel conditional_barriers.cl 1 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/cond_barriers_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([b-loop with none of the WIs reaching the barrier (full replication)])
-AT_KEYWORDS([b-loop workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/tricky_for_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([b-loop with none of the WIs reaching the barrier (loops)])
-AT_KEYWORDS([b-loop workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel tricky_for.cl 1 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/tricky_for_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([forcing horizontal parallelization to some outer loops (repl)])
-AT_KEYWORDS([workgroup outerlooppar])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1], 0, 
-[$(cat $abs_top_srcdir/tests/workgroup/outerlooppar_2_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([forcing horizontal parallelization to some outer loops (loops)])
-AT_KEYWORDS([workgroup outerlooppar])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel outerlooppar.cl 2 2 1 1], 0, 
-[$(cat $abs_top_srcdir/tests/workgroup/outerlooppar_2_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([different implicit barrier injection scenarios (loops)])
-AT_KEYWORDS([workgroup implicit])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/workgroup/run_kernel implicit_barriers.cl 1 2 1 1], 0, 
-[$(cat $abs_top_srcdir/tests/workgroup/implicit_barriers_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([loop with two paths to the latch (full replication)])
-AT_KEYWORDS([twolatchpaths workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/for_bug_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([loop with two paths to the latch (loops)])
-AT_KEYWORDS([twolatchpaths workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel for_bug.cl 1 2 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/for_bug_1_2_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([b-loop with two latches (full replication)])
-AT_KEYWORDS([twolatches workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/multilatch_bloop_1_3_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([b-loop with two latches (loops)])
-AT_KEYWORDS([twolatches workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/workgroup/run_kernel multilatch_bloop.cl 1 3 1 1], 0,
-[$(cat $abs_top_srcdir/tests/workgroup/multilatch_bloop_1_3_1_1.stdout)
-])
-AT_CLEANUP
-
-AT_SETUP([workgroup_sizes: work-items get wrong ids (full replication)])
-AT_KEYWORDS([id workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemrepl $abs_top_builddir/tests/workgroup/run_kernel print_all_ids.cl 1 1 1 4 | sort], 0, 
-[$(cat $abs_top_srcdir/tests/workgroup/print_all_ids_114114.txt)
-])
-AT_CLEANUP
-
-
-AT_SETUP([workgroup_sizes: work-items get wrong ids (loop)])
-AT_KEYWORDS([id workgroup])
-AT_CHECK_UNQUOTED([POCL_DEVICES=basic POCL_WORK_GROUP_METHOD=workitemloops $abs_top_builddir/tests/workgroup/run_kernel print_all_ids.cl 1 1 1 4 | sort], 0, 
-[$(cat $abs_top_srcdir/tests/workgroup/print_all_ids_114114.txt)
-])
-AT_CLEANUP
diff --git a/tests/testsuite.at b/tests/testsuite.at
deleted file mode 100644
index 114e370..0000000
--- a/tests/testsuite.at
+++ /dev/null
@@ -1,417 +0,0 @@
-# Testsuite for pocl.
-#
-# Makefile.am for pocl/lib/kernel.
-# 
-# Copyright (c) 2011 Universidad Rey Juan Carlos
-#               2011-2015 pocl developers
-# 
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-# 
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-# 
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-# THE SOFTWARE.
-
-AT_INIT()
-AT_COLOR_TESTS
-
-AT_SETUP([check for pocl version])
-AT_CHECK([POCL_DEVICES=basic $abs_top_builddir/tests/runtime/test_version], 0,
-[basic
-])
-AT_CLEANUP
-
-AT_BANNER([OpenCL specification tests])
-
-AT_SETUP([example1: dot product])
-AT_KEYWORDS([tce hsa])
-AT_CHECK_UNQUOTED([$abs_top_builddir/examples/example1/example1], 0,
-[$(cat $abs_top_srcdir/tests/example_expout.txt)
-])
-AT_CLEANUP
-
-AT_SETUP([example1: dot product (SPIR64)])
-# This SPIR example works because it does not use local memory nor
-# call builtins that are mangled with address spaces.
-AT_SKIP_IF([! grep "#define POCL_DEVICE_ADDRESS_BITS 64" $abs_top_builddir/config.h])
-AT_KEYWORDS([spir])
-AT_CHECK_UNQUOTED([$abs_top_builddir/examples/example1-spir64/example1-spir], 0,
-[$(cat $abs_top_srcdir/tests/example_expout.txt)
-], ignore)
-AT_CLEANUP
-
-AT_SETUP([example1: dot product (SPIR32)])
-# This SPIR example works because it does not use local memory nor
-# call builtins that are mangled with address spaces.
-AT_SKIP_IF([! grep "#define POCL_DEVICE_ADDRESS_BITS 32" $abs_top_builddir/config.h])
-AT_KEYWORDS([spir])
-AT_CHECK_UNQUOTED([$abs_top_builddir/examples/example1-spir32/example1-spir32], 0,
-[$(cat $abs_top_srcdir/tests/example_expout.txt)
-], ignore)
-AT_CLEANUP
-
-AT_SETUP([example2: matrix transpose])
-AT_KEYWORDS([tce hsa])
-AT_CHECK([$abs_top_builddir/examples/example2/example2], 0, [OK
-])
-AT_CLEANUP
-
-AT_SETUP([example2a: matrix transpose (automatic locals)])
-AT_KEYWORDS([tce hsa])
-AT_CHECK([$abs_top_builddir/examples/example2a/example2a], 0, [OK
-])
-AT_CLEANUP
-        
-AT_BANNER([Kernel runtime library])
-
-AT_SETUP([Kernel functions convert_char*])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && 
-             grep -q "#define LLVM_3_2" $abs_top_builddir/config.h])
-AT_KEYWORDS([short16])
-AT_DATA([expout],
-[Running test test_short16...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_short16], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions printf])
-# On 32-bit x86, accessing int4 via va_arg segfaults (wrong alignment?)
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q i686])
-AT_XFAIL_IF([egrep -q "#define LLVM_3_2|#define LLVM_3_3" $abs_top_builddir/config.h])
-AT_KEYWORDS([printf])
-AT_CHECK_UNQUOTED([$abs_top_builddir/tests/kernel/kernel test_printf], 0,
-[$(cat $abs_top_srcdir/tests/test_printf_expout.txt)
-])
-AT_CLEANUP
-
-AT_SETUP([Kernel functions as_type])
-AT_SKIP_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_KEYWORDS([conversion])
-AT_DATA([expout],
-[Running test test_as_type...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_as_type], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions convert_type - scalars])
-# ppc codegen issue, see bug #26
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64])
-AT_KEYWORDS([conversion])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_convert_type_1], 0,
-[Running test test_convert_type_1...
-OK
-])
-AT_CLEANUP
-AT_SETUP([Kernel functions convert_type - vector of 2])
-# ppc codegen issue, see bug #26
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64])
-AT_KEYWORDS([conversion])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_convert_type_2], 0,
-[Running test test_convert_type_2...
-OK
-])
-AT_CLEANUP
-AT_SETUP([Kernel functions convert_type - vector of 4])
-# ppc codegen issue, see bug #26
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64])
-AT_KEYWORDS([conversion])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_convert_type_4], 0,
-[Running test test_convert_type_4...
-OK
-])
-AT_CLEANUP
-AT_SETUP([Kernel functions convert_type - vector of 8])
-# ppc codegen issue, see bug #26
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64])
-AT_KEYWORDS([conversion])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_convert_type_8], 0,
-[Running test test_convert_type_8...
-OK
-])
-AT_CLEANUP
-AT_SETUP([Kernel functions convert_type - vector of 16])
-# ppc codegen issue, see bug #26
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64])
-AT_KEYWORDS([conversion])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_convert_type_16], 0,
-[Running test test_convert_type_16...
-OK
-])
-AT_CLEANUP
-
-# This reproduces an issue that looks like an LLVM code gen issue.
-AT_SETUP([Kernel functions min and max when the operands are of different sign])
-# This used to produces an LLVM 3.3 bug that appeared only with Intel CPUs 
-# without SSE4.2. http://llvm.org/bugs/show_bug.cgi?id=15977
-AT_KEYWORDS([min_max])
-AT_DATA([expout],
-[Running test test_min_max...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_min_max], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions length, distance, and normalize])
-# This tests in particular for unintended overflow
-AT_KEYWORDS([length_distance])
-AT_DATA([expout],
-[Running test test_length_distance...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_length_distance], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions fmin, fmax, fma])
-AT_KEYWORDS([fmin_fmax_fma])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc ])
-AT_DATA([expout],
-[Running test test_fmin_fmax_fma...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_fmin_fmax_fma], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions frexp modf])
-AT_KEYWORDS([frexp_modf])
-AT_DATA([expout],
-[Running test test_frexp_modf...
-frexp(8e2f): 0.8 10
-modf(1.5f): 0.5 1.000000
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_frexp_modf], 0, expout)
-AT_XFAIL_IF([grep -q "#undef USE_VECMATHLIB" $abs_top_builddir/config.h])
-AT_CLEANUP
-
-# This was a an LLVM 3.3 code gen bug.
-AT_SETUP([A saturating conversion from long to uint])
-AT_KEYWORDS([convert_sat_regression])
-AT_DATA([expout],
-[Running test test_convert_sat_regression...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_convert_sat_regression], 0, expout)
-AT_CLEANUP
-
-
-AT_SETUP([Kernel functions abs bitselect clz max min popcount])
-#Fails on tce due to bug #1160383
-#AT_KEYWORDS([tce])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_DATA([expout],
-[Running test test_bitselect...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_bitselect], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions fabs signbit isfinite isinf isnan isnormal copysign ilogb ldexp])
-#Fails with what looks like rounding with vecmathlib on ppc32. ppc64 uninvestigated
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc64])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && 
-             grep -q "#define LLVM_3_2" $abs_top_builddir/config.h])
-AT_KEYWORDS([])
-AT_DATA([expout],
-[Running test test_fabs...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_fabs], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loopvec)])
-AT_DATA([expout],
-[Running test test_hadd...
-OK
-])
-AT_CHECK([POCL_WORK_GROUP_METHOD=loopvec $abs_top_builddir/tests/kernel/kernel test_hadd 2>&1], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions abs abs_diff add_sat hadd mad_hi mad_sat mul_hi rhadd sub_sat (loops)])
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc])
-AT_DATA([expout],
-[Running test test_hadd...
-OK
-])
-AT_CHECK([POCL_WORK_GROUP_METHOD=loops $abs_top_builddir/tests/kernel/kernel test_hadd 2>&1], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions << >> rotate])
-# Fails because of bugs in rotate. 
-# At least in Debian 6.0/x86_64/LLVM 3.2 and
-# Ubuntu 12.04/x86/LLVM 3.2
-AT_XFAIL_IF([grep -q "#define LLVM_3_2" $abs_top_builddir/config.h])
-# tce fails currently this test so removed it from the keywords.
-# https://bugs.launchpad.net/tce/+bug/1180309
-#LLVM 3.3 regression on ppc - looks like an optimization+ppc codegen bug
-AT_XFAIL_IF([grep HOST_CPU $abs_top_builddir/config.h | cut -d\" -f2 | grep -q powerpc && 
-             grep -q "#define LLVM_3_3" $abs_top_builddir/config.h])
-AT_KEYWORDS([rotate])
-AT_DATA([expout],
-[Running test test_rotate...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/kernel test_rotate], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Trigonometric functions])
-AT_DATA([expout],
-[OK
-])
-AT_CHECK([$abs_top_builddir/examples/trig/trig], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Sampler address clamp])
-AT_DATA([expout],
-[Running test test_sampler_address_clamp...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/sampler_address_clamp], 0, expout)
-AT_CLEANUP 
-
-AT_SETUP([Image query functions])
-AT_DATA([expout],
-[Running test test_image_query_funcs...
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/image_query_funcs], 0, expout)
-AT_CLEANUP 
-        
-AT_SETUP([Kernel functions: shuffle charN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle char], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle shortN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle short], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle ushortN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle ushort], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle halfN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_SKIP_IF(true)
-AT_XFAIL_IF([grep -q "#define LLVM_3_3" $abs_top_builddir/config.h ||
-             grep -q "#define LLVM_3_4" $abs_top_builddir/config.h ])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle half], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle intN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle int], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle uintN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle uint], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle floatN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle float], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle longN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle long], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle ulongN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle ulong], 0, expout)
-AT_CLEANUP
-
-AT_SETUP([Kernel functions: shuffle doubleN])
-AT_KEYWORDS([shuffle long])
-AT_DATA([expout],
-OK
-])
-AT_CHECK([$abs_top_builddir/tests/kernel/test_shuffle double], 0, expout)
-AT_CLEANUP
-
-AT_BANNER([Full applications])
-
-AT_SETUP([Scalar wave equation])
-AT_KEYWORDS([scalarwave])
-#this is a check for if doubles are available
-AT_SKIP_IF([grep DISABLE_LONG $abs_top_builddir/Makefile])
-AT_CHECK_UNQUOTED([$abs_top_builddir/examples/scalarwave/scalarwave], 0,
-[$(cat $abs_top_srcdir/tests/scalarwave_expout.txt)
-])
-AT_CLEANUP
-
-AT_SETUP([Intel SVM Coarse-grained])
-AT_KEYWORDS([intel_svm])
-AT_SKIP_IF([grep "define TCE_AVAILABLE" $abs_top_builddir/config.h])
-AT_SKIP_IF([test ! -f $abs_top_srcdir/examples/IntelSVM/source/SVMBasicCoarseGrained/svmbasic])
-AT_CHECK([make test_CoarseGrained -sC $abs_top_builddir/examples/IntelSVM | grep -q PASSED], 0)
-AT_CLEANUP
-
-AT_SETUP([Intel SVM Fine-grained])
-AT_KEYWORDS([intel_svm])
-AT_SKIP_IF([grep "define TCE_AVAILABLE" $abs_top_builddir/config.h])
-AT_SKIP_IF([test ! -f $abs_top_srcdir/examples/IntelSVM/source/SVMBasicFineGrained/svmbasic])
-AT_CHECK([make test_FineGrained -sC $abs_top_builddir/examples/IntelSVM | grep -q PASSED], 0)
-AT_CLEANUP
-
-
-#m4_include(testsuite-llvmopencl.at)
-m4_include(testsuite-workgroup.at)
-m4_include(testsuite-regression.at)
-m4_include(testsuite-runtime.at)
-m4_include(testsuite-tce.at)
-m4_include(testsuite-samples.at)
-m4_include(testsuite-viennacl.at)
-m4_include(testsuite-rodinia.at)
-m4_include(testsuite-parboil.at)
-m4_include(testsuite-amd.at)
-m4_include(testsuite-amdsdk2_9.at)
-m4_include(testsuite-amdsdk3_0.at)
-m4_include(testsuite-vexcl.at)
-m4_include(testsuite-halide.at)
-m4_include(testsuite-cloverleaf.at)
-m4_include(testsuite-piglit.at)
-m4_include(testsuite-opencv.at)
diff --git a/tests/workgroup/CMakeLists.txt b/tests/workgroup/CMakeLists.txt
index 04e8511..59e0475 100644
--- a/tests/workgroup/CMakeLists.txt
+++ b/tests/workgroup/CMakeLists.txt
@@ -132,6 +132,9 @@ add_test_pocl(NAME "workgroup/workgroup_sizes_work_items_get_wrong_ids_LOOPS"
               EXPECTED_OUTPUT "print_all_ids_114114.txt"
               COMMAND "run_kernel" "print_all_ids.cl" 1 1 1 4)
 
+add_test_pocl(NAME "workgroup/issue_548_convergent_propagation_LOOPS"
+              EXPECTED_OUTPUT "issue_548_1_2_1_1.stdout"
+              COMMAND "run_kernel" "issue_548.cl" 1 2 1 1)
 
 set_tests_properties( "workgroup/unconditional_barriers_LOOPS"
   "workgroup/unbarriered_for_loops_LOOPS"
@@ -142,6 +145,7 @@ set_tests_properties( "workgroup/unconditional_barriers_LOOPS"
   "workgroup/loop_with_two_paths_to_the_latch_LOOPS"
   "workgroup/b_loop_with_two_latches_LOOPS"
   "workgroup/workgroup_sizes_work_items_get_wrong_ids_LOOPS"
+  "workgroup/issue_548_convergent_propagation_LOOPS"
   PROPERTIES
     COST 2.0
     PROCESSORS 1
@@ -149,3 +153,8 @@ set_tests_properties( "workgroup/unconditional_barriers_LOOPS"
     ENVIRONMENT "POCL_DEVICES=basic;POCL_WORK_GROUP_METHOD=workitemloops"
     DEPENDS "pocl_version_check"
     LABELS "internal;workgroup")
+
+if(LLVM_OLDER_THAN_4_0)
+  set_tests_properties("workgroup/issue_548_convergent_propagation_LOOPS"
+    PROPERTIES WILL_FAIL 1)
+endif()
diff --git a/tests/workgroup/issue_548.cl b/tests/workgroup/issue_548.cl
new file mode 100644
index 0000000..9be52fc
--- /dev/null
+++ b/tests/workgroup/issue_548.cl
@@ -0,0 +1,49 @@
+// Remove this attribute to reproduce #548, which should get fixed after
+// upgrading to Clang 6.0:
+__attribute__((convergent))
+inline
+void
+auxfunc()
+{
+    printf("auxfunc\n");
+    barrier(CLK_LOCAL_MEM_FENCE); // XXXX
+}
+
+
+__kernel
+void
+test_kernel() {
+    int id = get_local_id(0);
+    int localsize = get_local_size(0);
+    __local int x;
+
+    // For a workgroup size of 2, we expect CCC to be printed twice,
+    // with id=0 and id=1 but the printfs have id=0 for both work-items.
+    // Comment out any one or more lines marked XXXX and it works as expected.
+
+#define TESTID1 // XXXX
+#define TESTID2 // XXXX
+
+#ifdef TESTID1
+    bool test1 = (id == 0);
+#else
+    bool test1 = true;
+#endif
+#ifdef TESTID2
+    bool test2 = (id == 0);
+#else
+    bool test2 = true;
+#endif
+
+    printf("id=%d: AAA.\n", id);
+    if (test1) {
+      x += 1; // XXXX
+      //x = 1; // uncomment this line and it also fixes it
+    }
+    printf("id=%d: CCC.\n", id);
+    auxfunc();
+    if (test2) {
+      x = 1; // XXXX
+    }
+    printf("id=%d: EEE.\n", id);
+}
diff --git a/tests/workgroup/issue_548_1_2_1_1.stdout b/tests/workgroup/issue_548_1_2_1_1.stdout
new file mode 100644
index 0000000..e9a4cfc
--- /dev/null
+++ b/tests/workgroup/issue_548_1_2_1_1.stdout
@@ -0,0 +1,8 @@
+id=0: AAA.
+id=0: CCC.
+auxfunc
+id=1: AAA.
+id=1: CCC.
+auxfunc
+id=0: EEE.
+id=1: EEE.
diff --git a/tools/docker/ArchLinux/default b/tools/docker/ArchLinux/default
new file mode 100644
index 0000000..1cbd219
--- /dev/null
+++ b/tools/docker/ArchLinux/default
@@ -0,0 +1,9 @@
+FROM pritunl/archlinux:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Arch version=1.0
+RUN pacman --noconfirm -S gcc patch hwloc cmake git pkg-config make ninja ocl-icd clang llvm llvm-libs clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja  ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ;  ctest -j4 --output-on-failure -L internal
\ No newline at end of file
diff --git a/tools/docker/ArchLinux/distro b/tools/docker/ArchLinux/distro
new file mode 100644
index 0000000..246c92e
--- /dev/null
+++ b/tools/docker/ArchLinux/distro
@@ -0,0 +1,14 @@
+FROM pritunl/archlinux:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Arch version=1.0
+RUN pacman --noconfirm -S gcc patch hwloc cmake git pkg-config make ninja ocl-icd clang llvm llvm-libs clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; \
+   cmake -DCMAKE_INSTALL_PREFIX=/usr \
+         -DKERNELLIB_HOST_CPU_VARIANTS=distro \
+         -DPOCL_ICD_ABSOLUTE_PATH=OFF \
+         -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+RUN cd /home/pocl/b ; ninja install
+CMD cd /home/pocl/b ; rm CTestCustom.cmake ; clinfo ;  ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/ArchLinux/test_install b/tools/docker/ArchLinux/test_install
new file mode 100644
index 0000000..621e17e
--- /dev/null
+++ b/tools/docker/ArchLinux/test_install
@@ -0,0 +1,10 @@
+FROM pritunl/archlinux:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Arch version=1.0
+RUN pacman --noconfirm -S gcc patch hwloc cmake git pkg-config make ninja ocl-icd clang llvm llvm-libs clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -DCMAKE_INSTALL_PREFIX=/usr -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+RUN cd /home/pocl/b ; ninja install
+CMD cd /home/pocl/b ; rm CTestCustom.cmake ; clinfo ;  ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Debian/stretch b/tools/docker/Debian/stretch
new file mode 100644
index 0000000..0495e90
--- /dev/null
+++ b/tools/docker/Debian/stretch
@@ -0,0 +1,11 @@
+FROM debian:stretch
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Debian version=1.0
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git gcc g++ libhwloc5 pkg-config libclang-3.9-dev  clang-3.9 llvm-3.9 make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Debian/testing b/tools/docker/Debian/testing
new file mode 100644
index 0000000..f2e3130
--- /dev/null
+++ b/tools/docker/Debian/testing
@@ -0,0 +1,11 @@
+FROM debian:testing
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Debian version=1.0
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git gcc g++ libhwloc5 pkg-config libclang-4.0-dev llvm-4.0-dev clang-4.0 llvm-4.0 make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Fedora/default b/tools/docker/Fedora/default
new file mode 100644
index 0000000..7a15f39
--- /dev/null
+++ b/tools/docker/Fedora/default
@@ -0,0 +1,11 @@
+FROM fedora:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Fedora version=1.0
+RUN dnf upgrade -y ; echo "almost done"
+RUN dnf upgrade -y ; echo "done"
+RUN dnf install -y gcc gcc-c++ hwloc-devel hwloc-libs cmake git-core pkgconfig make ninja-build ocl-icd ocl-icd-devel clang clang-devel clang-libs llvm llvm-devel llvm-libs patch redhat-rpm-config findutils libtool-ltdl libtool-ltdl-devel
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Ubuntu/16_04.32bit b/tools/docker/Ubuntu/16_04.32bit
new file mode 100644
index 0000000..3b4251d
--- /dev/null
+++ b/tools/docker/Ubuntu/16_04.32bit
@@ -0,0 +1,12 @@
+FROM i386/ubuntu:16.04
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt install -y dialog apt-utils
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-4.0-dev  clang-4.0 llvm-4.0 make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; i386 cmake -G Ninja -DWITH_LLVM_CONFIG=/usr/bin/llvm-config-4.0 ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Ubuntu/16_04.64bit b/tools/docker/Ubuntu/16_04.64bit
new file mode 100644
index 0000000..6c649b5
--- /dev/null
+++ b/tools/docker/Ubuntu/16_04.64bit
@@ -0,0 +1,12 @@
+FROM amd64/ubuntu:16.04
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-4.0-dev  clang-4.0 llvm-4.0 make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja -DWITH_LLVM_CONFIG=/usr/bin/llvm-config-4.0 ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Ubuntu/default.32bit b/tools/docker/Ubuntu/default.32bit
new file mode 100644
index 0000000..e2123f7
--- /dev/null
+++ b/tools/docker/Ubuntu/default.32bit
@@ -0,0 +1,12 @@
+FROM i386/ubuntu:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt install -y dialog apt-utils
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-dev clang llvm make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; i386 cmake -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Ubuntu/default.64bit b/tools/docker/Ubuntu/default.64bit
new file mode 100644
index 0000000..ae1646b
--- /dev/null
+++ b/tools/docker/Ubuntu/default.64bit
@@ -0,0 +1,12 @@
+FROM amd64/ubuntu:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-dev clang llvm make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Ubuntu/default.conformance b/tools/docker/Ubuntu/default.conformance
new file mode 100644
index 0000000..830858d
--- /dev/null
+++ b/tools/docker/Ubuntu/default.conformance
@@ -0,0 +1,13 @@
+FROM ubuntu:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-dev clang llvm make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -G Ninja -DENABLE_TESTSUITES=conformance ..
+RUN cd /home/pocl/b ; ninja prepare_examples
+RUN cd /home/pocl/b ; ninja
+CMD cd /home/pocl/b ; clinfo ; ctest -j1 --output-on-failure -L conformance
diff --git a/tools/docker/Ubuntu/distro b/tools/docker/Ubuntu/distro
new file mode 100644
index 0000000..94455a3
--- /dev/null
+++ b/tools/docker/Ubuntu/distro
@@ -0,0 +1,17 @@
+FROM amd64/ubuntu:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-dev clang llvm make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; \
+   cmake -DCMAKE_INSTALL_PREFIX=/usr \
+         -DKERNELLIB_HOST_CPU_VARIANTS=distro \
+         -DPOCL_ICD_ABSOLUTE_PATH=OFF \
+         -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+RUN cd /home/pocl/b ; ninja install
+CMD cd /home/pocl/b ; rm CTestCustom.cmake ; clinfo ;  ctest -j4 --output-on-failure -L internal
diff --git a/tools/docker/Ubuntu/test_install b/tools/docker/Ubuntu/test_install
new file mode 100644
index 0000000..c12f230
--- /dev/null
+++ b/tools/docker/Ubuntu/test_install
@@ -0,0 +1,13 @@
+FROM amd64/ubuntu:latest
+ARG GIT_COMMIT=master
+LABEL git-commit=$GIT_COMMIT vendor=pocl distro=Ubuntu version=1.0
+ENV TERM dumb
+RUN apt update
+RUN apt upgrade -y
+RUN apt install -y build-essential ocl-icd-libopencl1 cmake git pkg-config libclang-dev clang llvm make ninja-build ocl-icd-libopencl1 ocl-icd-dev ocl-icd-opencl-dev libhwloc-dev zlib1g zlib1g-dev clinfo
+
+RUN cd /home ; git clone https://github.com/pocl/pocl.git ; cd /home/pocl ; git checkout $GIT_COMMIT
+RUN cd /home/pocl ; mkdir b ; cd b; cmake -DCMAKE_INSTALL_PREFIX=/usr -G Ninja ..
+RUN cd /home/pocl/b ; ninja
+RUN cd /home/pocl/b ; ninja install
+CMD cd /home/pocl/b ; rm CTestCustom.cmake ; clinfo ; ctest -j4 --output-on-failure -L internal
diff --git a/tools/scripts/devel-envs.sh b/tools/scripts/devel-envs.sh
index 9ce0ce8..ae9e120 100755
--- a/tools/scripts/devel-envs.sh
+++ b/tools/scripts/devel-envs.sh
@@ -8,13 +8,6 @@ export OCL_ICD_VENDORS=$PWD/ocl-vendors
 # AMDSDK supports the overriding via other env name.
 export OPENCL_VENDOR_PATH=$OCL_ICD_VENDORS
 
-# pocl test-cases don't link against pthreads, but libpocl does.
-# this confuses gdb unless we preload libpthread.
-# Not having this also makes cl2.hpp throw std::system_error exception for
-# an unknown reason (at least on Ubuntu 14.04 / gcc 4.8.4).
-# If libpocl is not built yet, this will fail...
-export LD_PRELOAD=$(ldd lib/CL/$libs_subdir/libpocl.so | grep pthread | cut -f 3 -d' ')
-
 #sometimes useful variable when ICD fails (and we use ocl-icd)
 #export OCL_ICD_DEBUG=15
 export PATH=$PWD/bin:$PATH
diff --git a/tools/scripts/run_cuda_tests b/tools/scripts/run_cuda_tests
new file mode 100755
index 0000000..cc3beb0
--- /dev/null
+++ b/tools/scripts/run_cuda_tests
@@ -0,0 +1,36 @@
+#!/bin/bash
+# run_cuda_tests - Runs tests against CUDA devices.
+#
+# Copyright (c) 2016 James Price / University of Bristol
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+export POCL_BUILDING=1
+export POCL_DEVICES="CUDA"
+
+if [ -z "$OCL_ICD_VENDORS" ]; then
+  export OCL_ICD_VENDORS=$PWD/ocl-vendors
+fi
+if [ ! -e "$OCL_ICD_VENDORS" ]; then
+  echo "Not using OCL_ICD_VENDORS ($OCL_ICD_VENDORS doesn't exist)"
+else
+  echo "Using OCL_ICD_VENDORS: $OCL_ICD_VENDORS"
+fi
+
+ctest -L cuda $@
diff --git a/windows/setup_and_build_win64.sh b/windows/setup_and_build_win64.sh
deleted file mode 100644
index 3a81ef7..0000000
--- a/windows/setup_and_build_win64.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#/bin/bash
-
-# This script requires following software preinstalled:
-# * Git + git bash (default installer settings are fine, make sure its in your PATH)
-# * Python 2.7
-# * Cmake 2.8 or later (make sure it will be added to PATH during installation)
-# * Visual Studio Community edition 2013
-
-# no spaces in this path please
-export POCLBUILDROOT=/c/pocl-playground
-
-mkdir $POCLBUILDROOT
-cd $POCLBUILDROOT
-
-# Get external libs
-curl ftp://sourceware.org/pub/pthreads-win32/pthreads-w32-2-9-1-release.zip -O
-curl http://www.open-mpi.org/software/hwloc/v1.10/downloads/hwloc-win64-build-1.10.0.zip -O
-unzip hwloc-win64-build-1.10.0.zip
-unzip pthreads-w32-2-9-1-release.zip -d pthreads-win32-full
-cp -r pthreads-win32-full/Pre-built.2 pthreads-win32
-
-# Build llvm
-cd $POCLBUILDROOT
-git clone --single-branch https://github.com/llvm-mirror/llvm -b release_36
-cd llvm/tools
-git clone --single-branch https://github.com/llvm-mirror/clang.git -b release_36
-mkdir $POCLBUILDROOT/llvm-build
-cd $POCLBUILDROOT/llvm-build
-cmake -G "Visual Studio 12 Win64" ../llvm
-cmake --build . --config MinSizeRel
-
-# Build pocl
-cd $POCLBUILDROOT
-git clone https://github.com/pocl/pocl.git
-mkdir $POCLBUILDROOT/pocl-build
-cd $POCLBUILDROOT/pocl-build
-export PATH=$PATH:$POCLBUILDROOT/llvm-build/MinSizeRel/bin
-Hwloc_ROOT=../hwloc-win64-build-1.10.0/ Pthreads_ROOT=../pthreads-win32/ cmake -DSTATIC_LLVM:BOOL=ON -DDEFAULT_ENABLE_ICD:BOOL=OFF -DCMAKE_INSTALL_PREFIX:PATH=$PWD/../install-pocl -G "Visual Studio 12 Win64" ../pocl/
-cmake --build . --config MinSizeRel
-
-## Run test suite
-# export PATH=$PATH:$POCLBUILDROOT/pocl-build/lib/CL/MinSizeRel:$POCLBUILDROOT/pocl-build/lib/llvmopencl/MinSizeRel:$POCLBUILDROOT/pocl-build/lib/poclu/MinSizeRel:$POCLBUILDROOT/hwloc-win64-build-1.10.0/bin:$POCLBUILDROOT/pthreads-win32/dll/x64
-# cd $POCLBUILDROOT/pocl-build
-# ctest -j8

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/collab-maint/pocl.git